[med-svn] [pbseqlib] 02/04: Imported Upstream version 0~20151014+gitbe5d1bf
Afif Elghraoui
afif-guest at moszumanska.debian.org
Sat Oct 17 08:32:07 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository pbseqlib.
commit 4bf697372bc65dd482e535d429f3bed9c2bd63d9
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Thu Oct 15 23:27:53 2015 -0700
Imported Upstream version 0~20151014+gitbe5d1bf
---
.gitignore | 9 +-
.travis.yml | 6 +-
Makefile | 36 ---
README.md | 24 +-
alignment/Makefile | 113 -------
.../algorithms/anchoring/MapBySuffixArrayImpl.hpp | 20 +-
alignment/algorithms/sorting/DifferenceCovers.cpp | 3 +-
.../algorithms/sorting/LightweightSuffixArray.cpp | 11 +-
alignment/algorithms/sorting/MultikeyQuicksort.cpp | 2 +-
alignment/algorithms/sorting/qsufsort.hpp | 5 +-
alignment/build.mk | 1 +
alignment/bwt/Occ.hpp | 2 +-
alignment/bwt/PackedHash.hpp | 14 +-
.../datastructures/alignment/FilterCriteria.cpp | 2 +
alignment/files/FragmentCCSIterator.cpp | 3 +-
alignment/files/ReaderAgglomerate.cpp | 52 +++-
alignment/files/ReaderAgglomerate.hpp | 4 +
alignment/format/BAMPrinter.hpp | 18 +-
alignment/format/BAMPrinterImpl.hpp | 67 +++-
alignment/format/SAMHeaderPrinter.cpp | 25 +-
alignment/format/SAMPrinter.cpp | 7 +-
alignment/format/SAMPrinter.hpp | 1 +
alignment/format/SAMPrinterImpl.hpp | 14 +-
alignment/makefile | 47 +++
alignment/simple.mk | 82 -----
alignment/simulator/ContextOutputList.hpp | 2 +-
alignment/simulator/ContextSet.cpp | 4 +-
alignment/simulator/OutputSampleListSet.hpp | 3 +-
alignment/suffixarray/LCPTable.hpp | 21 +-
alignment/suffixarray/SuffixArray.hpp | 1 -
alignment/suffixarray/ssort.hpp | 6 +-
alignment/tuples/TupleCountTableImpl.hpp | 5 +-
alignment/tuples/TupleListImpl.hpp | 1 -
alignment/utils/RegionUtils.cpp | 111 +------
alignment/utils/RegionUtils.hpp | 48 ---
alignment/utils/RegionUtilsImpl.hpp | 231 +++-----------
common.mk | 84 -----
configure.py | 339 +++++++++++++++++++++
hdf/BufferedHDF2DArrayImpl.hpp | 15 +-
hdf/BufferedHDFArrayImpl.hpp | 3 +-
hdf/HDFAtom.cpp | 9 +-
hdf/HDFAtom.hpp | 37 ++-
hdf/HDFAttributable.cpp | 6 +-
hdf/HDFAttributable.hpp | 4 +-
hdf/HDFBasReader.hpp | 240 ++++++++-------
hdf/HDFBaseCallsWriter.cpp | 326 ++++++++++++++++++++
hdf/HDFBaseCallsWriter.hpp | 233 ++++++++++++++
hdf/HDFBaxWriter.cpp | 141 +++++++++
hdf/HDFBaxWriter.hpp | 172 +++++++++++
hdf/HDFCmpFile.hpp | 14 +-
hdf/HDFCmpReader.hpp | 1 +
hdf/HDFCmpRefAlignmentGroup.hpp | 3 +-
hdf/HDFData.cpp | 10 +-
hdf/HDFData.hpp | 10 +-
hdf/HDFPlsReader.hpp | 16 +-
hdf/HDFRegionTableReader.cpp | 115 ++++---
hdf/HDFRegionTableReader.hpp | 30 +-
hdf/HDFRegionsWriter.cpp | 99 ++++++
hdf/HDFRegionsWriter.hpp | 101 ++++++
hdf/HDFScanDataReader.cpp | 29 +-
hdf/HDFScanDataReader.hpp | 7 +-
hdf/HDFScanDataWriter.cpp | 127 +++++---
hdf/HDFScanDataWriter.hpp | 55 +++-
hdf/HDFWriteBuffer.hpp | 3 +-
hdf/HDFWriterBase.cpp | 98 ++++++
hdf/HDFWriterBase.hpp | 88 ++++++
hdf/HDFZMWMetricsWriter.cpp | 142 +++++++++
hdf/HDFZMWMetricsWriter.hpp | 117 +++++++
hdf/HDFZMWWriter.cpp | 144 +++++++++
hdf/HDFZMWWriter.hpp | 120 ++++++++
hdf/Makefile | 94 ------
hdf/build.mk | 1 +
hdf/makefile | 33 ++
makefile | 31 ++
pbdata/.gitignore | 2 +
pbdata/CCSSequence.cpp | 21 +-
pbdata/CCSSequence.hpp | 22 +-
pbdata/CompressedDNASequence.hpp | 6 +-
pbdata/CompressedSequenceImpl.hpp | 13 +-
pbdata/DNASequence.cpp | 30 +-
pbdata/DNASequence.hpp | 15 +-
pbdata/Enumerations.h | 47 ++-
pbdata/FASTAReader.cpp | 11 +-
pbdata/FASTASequence.cpp | 38 +--
pbdata/FASTASequence.hpp | 16 +-
pbdata/FASTQReader.cpp | 2 +-
pbdata/FASTQSequence.cpp | 50 ++-
pbdata/FASTQSequence.hpp | 41 ++-
pbdata/MD5Utils.cpp | 5 +-
pbdata/Makefile | 85 ------
pbdata/PacBioDefs.h | 180 +++++++++++
pbdata/PackedDNASequence.cpp | 6 +-
pbdata/ReverseCompressIndex.cpp | 3 +-
pbdata/SMRTSequence.cpp | 289 +++++++++++++-----
pbdata/SMRTSequence.hpp | 161 +++++++---
pbdata/StringUtils.cpp | 52 +---
pbdata/StringUtils.hpp | 2 +-
pbdata/build.mk | 1 +
pbdata/makefile | 39 +++
pbdata/matrix/FlatMatrixImpl.hpp | 9 +-
pbdata/matrix/MatrixImpl.hpp | 7 +-
pbdata/metagenome/SequenceIndexDatabaseImpl.hpp | 16 +-
pbdata/metagenome/TitleTable.cpp | 10 +-
pbdata/reads/BaseFile.cpp | 6 +-
pbdata/reads/PulseBaseCommon.cpp | 39 ++-
pbdata/reads/PulseBaseCommon.hpp | 39 ++-
pbdata/reads/PulseFile.cpp | 40 ++-
pbdata/reads/PulseFileImpl.hpp | 4 +-
pbdata/reads/ReadInterval.hpp | 19 +-
pbdata/reads/RegionAnnotation.cpp | 49 +++
pbdata/reads/RegionAnnotation.hpp | 241 +++++++++++++++
pbdata/reads/RegionAnnotations.cpp | 179 +++++++++++
pbdata/reads/RegionAnnotations.hpp | 122 ++++++++
pbdata/reads/RegionTable.cpp | 239 +++++++--------
pbdata/reads/RegionTable.hpp | 166 ++++++----
pbdata/reads/RegionTypeMap.cpp | 89 ++++++
pbdata/reads/RegionTypeMap.hpp | 85 ++++++
pbdata/reads/ScanData.cpp | 155 +++++++++-
pbdata/reads/ScanData.hpp | 86 +++++-
pbdata/sam/SAMReaderImpl.hpp | 2 +-
pbdata/utils.hpp | 5 +-
pbdata/utils/SMRTReadUtils.cpp | 4 +-
pbdata/utils/SMRTTitle.hpp | 14 +
pbdata/utilsImpl.hpp | 32 +-
rules.mk | 28 ++
simple.mk | 16 -
travis.sh | 14 +
unittest/.gitignore | 1 +
unittest/Makefile | 41 ---
unittest/alignment/Makefile | 9 +-
unittest/alignment/files/CCSIterator_gtest.cpp | 2 -
.../alignment/files/FragmentCCSIterator_gtest.cpp | 5 +-
.../files/FragmentCCSIterator_other_gtest.cpp | 100 ++++++
.../alignment/files/ReaderAgglomerate_gtest.cpp | 16 +
unittest/alignment/utils/RegionUtils_gtest.cpp | 1 -
unittest/build.mk | 27 ++
unittest/common.mk | 64 ----
unittest/hdf/HDFPlsReader_gtest.cpp | 3 +-
unittest/hdf/HDFScanDataWriter_gtest.cpp | 2 +
unittest/hdf/Makefile | 9 +-
unittest/makefile | 120 ++++++++
unittest/pbdata/CCSSequence_gtest.cpp | 8 +-
unittest/pbdata/DNASequence_gtest.cpp | 2 +-
unittest/pbdata/Makefile | 9 +-
unittest/pbdata/SMRTSequence_gtest.cpp | 6 +-
unittest/pbdata/StringUtils_gtest.cpp | 44 ++-
unittest/pbdata/reads/RegionAnnotations_gtest.cpp | 203 ++++++++++++
unittest/pbdata/reads/RegionTypeMap_gtest.cpp | 61 ++++
148 files changed, 5744 insertions(+), 1939 deletions(-)
diff --git a/.gitignore b/.gitignore
index bebcdca..93add20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,10 @@
*.o
-*.a
*.d
+*.a
+*.so
+*.dylib
+defines.mk
+all.xml
+*.h5
+libconfig.h
+/hdf/hdf5-1.8.12-headers/
diff --git a/.travis.yml b/.travis.yml
index 699f0ab..b4fc2f8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,6 @@
language: cpp
script:
- - make nopbbam=1 COMMON_NO_THIRD_PARTY_REQD=true
+ - ./travis.sh
compiler:
- gcc
# - clang
@@ -13,8 +13,8 @@ addons:
packages:
- gcc-4.8
- g++-4.8
- - clang
-# - libxqilla-dev # missing, but not needed?
+# - clang
+# - libhdf5-serial-1.8.4
notifications:
email: false
sudo: false
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 6252198..0000000
--- a/Makefile
+++ /dev/null
@@ -1,36 +0,0 @@
-SHELL=/bin/bash
-
-.PHONY: all debug profile gtest clean cleanall
-
-# $Change: 140182 $
-
-all:
- make -C pbdata all
- make -C hdf all
- make -C alignment all
-
-debug:
- make -C pbdata debug
- make -C hdf debug
- make -C alignment debug
-
-profile:
- make -C pbdata profile
- make -C hdf profile
- make -C alignment profile
-
-g:
- make -C pbdata g
- make -C hdf g
- make -C alignment g
-
-gtest:
- make -C unittest gtest
-
-clean:
- @make -C pbdata clean
- @make -C hdf clean
- @make -C alignment clean
- @make -C unittest clean
-
-cleanall: clean
diff --git a/README.md b/README.md
index 7c0c052..7d95e8d 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
[![Build Status](https://travis-ci.org/PacificBiosciences/blasr_libcpp.svg?branch=master)](https://travis-ci.org/PacificBiosciences/blasr_libcpp)
+
#What is blasr_libcpp#
-Blasr_libcpp is a **library** used by blasr and other executables such as samtoh5, loadPulses for analyzing PacBio sequences. This library contains three sub-directories, including pbdata, hdf and alignment.
+**Blasr_libcpp** is a *library* used by **blasr** and other executables such as samtoh5, loadPulses for analyzing PacBio sequences. This library contains three sub-libraries, including pbdata, hdf and alignment:
+ pbdata
- contains source code for handling Pacbio sequences and can build lib ```libpbdata```.
+ hdf
@@ -9,16 +10,13 @@ Blasr_libcpp is a **library** used by blasr and other executables such as samtoh
+ alignment
- contains source code for aligning Pacbio reads to target sequences used in blasr and builds ```libblasr```.
+For more information, see
+* https://github.com/PacificBiosciences/blasr_libcpp/wiki
-##Appendix: Dependencies##
-+ libpbdata
- - does **not** depend on ```libhdf5```
- - should build without the ```pbbam``` library *for now*
-
-- libpbhdf
- - depends on the ```libpbdata``` and ```libhdf5``` libraries to build
-
-- alignment
- - depends on the ```libpbdata``` library to build
- - can build either with or without the ```libpbhdf``` library
- - can build either with or without the ```pbbam``` library
+## Building
+The simplest way is:
+```
+NOPBBAM=1 ./configure.py
+make -j all
+```
+That will skip pbbam, and it will download HDF5 headers.
diff --git a/alignment/Makefile b/alignment/Makefile
deleted file mode 100644
index 9c27250..0000000
--- a/alignment/Makefile
+++ /dev/null
@@ -1,113 +0,0 @@
-
-include ../common.mk
-
-# To enable building a shared library, invoke as "make SHARED_LIB=true ..."
-ifneq ($(SHARED_LIB),)
- # Generating shared library
- CXX_SHAREDFLAGS := -fPIC
- LD_SHAREDFLAGS := -shared -fPIC
- TARGET_LIB := libblasr.so
- # Developers should set these to appropriate defaults (other systems
- # will override these on the command line):
- HDF5_LIB := ../../../../prebuilt.out/prebuilt.out/hdf5/hdf5-1.8.12/centos-5/lib/libhdf5.so
- ZLIB_LIB := ../../../../prebuilt.tmpsrc/zlib/zlib_1.2.8/_output/install/lib/libz.so
- HTSLIB_LIB := ../../../staging/PostPrimary/pbbam/_output/install-build/lib/libpbbam.so
- PBBAM_LIB := ../../../staging/PostPrimary/pbbam/third-party/htslib/_output/install-build/lib/libhts.so
- LIBPBDATA_LIB := ../../../staging/PostPrimary/pbbam/third-party/htslib/_output/install-build/lib/libhts.so
-else
- # Generating shared library
- CXX_SHAREDFLAGS :=
- LD_SHAREDFLAGS :=
- TARGET_LIB := libblasr.a
- HDF5_LIB :=
- ZLIB_LIB :=
- HTSLIB_LIB :=
- PBBAM_LIB :=
- LIBPBDATA_LIB :=
-endif
-
-DEP_LIBS := $(HDF5_LIB) $(ZLIB_LIB) $(HTSLIB_LIB) $(PBBAM_LIB) $(PBDATA_LIB)
-
-LIBPBDATA_INCLUDE := ../pbdata
-LIBPBIHDF_INCLUDE := ../hdf
-PBBAM_INCLUDE := $(PBBAM)/include
-HTSLIB_INCLUDE ?= $(PBBAM)/third-party/htslib
-
-INCLUDES = -I$(LIBPBDATA_INCLUDE) \
- -I$(LIBPBIHDF_INCLUDE) \
- -I.
-ifneq ($(HDF5_INC),)
-INCLUDES += -I$(HDF5_INC)
-else
-HDF_HEADERS := hdf5-1.8.12-headers
-INCLUDES += -I../hdf/$(HDF_HEADERS)/src -I../hdf/$(HDF_HEADERS)/c++/src
-endif
-
-ifneq ($(ZLIB_ROOT), notfound)
- INCLUDES += -I$(ZLIB_ROOT)/include
-endif
-
-ifeq ($(origin nopbbam), undefined)
- INCLUDES += -I$(PBBAM_INCLUDE) -I$(HTSLIB_INCLUDE) -I$(BOOST_INCLUDE)
-endif
-
-CXXOPTS := -std=c++11 -pedantic -Wno-long-long -MMD -MP
-
-sources := $(wildcard algorithms/alignment/*.cpp) \
- $(wildcard algorithms/alignment/sdp/*.cpp) \
- $(wildcard algorithms/anchoring/*.cpp) \
- $(wildcard algorithms/compare/*.cpp) \
- $(wildcard algorithms/sorting/*.cpp) \
- $(wildcard datastructures/alignment/*.cpp) \
- $(wildcard datastructures/alignmentset/*.cpp) \
- $(wildcard datastructures/anchoring/*.cpp) \
- $(wildcard datastructures/tuplelists/*.cpp) \
- $(wildcard suffixarray/*.cpp) \
- $(wildcard qvs/*.cpp) \
- $(wildcard statistics/*.cpp) \
- $(wildcard tuples/*.cpp) \
- $(wildcard utils/*.cpp) \
- $(wildcard files/*.cpp) \
- $(wildcard format/*.cpp) \
- $(wildcard simulator/*.cpp) \
- $(wildcard *.cpp)
-
-ifdef nohdf
-sources := $(filter-out files/% utils/FileOfFileNames.cpp, $(sources))
-endif
-
-objects := $(sources:.cpp=.o)
-shared_objects := $(sources:.cpp=.shared.o)
-dependencies := $(objects:.o=.d) $(shared_objects:.o=.d)
-
-all : CXXFLAGS ?= -O3
-
-debug : CXXFLAGS ?= -g -ggdb -fno-inline
-
-profile : CXXFLAGS ?= -Os -pg
-
-g: CXXFLAGS = -g -ggdb -fno-inline -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer
-
-all debug profile g: $(TARGET_LIB)
-
-libblasr.a: $(objects)
- $(AR_pp) $(ARFLAGS) $@ $^
-
-libblasr.so: $(shared_objects) $(DEP_LIBS)
- $(CXX_pp) $(LD_SHAREDFLAGS) -o $@ $^
-
-%.o: %.cpp
- $(CXX) $(CXXOPTS) $(CXXFLAGS) $(LEGACY) $(INCLUDES) -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.o) $(@:%.o=%.d)" -c $< -o $@
-
-%.shared.o: %.cpp
- $(CXX_pp) $(CXX_SHAREDFLAGS) $(CXXOPTS) $(CXXFLAGS) $(LEGACY) $(INCLUDES) -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.o) $(@:%.o=%.d)" -c $< -o $@
-
-# .INTERMEDIATE: $(objects)
-
-clean:
- @rm -f libblasr.a libblasr.so
- @rm -f $(objects) $(shared_objects) $(dependencies)
- @find . -type f -name \*.o -delete
- @find . -type f -name \*.d -delete
-
--include $(dependencies)
diff --git a/alignment/algorithms/anchoring/MapBySuffixArrayImpl.hpp b/alignment/algorithms/anchoring/MapBySuffixArrayImpl.hpp
index 6cbac02..2405f45 100644
--- a/alignment/algorithms/anchoring/MapBySuffixArrayImpl.hpp
+++ b/alignment/algorithms/anchoring/MapBySuffixArrayImpl.hpp
@@ -31,14 +31,14 @@ int LocateAnchorBoundsInSuffixArray(T_RefSequence &reference,
// anything shorter than that.
//
if (minPrefixMatchLength > 0 and
- read.subreadEnd - read.subreadStart < minPrefixMatchLength) {
+ read.SubreadLength() < minPrefixMatchLength) {
return 0;
}
DNALength p, m;
DNALength alignEnd;
- DNALength matchEnd = read.subreadEnd - minPrefixMatchLength + 1;
- DNALength numSearchedPositions = matchEnd - read.subreadStart;
+ DNALength matchEnd = read.SubreadEnd() - minPrefixMatchLength + 1;
+ DNALength numSearchedPositions = matchEnd - read.SubreadStart();
matchLength.resize(numSearchedPositions);
matchLow.resize(numSearchedPositions);
@@ -49,7 +49,7 @@ int LocateAnchorBoundsInSuffixArray(T_RefSequence &reference,
std::fill(matchHigh.begin(), matchHigh.end(), 0);
vector<SAIndex> lowMatchBound, highMatchBound;
- for (m = 0, p = read.subreadStart; p < matchEnd; p++, m++) {
+ for (m = 0, p = read.SubreadStart(); p < matchEnd; p++, m++) {
DNALength lcpLow, lcpHigh, lcpLength;
lowMatchBound.clear(); highMatchBound.clear();
lcpLow = 0;
@@ -212,7 +212,7 @@ int LocateAnchorBoundsInSuffixArray(T_RefSequence &reference,
template<typename T_SuffixArray,
typename T_RefSequence,
typename T_Sequence,
- typename T_MatchPos>
+ typename T_MatchPos>
int MapReadToGenome(T_RefSequence &reference,
T_SuffixArray &sa, T_Sequence &read,
unsigned int minPrefixMatchLength,
@@ -222,7 +222,7 @@ int MapReadToGenome(T_RefSequence &reference,
vector<DNALength> matchLow, matchHigh, matchLength;
int minMatchLen = anchorParameters.minMatchLength;
- if (read.subreadEnd - read.subreadStart < minMatchLen) {
+ if (read.SubreadLength() < minMatchLen) {
matchPosList.clear();
return 0;
}
@@ -269,15 +269,15 @@ int MapReadToGenome(T_RefSequence &reference,
//
DNALength endOfMapping;
DNALength trim = MAX(minMatchLen + 1, sa.lookupPrefixLength + 1);
- if (read.subreadEnd < trim) {
+ if (read.SubreadEnd() < trim) {
endOfMapping = 0;
}
else {
- endOfMapping = read.subreadEnd - trim;
+ endOfMapping = read.SubreadEnd() - trim;
}
- for (pos = read.subreadStart; pos < endOfMapping; pos++) {
- int matchIndex = pos - read.subreadStart;
+ for (pos = read.SubreadStart(); pos < endOfMapping; pos++) {
+ int matchIndex = pos - read.SubreadStart();
assert(matchIndex < matchHigh.size());
if (matchHigh[matchIndex] - matchLow[matchIndex] <=
anchorParameters.maxAnchorsPerPosition) {
diff --git a/alignment/algorithms/sorting/DifferenceCovers.cpp b/alignment/algorithms/sorting/DifferenceCovers.cpp
index 09a4be7..33d24a3 100644
--- a/alignment/algorithms/sorting/DifferenceCovers.cpp
+++ b/alignment/algorithms/sorting/DifferenceCovers.cpp
@@ -1,4 +1,5 @@
#include <cstring>
+#include "utils.hpp"
#include "DifferenceCovers.hpp"
int InitializeDifferenceCover(int diffCoverSize, UInt &diffCoverLength, UInt *&diffCover) {
@@ -6,7 +7,7 @@ int InitializeDifferenceCover(int diffCoverSize, UInt &diffCoverLength, UInt *&d
for (index = 0; index < N_COVERS; index++) {
if (diffCovers[index][0] == diffCoverSize) {
diffCoverLength = diffCovers[index][1];
- diffCover = new UInt[diffCoverLength];
+ diffCover = ProtectedNew<UInt>(diffCoverLength);
memcpy(diffCover, &diffCovers[index][2], sizeof(UInt)*diffCoverLength);
return 1;
}
diff --git a/alignment/algorithms/sorting/LightweightSuffixArray.cpp b/alignment/algorithms/sorting/LightweightSuffixArray.cpp
index eb0e607..626d0c1 100644
--- a/alignment/algorithms/sorting/LightweightSuffixArray.cpp
+++ b/alignment/algorithms/sorting/LightweightSuffixArray.cpp
@@ -1,3 +1,4 @@
+#include "utils.hpp"
#include "LightweightSuffixArray.hpp"
UInt DiffMod(UInt a, UInt b, UInt d) {
@@ -60,7 +61,7 @@ DiffCoverMu::~DiffCoverMu() {
}
void DiffCoverMu::Initialize(UInt diffCoverP[], UInt diffCoverLengthP, UInt diffCoverSizeP, UInt textSizeP) {
- diffCoverReverseLookup = new UInt[diffCoverSizeP];
+ diffCoverReverseLookup = ProtectedNew<UInt>(diffCoverSizeP);
diffCoverLength = diffCoverLengthP;
textSize = textSizeP;
diffCoverSize = diffCoverSizeP;
@@ -100,7 +101,7 @@ void BuildDiffCoverLookup(UInt diffCover[], UInt diffCoverLength, UInt v, UInt d
}
void DiffCoverDelta::Initialize(UInt diffCoverP[], UInt diffCoverLengthP, UInt diffCoverSizeP) {
- diffCoverLookup = new UInt[diffCoverSizeP];
+ diffCoverLookup = ProtectedNew<UInt>(diffCoverSizeP);
diffCoverSize = diffCoverSizeP;
BuildDiffCoverLookup(diffCoverP, diffCoverLengthP, diffCoverSizeP, diffCoverLookup);
}
@@ -299,11 +300,7 @@ bool LightweightSuffixSort(unsigned char text[], UInt textLength, UInt *index, i
// by setting s^\prime[\mu(i)] = l^v(i)
//
UInt *lexVNaming;
- lexVNaming = new UInt[dSetSize+1];
- if (lexVNaming == NULL) {
- std::cout << "Could not initialize welterweight order structure." << std::endl;
- exit(1);
- }
+ lexVNaming = ProtectedNew<UInt>(dSetSize+1);
DiffCoverMu mu;
mu.Initialize(diffCover, diffCoverLength, diffCoverSize, textLength);
UInt largestLexName;
diff --git a/alignment/algorithms/sorting/MultikeyQuicksort.cpp b/alignment/algorithms/sorting/MultikeyQuicksort.cpp
index 40936ee..1dd776b 100644
--- a/alignment/algorithms/sorting/MultikeyQuicksort.cpp
+++ b/alignment/algorithms/sorting/MultikeyQuicksort.cpp
@@ -117,7 +117,7 @@ void MediankeyBoundedQuicksort(unsigned char text[], UInt index[], UInt length,
maxChar = c;
}
}
- freq = new UInt[maxChar+1];
+ freq = ProtectedNew<UInt>(maxChar+1);
deleteFreq = true;
}
diff --git a/alignment/algorithms/sorting/qsufsort.hpp b/alignment/algorithms/sorting/qsufsort.hpp
index 5525fe4..39ff283 100644
--- a/alignment/algorithms/sorting/qsufsort.hpp
+++ b/alignment/algorithms/sorting/qsufsort.hpp
@@ -1,5 +1,6 @@
#ifndef _BLASR_QSUFSORT_HPP_
#define _BLASR_QSUFSORT_HPP_
+#include "utils.hpp"
#include <assert.h>
void suffixsort(int *x, int *p, int n, int k, int l);
@@ -204,8 +205,8 @@ suffix sorting algorithm.
assert(pi - p == pi - I);
// boundaries[pi-p] = 0;
}
- int *buckets = new int[k];
- T_Index *starts = new T_Index[k];
+ int *buckets = ProtectedNew<int>(k);
+ T_Index *starts = ProtectedNew<T_Index>(k);
/*MC+1*/
for (i = 0; i < k; i++ ){
buckets[i] = -1;
diff --git a/alignment/build.mk b/alignment/build.mk
new file mode 120000
index 0000000..2247f36
--- /dev/null
+++ b/alignment/build.mk
@@ -0,0 +1 @@
+makefile
\ No newline at end of file
diff --git a/alignment/bwt/Occ.hpp b/alignment/bwt/Occ.hpp
index 0e2640b..5f02d18 100644
--- a/alignment/bwt/Occ.hpp
+++ b/alignment/bwt/Occ.hpp
@@ -203,7 +203,7 @@ public:
DNALength bwtSeqLength;
in.read((char*)&bwtSeqLength, sizeof(bwtSeqLength));
if (full.matrix) {delete [] full.matrix;}
- full.matrix = new DNALength[bwtSeqLength *AlphabetSize];
+ full.matrix = ProtectedNew<DNALength>(bwtSeqLength *AlphabetSize);
full.nRows = bwtSeqLength;
full.nCols = AlphabetSize;
in.read((char*)&full.matrix[0], sizeof(DNALength)* bwtSeqLength * AlphabetSize);
diff --git a/alignment/bwt/PackedHash.hpp b/alignment/bwt/PackedHash.hpp
index f1fa540..5b45aef 100644
--- a/alignment/bwt/PackedHash.hpp
+++ b/alignment/bwt/PackedHash.hpp
@@ -71,8 +71,8 @@ public:
Free();
tableLength = CeilOfFraction(sequenceLength, (DNALength) BinSize);
- table = new uint32_t[tableLength];
- values = new uint64_t[tableLength];
+ table = ProtectedNew<uint32_t>(tableLength);
+ values = ProtectedNew<uint64_t>(tableLength);
std::fill(&table[0], &table[tableLength], 0);
std::fill(&values[0], &values[tableLength], 0);
hashLengths.resize(tableLength);
@@ -122,7 +122,7 @@ public:
DNALength v0, v1;
v0 = ((DNALength)storage);
v1 = ((DNALength)(storage >> 32));
- DNALength *storagePtr = new DNALength[3];
+ DNALength *storagePtr = ProtectedNew<DNALength>(3);
storage = (uint64_t) storagePtr;
//
@@ -149,7 +149,7 @@ public:
* and inserts the new value into its position that maintains
* sorted order in the list.
*/
- DNALength *newListPtr = new DNALength[curStorageLength + 1];
+ DNALength *newListPtr = ProtectedNew<DNALength>(curStorageLength + 1);
//
// Copy the values from the old list making space for the new
// value.
@@ -331,15 +331,15 @@ public:
Free();
in.read((char*)&tableLength, sizeof(tableLength));
if (tableLength > 0) {
- table = new uint32_t[tableLength];
- values = new uint64_t[tableLength];
+ table = ProtectedNew<uint32_t>(tableLength);
+ values = ProtectedNew<uint64_t>(tableLength);
in.read((char*)table, sizeof(uint32_t)*tableLength);
in.read((char*)values, sizeof(uint64_t)*tableLength);
DNALength tablePos;
for (tablePos = 0; tablePos < tableLength; tablePos++) {
int nSetBits = CountBits(table[tablePos]);
if (nSetBits > 2) {
- values[tablePos] = (uint64_t) new uint32_t[nSetBits];
+ values[tablePos] = (uint64_t)(ProtectedNew<uint32_t>(nSetBits));
in.read((char*)values[tablePos], nSetBits * sizeof(uint32_t));
}
}
diff --git a/alignment/datastructures/alignment/FilterCriteria.cpp b/alignment/datastructures/alignment/FilterCriteria.cpp
index 16f4bff..8fbb587 100644
--- a/alignment/datastructures/alignment/FilterCriteria.cpp
+++ b/alignment/datastructures/alignment/FilterCriteria.cpp
@@ -18,6 +18,8 @@
#include "FilterCriteria.hpp"
+constexpr float Score::errorunit;
+
Score::Score(const float & value, const ScoreSign & sign)
: _value(value)
, _sign(sign)
diff --git a/alignment/files/FragmentCCSIterator.cpp b/alignment/files/FragmentCCSIterator.cpp
index d02876c..ac0b64a 100644
--- a/alignment/files/FragmentCCSIterator.cpp
+++ b/alignment/files/FragmentCCSIterator.cpp
@@ -23,8 +23,7 @@ Initialize(CCSSequence *_seqPtr, RegionTable *_regionTablePtr) {
//
// Since this iterator covers all passes, and not just those
// included in the ccs, the the regions need to be loaded.
- //
- CollectSubreadIntervals(*seqPtr, regionTablePtr, subreadIntervals);
+ subreadIntervals = (*regionTablePtr)[seqPtr->HoleNumber()].SubreadIntervals(seqPtr->unrolledRead.length, true);
if (subreadIntervals.size() == 0) { return;}
readIntervalDirection.resize(subreadIntervals.size());
diff --git a/alignment/files/ReaderAgglomerate.cpp b/alignment/files/ReaderAgglomerate.cpp
index ff04a08..219da10 100644
--- a/alignment/files/ReaderAgglomerate.cpp
+++ b/alignment/files/ReaderAgglomerate.cpp
@@ -15,6 +15,7 @@ void ReaderAgglomerate::InitializeParameters() {
#ifdef USE_PBBAM
bamFilePtr = NULL;
entireFileQueryPtr = NULL;
+ zmwGroupQueryPtr = NULL;
#endif
}
@@ -156,6 +157,7 @@ bool ReaderAgglomerate::HasRegionTable() {
#define RESET_PBBAM_PTRS() \
if (bamFilePtr != NULL) {delete bamFilePtr; bamFilePtr = NULL;} \
+ if (zmwGroupQueryPtr != NULL) {delete zmwGroupQueryPtr; zmwGroupQueryPtr = NULL;} \
if (entireFileQueryPtr != NULL) {delete entireFileQueryPtr; entireFileQueryPtr = NULL;}
#endif
@@ -202,13 +204,18 @@ int ReaderAgglomerate::Initialize() {
RESET_PBBAM_PTRS();
try {
bamFilePtr = new PacBio::BAM::BamFile(fileName);
+ assert(bamFilePtr != nullptr);
} catch (std::exception e) {
cout << "ERROR! Failed to open " << fileName
<< ": " << e.what() << endl;
return 0;
}
entireFileQueryPtr = new PacBio::BAM::EntireFileQuery(*bamFilePtr);
+ assert(entireFileQueryPtr != nullptr);
bamIterator = entireFileQueryPtr->begin();
+ zmwGroupQueryPtr = new PacBio::BAM::QNameQuery(*bamFilePtr);
+ assert(zmwGroupQueryPtr != nullptr);
+ zmwGroupIterator = zmwGroupQueryPtr->begin();
break;
#endif
case HDFCCS:
@@ -315,6 +322,32 @@ int ReaderAgglomerate::GetNext(FASTQSequence &seq) {
return numRecords;
}
+int ReaderAgglomerate::GetNext(vector<SMRTSequence> & reads) {
+ int numRecords = 0;
+ reads.clear();
+
+ if (Subsample(subsample) == 0) {
+ return 0;
+ }
+ if (fileType == PBBAM) {
+#ifdef USE_PBBAM
+ if (zmwGroupIterator != zmwGroupQueryPtr->end()) {
+ const vector<PacBio::BAM::BamRecord> & records = *zmwGroupIterator;
+ numRecords = records.size();
+ reads.resize(numRecords);
+ for (size_t i=0; i < records.size(); i++) {
+ reads[i].Copy(records[i]);
+ }
+ zmwGroupIterator++;
+ }
+#endif
+ } else {
+ UNREACHABLE();
+ }
+ if (numRecords >= 1) readGroupId = reads[0].ReadGroupId();
+ return numRecords;
+}
+
int ReaderAgglomerate::GetNext(SMRTSequence &seq) {
int numRecords = 0;
@@ -355,14 +388,15 @@ int ReaderAgglomerate::GetNext(SMRTSequence &seq) {
// and should be empty, use this->readGroupId instead. Otherwise,
// read group id should be loaded from BamRecord to SMRTSequence,
// update this->readGroupId accordingly.
- if (fileType != PBBAM) seq.SetReadGroupId(readGroupId);
- else readGroupId = seq.GetReadGroupId();
+ if (fileType != PBBAM) seq.ReadGroupId(readGroupId);
+ else readGroupId = seq.ReadGroupId();
if (stride > 1)
Advance(stride-1);
return numRecords;
}
+
int ReaderAgglomerate::GetNextBases(SMRTSequence &seq, bool readQVs) {
int numRecords = 0;
@@ -400,8 +434,8 @@ int ReaderAgglomerate::GetNextBases(SMRTSequence &seq, bool readQVs) {
break;
}
- if (fileType != PBBAM) seq.SetReadGroupId(readGroupId);
- else readGroupId = seq.GetReadGroupId();
+ if (fileType != PBBAM) seq.ReadGroupId(readGroupId);
+ else readGroupId = seq.ReadGroupId();
if (stride > 1)
Advance(stride-1);
@@ -418,13 +452,11 @@ int ReaderAgglomerate::GetNext(CCSSequence &seq) {
case Fasta:
// This just reads in the fasta sequence as if it were a ccs sequence
numRecords = fastaReader.GetNext(seq);
- seq.subreadStart = 0;
- seq.subreadEnd = 0;
+ seq.SubreadStart(0).SubreadEnd(0);
break;
case Fastq:
numRecords = fastqReader.GetNext(seq);
- seq.subreadStart = 0;
- seq.subreadEnd = 0;
+ seq.SubreadStart(0).SubreadEnd(0);
break;
case HDFPulse:
case HDFBase:
@@ -444,8 +476,8 @@ int ReaderAgglomerate::GetNext(CCSSequence &seq) {
break;
}
- if (fileType != PBBAM) seq.SetReadGroupId(readGroupId);
- else readGroupId = seq.GetReadGroupId();
+ if (fileType != PBBAM) seq.ReadGroupId(readGroupId);
+ else readGroupId = seq.ReadGroupId();
if (stride > 1)
Advance(stride-1);
diff --git a/alignment/files/ReaderAgglomerate.hpp b/alignment/files/ReaderAgglomerate.hpp
index 0955498..905d432 100644
--- a/alignment/files/ReaderAgglomerate.hpp
+++ b/alignment/files/ReaderAgglomerate.hpp
@@ -19,6 +19,7 @@
#ifdef USE_PBBAM
#include "pbbam/BamFile.h"
#include "pbbam/EntireFileQuery.h"
+#include "pbbam/QNameQuery.h"
#include "pbbam/BamRecord.h"
#endif
@@ -105,6 +106,7 @@ public:
int GetNext(FASTQSequence &seq);
int GetNext(SMRTSequence &seq);
int GetNext(CCSSequence &seq);
+ int GetNext(vector<SMRTSequence> & reads);
template<typename T_Sequence>
int GetNext(T_Sequence & seq, int & randNum);
@@ -121,6 +123,8 @@ public:
PacBio::BAM::BamFile * bamFilePtr;
PacBio::BAM::EntireFileQuery * entireFileQueryPtr;
PacBio::BAM::EntireFileQuery::iterator bamIterator;
+ PacBio::BAM::QNameQuery * zmwGroupQueryPtr;
+ PacBio::BAM::QNameQuery::iterator zmwGroupIterator;
#endif
};
diff --git a/alignment/format/BAMPrinter.hpp b/alignment/format/BAMPrinter.hpp
index 50f6b79..b1a53f0 100644
--- a/alignment/format/BAMPrinter.hpp
+++ b/alignment/format/BAMPrinter.hpp
@@ -8,20 +8,32 @@
#include "pbbam/BamHeader.h"
#include "pbbam/BamWriter.h"
+namespace BAMOutput {
+
+template<typename T_Sequence>
+void SetAlignedSequence(T_AlignmentCandidate &alignment, T_Sequence &read,
+ T_Sequence &alignedSeq);
+
+template<typename T_Sequence>
+void CreateCIGARString(T_AlignmentCandidate &alignment,
+ T_Sequence &read, std::string &cigarString, const bool cigarUseSeqMatch);
+
template<typename T_Sequence>
void AlignmentToBamRecord(T_AlignmentCandidate & alignment,
T_Sequence & read, PacBio::BAM::BamRecord & bamRecord,
AlignmentContext & context, SupplementalQVList & qvList,
- Clipping clipping);
+ Clipping clipping, bool cigarUseSeqMatch);
+
-namespace BAMOutput {
template<typename T_Sequence>
void PrintAlignment(T_AlignmentCandidate &alignment, T_Sequence &read,
PacBio::BAM::BamWriter &bamWriter, AlignmentContext &context,
- SupplementalQVList & qvList, Clipping clipping);
+ SupplementalQVList & qvList, Clipping clipping,
+ bool cigarUseSeqMatch=false);
}
+
#include "BAMPrinterImpl.hpp"
#endif
diff --git a/alignment/format/BAMPrinterImpl.hpp b/alignment/format/BAMPrinterImpl.hpp
index e023a4d..2377911 100644
--- a/alignment/format/BAMPrinterImpl.hpp
+++ b/alignment/format/BAMPrinterImpl.hpp
@@ -11,10 +11,53 @@ using namespace std;
#include "pbbam/BamFile.h"
template<typename T_Sequence>
-void AlignmentToBamRecord(T_AlignmentCandidate & alignment,
+void BAMOutput::CreateCIGARString(T_AlignmentCandidate &alignment,
+ T_Sequence &read, std::string &cigarString, const bool cigarUseSeqMatch)
+{
+ cigarString = "";
+ // All cigarString use the no clipping core
+ std::vector<int> opSize;
+ std::vector<char> opChar;
+
+ SAMOutput::CreateNoClippingCigarOps(alignment, opSize, opChar, cigarUseSeqMatch);
+
+ // Clipping needs to be added
+ DNALength prefixSoftClip = alignment.QAlignStart() - read.SubreadStart();
+ DNALength suffixSoftClip = read.SubreadEnd() - alignment.QAlignEnd();
+
+ if (alignment.tStrand == 1) {
+ std::swap(prefixSoftClip, suffixSoftClip);
+ }
+ if (prefixSoftClip > 0) {
+ opSize.insert(opSize.begin(), prefixSoftClip);
+ opChar.insert(opChar.begin(), 'S');
+ }
+ if (suffixSoftClip > 0) {
+ opSize.push_back(suffixSoftClip);
+ opChar.push_back('S');
+ }
+ SAMOutput::CigarOpsToString(opSize, opChar, cigarString);
+}
+
+template<typename T_Sequence>
+void BAMOutput::SetAlignedSequence(T_AlignmentCandidate &alignment, T_Sequence &read,
+ T_Sequence &alignedSeq) {
+ if (alignment.tStrand == 0) {
+ alignedSeq.ReferenceSubstring(read);
+ }
+ else {
+ T_Sequence subSeq;
+ subSeq.ReferenceSubstring(read);
+ subSeq.MakeRC(alignedSeq);
+ }
+}
+
+template<typename T_Sequence>
+void BAMOutput::AlignmentToBamRecord(T_AlignmentCandidate & alignment,
T_Sequence & read, PacBio::BAM::BamRecord & bamRecord,
AlignmentContext & context, SupplementalQVList & qvList,
- Clipping clipping) {
+ Clipping clipping, bool cigarUseSeqMatch) {
+ // soft clipping and subread clipping are identical for BAM
assert(clipping == SAMOutput::soft or clipping == SAMOutput::subread);
// Build from scratch if input reads are not from pbbam files.
@@ -32,14 +75,11 @@ void AlignmentToBamRecord(T_AlignmentCandidate & alignment,
// build cigar string.
string cigarString;
- T_Sequence alignedSequence;
- DNALength prefixSoftClip = 0, suffixSoftClip = 0;
- DNALength prefixHardClip = 0, suffixHardClip = 0;
- CreateCIGARString(alignment, read, cigarString, clipping,
- prefixSoftClip, suffixSoftClip,
- prefixHardClip, suffixHardClip);
- SetAlignedSequence(alignment, read, alignedSequence, clipping);
+ BAMOutput::CreateCIGARString(alignment, read, cigarString, cigarUseSeqMatch);
PacBio::BAM::Cigar cigar = PacBio::BAM::Cigar::FromStdString(cigarString);
+
+ T_Sequence alignedSequence;
+ BAMOutput::SetAlignedSequence(alignment, read, alignedSequence);
// build flag
uint16_t flag;
@@ -86,8 +126,8 @@ void AlignmentToBamRecord(T_AlignmentCandidate & alignment,
PacBio::BAM::TagCollection tags;
tags["RG"] = context.readGroupId;
if (dynamic_cast<CCSSequence*>(&read) == NULL) { // subread
- tags["qs"] = read.subreadStart;
- tags["qe"] = read.subreadEnd;
+ tags["qs"] = read.SubreadStart();
+ tags["qe"] = read.SubreadEnd();
/// Number of passes for a subread should always be 1.
tags["np"] = 1;
} else { // ccs read
@@ -150,16 +190,15 @@ void AlignmentToBamRecord(T_AlignmentCandidate & alignment,
// Set Flag
bamRecord.Impl().Flag(static_cast<uint32_t>(flag));
-
}
template<typename T_Sequence>
void BAMOutput::PrintAlignment(T_AlignmentCandidate &alignment, T_Sequence &read,
PacBio::BAM::BamWriter &bamWriter, AlignmentContext &context,
- SupplementalQVList & qvList, Clipping clipping) {
+ SupplementalQVList & qvList, Clipping clipping, bool cigarUseSeqMatch) {
PacBio::BAM::BamRecord bamRecord;
- AlignmentToBamRecord(alignment, read, bamRecord, context, qvList, clipping);
+ BAMOutput::AlignmentToBamRecord(alignment, read, bamRecord, context, qvList, clipping, cigarUseSeqMatch);
bamWriter.Write(bamRecord);
}
#endif
diff --git a/alignment/format/SAMHeaderPrinter.cpp b/alignment/format/SAMHeaderPrinter.cpp
index 6bad4a5..c8fc103 100644
--- a/alignment/format/SAMHeaderPrinter.cpp
+++ b/alignment/format/SAMHeaderPrinter.cpp
@@ -1,14 +1,15 @@
+#include <assert.h>
#include "format/SAMHeaderPrinter.hpp"
const std::string SAMVERSION("1.5");
-const std::string PBBAMVERSION("3.0b5");
+const std::string PBBAMVERSION("3.0.1");
const std::string PACBIOPL("PACBIO");
std::vector<SAMHeaderItem> MakeSAMHeaderItems(const std::string & fromString){
std::vector<SAMHeaderItem> items;
std::vector<std::string> vs;
- Tokenize(fromString, ";", vs);
+ Splice(fromString, ";", vs);
std::vector<std::string>::iterator it;
for (it = vs.begin(); it != vs.end(); it++) {
items.push_back(SAMHeaderItem(*it));
@@ -33,15 +34,18 @@ std::string SAMHeaderItem::ToString() {
// SAMHeaderTag
SAMHeaderTag::SAMHeaderTag(const std::string & fromString) {
- std::vector<std::string> vs;
- Tokenize(fromString, ":", vs);
- if (vs.size() == 2) {
- _tagName = vs[0];
- if (vs[1].find("=") != std::string::npos) {
- AddItems(vs[1]);
+ size_t pos = fromString.find(":");
+ if (pos != string::npos) {
+ _tagName = fromString.substr(0, pos);
+ string tagValue = fromString.substr(pos + 1);
+ if (tagValue.find("=") != std::string::npos) {
+ AddItems(tagValue);
} else {
- _tagValue = vs[1];
+ _tagValue = tagValue;
}
+ } else {
+ cout << "Unable to parse SAM/BAM header" << fromString << endl;
+ exit(1);
}
}
@@ -86,7 +90,7 @@ void SAMHeaderTag::AddItems(const std::string & fromString) {
SAMHeaderGroup::SAMHeaderGroup(const std::string & fromString) {
if (fromString == "" || fromString[0] != '@') return;
std::vector<std::string> vs;
- Tokenize(fromString.substr(1), "\t", vs);
+ Splice(fromString.substr(1), "\t", vs);
if (vs.size() >= 1) {
std::vector<std::string>::iterator it = vs.begin();
_groupName = (*it);
@@ -306,6 +310,7 @@ SAMHeaderRGs SAMHeaderPrinter::MakeRGs(const std::vector<std::string> & readsFil
if (fileType != PBBAM) {
ReaderAgglomerate * reader = new ReaderAgglomerate();
+ assert(reader != nullptr);
std::vector<std::string>::const_iterator rfit;
for(rfit = readsFiles.begin(); rfit != readsFiles.end(); rfit++) {
std::string rf(*rfit);
diff --git a/alignment/format/SAMPrinter.cpp b/alignment/format/SAMPrinter.cpp
index 9ff7520..c44c40a 100644
--- a/alignment/format/SAMPrinter.cpp
+++ b/alignment/format/SAMPrinter.cpp
@@ -77,9 +77,10 @@ void SAMOutput::AddGaps(T_AlignmentCandidate &alignment, int gapIndex,
}
}
-void SAMOutput::AddMatchBlockCigarOps(DNASequence & qSeq, DNASequence & tSeq, blasr::Block & b,
+void SAMOutput::AddMatchBlockCigarOps(DNASequence & qSeq, DNASequence & tSeq,
+ blasr::Block & b, DNALength & qSeqPos, DNALength & tSeqPos,
std::vector<int> & opSize, std::vector<char> & opChar) {
- DNALength qPos = b.qPos, tPos = b.tPos, n = 0;
+ DNALength qPos = qSeqPos + b.qPos, tPos = tSeqPos + b.tPos, n = 0;
bool started = false, prevSeqMatch = false;
for(DNALength i = 0; i < b.length; i++) {
bool curSeqMatch = (qSeq[qPos + i] == tSeq[tPos + i]);
@@ -137,6 +138,7 @@ void SAMOutput::CreateNoClippingCigarOps(T_AlignmentCandidate &alignment,
AddMatchBlockCigarOps(alignment.qAlignedSeq,
alignment.tAlignedSeq,
alignment.blocks[b],
+ alignment.qPos, alignment.tPos,
opSize, opChar);
} else {
opSize.push_back(matchLength);
@@ -158,6 +160,7 @@ void SAMOutput::CreateNoClippingCigarOps(T_AlignmentCandidate &alignment,
AddMatchBlockCigarOps(alignment.qAlignedSeq,
alignment.tAlignedSeq,
alignment.blocks[b],
+ alignment.qPos, alignment.tPos,
opSize, opChar);
} else {
opSize.push_back(matchLength);
diff --git a/alignment/format/SAMPrinter.hpp b/alignment/format/SAMPrinter.hpp
index 2628690..c47f8ec 100644
--- a/alignment/format/SAMPrinter.hpp
+++ b/alignment/format/SAMPrinter.hpp
@@ -41,6 +41,7 @@ void AddGaps(T_AlignmentCandidate &alignment, int gapIndex,
// Add sequence match/mismatch CIGAR string Ops for block b.
void AddMatchBlockCigarOps(DNASequence & qSeq, DNASequence & tSeq, blasr::Block & b,
+ DNALength & qSeqPos, DNALength & tSeqPos,
std::vector<int> & opSize, std::vector<char> & opChar);
// If cigarUseSeqMatch is true, cigar string uses '=' and 'X'
diff --git a/alignment/format/SAMPrinterImpl.hpp b/alignment/format/SAMPrinterImpl.hpp
index f83d1a9..ec20b49 100644
--- a/alignment/format/SAMPrinterImpl.hpp
+++ b/alignment/format/SAMPrinterImpl.hpp
@@ -27,8 +27,8 @@ void SAMOutput::SetAlignedSequence(T_AlignmentCandidate &alignment, T_Sequence &
clippedStartPos = read.lowQualityPrefix;
}
else if (clipping == subread) {
- clippedReadLength = read.subreadEnd - read.subreadStart;
- clippedStartPos = read.subreadStart;
+ clippedReadLength = read.SubreadLength();
+ clippedStartPos = read.SubreadStart();
}
else {
std::cout <<" ERROR! The clipping must be none, hard, subread, or soft when setting the aligned sequence." << std::endl;
@@ -130,8 +130,8 @@ void SAMOutput::CreateCIGARString(T_AlignmentCandidate &alignment,
suffixHardClip = read.lowQualitySuffix;
}
else if (clipping == subread) {
- prefixHardClip = std::max((DNALength) read.subreadStart, read.lowQualityPrefix);
- suffixHardClip = std::max((DNALength)(read.length - read.subreadEnd), read.lowQualitySuffix);
+ prefixHardClip = std::max((DNALength) read.SubreadStart(), read.lowQualityPrefix);
+ suffixHardClip = std::max((DNALength)(read.length - read.SubreadEnd()), read.lowQualitySuffix);
}
SetSoftClip(alignment, read, prefixHardClip, suffixHardClip, prefixSoftClip, suffixSoftClip);
@@ -280,9 +280,9 @@ void SAMOutput::PrintAlignment(T_AlignmentCandidate &alignment,
assert(read.length - suffixHardClip == prefixHardClip + alignedSequence.length);
samFile << "XE:i:" << xe + 1 << "\t";
}
- samFile << "YS:i:" << read.subreadStart << "\t";
- samFile << "YE:i:" << read.subreadEnd << "\t";
- samFile << "ZM:i:" << read.zmwData.holeNumber << "\t";
+ samFile << "YS:i:" << read.SubreadStart() << "\t";
+ samFile << "YE:i:" << read.SubreadEnd() << "\t";
+ samFile << "ZM:i:" << read.HoleNumber() << "\t";
samFile << "XL:i:" << alignment.qAlignedSeq.length << "\t";
samFile << "XT:i:1\t"; // reads are allways continuous reads, not
// referenced based circular consensus when
diff --git a/alignment/makefile b/alignment/makefile
new file mode 100644
index 0000000..61295eb
--- /dev/null
+++ b/alignment/makefile
@@ -0,0 +1,47 @@
+all:
+
+THISDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+-include ${CURDIR}/defines.mk
+include ${THISDIR}/../rules.mk
+
+CXXOPTS := -std=c++11 -pedantic -Wno-long-long
+INCLUDES += ${THISDIR} ${LIBPBDATA_INC} ${LIBPBIHDF_INC} ${HDF5_INC} ${PBBAM_INC} ${HTSLIB_INC} ${BOOST_INC}
+LIBS += ${LIBPBDATA_LIB} ${LIBPBIHDF_LIB} ${HDF5_LIB} ${PBBAM_LIB} ${HTSLIB_LIB} ${ZLIB_LIB}
+LDFLAGS += $(patsubst %,-L%,${LIBS})
+LDLIBS += -lpbdata
+
+ifeq (${nohdf},)
+ LDLIBS+= -lpbihdf
+ #LDFLAGS+= -flat_namespace # so we do not need LDLIBS+= -lhdf5 -lhdf5_cpp
+endif
+# We might also need some -l* for pbbam, etc.
+
+all: libblasr.a libblasr${SH_LIB_EXT}
+
+paths := . simulator format files utils tuples statistics qvs suffixarray \
+ datastructures/alignment datastructures/alignmentset datastructures/anchoring datastructures/tuplelists \
+ algorithms/alignment algorithms/alignment/sdp algorithms/anchoring algorithms/compare algorithms/sorting
+paths := $(patsubst %,${THISDIR}%,${paths})
+sources := $(shell find ${THISDIR} -name '*.cpp')
+
+ifdef nohdf
+sources := $(filter-out ${THISDIR}files/% ${THISDIR}utils/FileOfFileNames.cpp ${THISDIR}format/SAMHeaderPrinter.cpp, $(sources))
+endif
+
+sources := $(notdir ${sources})
+objects := $(sources:.cpp=.o)
+shared_objects := $(sources:.cpp=.shared.o)
+dependencies := $(objects:.o=.d) $(shared_objects:.o=.d)
+
+vpath %.cpp ${paths}
+
+libblasr.a: $(objects)
+ $(AR) $(ARFLAGS) $@ $^
+
+libblasr${SH_LIB_EXT}: $(shared_objects)
+
+clean:
+ rm -f libblasr.a libblasr.so *.o *.d
+
+-include $(dependencies)
+depend: $(dependencies:.d=.depend)
diff --git a/alignment/simple.mk b/alignment/simple.mk
deleted file mode 100644
index 253d7a2..0000000
--- a/alignment/simple.mk
+++ /dev/null
@@ -1,82 +0,0 @@
-# Requirements:
-# pbbam
-# htslib
-# hdf5
-# boost
-# Plus relative packages:
-# pbdata
-# hdf
-PREFIX?=/usr
-include ../simple.mk
-
-LIBPBDATA_INCLUDE := ../pbdata
-LIBPBIHDF_INCLUDE := ../hdf
-#PBBAM_INCLUDE := $(PBBAM)/include
-#HTSLIB_INCLUDE := $(PBBAM)/third-party/htslib
-
-INCLUDES = -I${PREFIX}/include \
- -I$(LIBPBDATA_INCLUDE) \
- -I$(LIBPBIHDF_INCLUDE) \
- -I.
-
-#ifneq ($(ZLIB_ROOT), notfound)
-# INCLUDES += -I$(ZLIB_ROOT)/include
-#endif
-
-#ifeq ($(origin nopbbam), undefined)
-# INCLUDES += -I$(PBBAM_INCLUDE) -I$(HTSLIB_INCLUDE) -I$(BOOST_INCLUDE)
-#endif
-
-CXXOPTS := -std=c++11 -pedantic -Wno-long-long -MMD -MP
-
-sources := $(wildcard algorithms/alignment/*.cpp) \
- $(wildcard algorithms/alignment/sdp/*.cpp) \
- $(wildcard algorithms/anchoring/*.cpp) \
- $(wildcard algorithms/compare/*.cpp) \
- $(wildcard algorithms/sorting/*.cpp) \
- $(wildcard datastructures/alignment/*.cpp) \
- $(wildcard datastructures/alignmentset/*.cpp) \
- $(wildcard datastructures/anchoring/*.cpp) \
- $(wildcard datastructures/tuplelists/*.cpp) \
- $(wildcard suffixarray/*.cpp) \
- $(wildcard qvs/*.cpp) \
- $(wildcard statistics/*.cpp) \
- $(wildcard tuples/*.cpp) \
- $(wildcard utils/*.cpp) \
- $(wildcard files/*.cpp) \
- $(wildcard format/*.cpp) \
- $(wildcard simulator/*.cpp) \
- $(wildcard *.cpp)
-
-ifdef nohdf
-sources := $(filter-out files/% utils/FileOfFileNames.cpp format/SAMHeaderPrinter.cpp, $(sources))
-endif
-
-objects := $(sources:.cpp=.o)
-dependencies := $(sources:.cpp=.d)
-
-all : CXXFLAGS ?= -O3
-
-debug : CXXFLAGS ?= -g -ggdb -fno-inline
-
-profile : CXXFLAGS ?= -Os -pg
-
-g: CXXFLAGS = -g -ggdb -fno-inline -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer
-
-all debug profile g: libblasr.a
-
-libblasr.a: $(objects)
- $(AR_pp) $(ARFLAGS) $@ $^
-
-%.o: %.cpp
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) $(LEGACY) $(INCLUDES) -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.o) $(@:%.o=%.d)" -c $< -o $@
-
-# .INTERMEDIATE: $(objects)
-
-clean:
- @rm -f libblasr.a
- @find . -type f -name \*.o -delete
- @find . -type f -name \*.d -delete
-
-
--include $(dependencies)
diff --git a/alignment/simulator/ContextOutputList.hpp b/alignment/simulator/ContextOutputList.hpp
index 5023463..870ac64 100644
--- a/alignment/simulator/ContextOutputList.hpp
+++ b/alignment/simulator/ContextOutputList.hpp
@@ -68,7 +68,7 @@ public:
int e;
i = 0;
if (outputMap.find(context) == outputMap.end()) {
- outputMap[context] = new OutputList;
+ outputMap[context] = ProtectedNew<OutputList>();
}
while(i < outputsLine.size()) {
diff --git a/alignment/simulator/ContextSet.cpp b/alignment/simulator/ContextSet.cpp
index 1d84d72..7f08785 100644
--- a/alignment/simulator/ContextSet.cpp
+++ b/alignment/simulator/ContextSet.cpp
@@ -21,13 +21,13 @@ void ContextSampleMap::Read(std::ifstream &in) {
int numContext;
in.read((char*)&numContext, sizeof(numContext));
int i;
- char *context = new char[contextLength+1];
+ char *context = ProtectedNew<char>(contextLength+1);
context[contextLength] = '\0';
for (i = 0; i < numContext; i++) {
in.read(context, contextLength);
std::string contextString = context;
// Allocate the context
- (*this)[contextString] = new ContextSample;
+ (*this)[contextString] = ProtectedNew<ContextSample>();
(*this)[contextString]->Read(in);
}
delete[] context;
diff --git a/alignment/simulator/OutputSampleListSet.hpp b/alignment/simulator/OutputSampleListSet.hpp
index dbfe8d0..0a36ae2 100644
--- a/alignment/simulator/OutputSampleListSet.hpp
+++ b/alignment/simulator/OutputSampleListSet.hpp
@@ -4,6 +4,7 @@
#include <map>
#include <string>
#include <iostream>
+#include "utils.hpp"
#include "OutputSampleList.hpp"
@@ -67,7 +68,7 @@ class OutputSampleListSet {
in.read((char*) &keyLength, sizeof(int));
if (keyLength == 0 or setSize == 0) { return; }
- char *key = new char[keyLength+1];
+ char *key = ProtectedNew<char>(keyLength+1);
key[keyLength] = '\0';
int i;
for (i = 0; i < setSize; i++) {
diff --git a/alignment/suffixarray/LCPTable.hpp b/alignment/suffixarray/LCPTable.hpp
index 85037f2..99f594e 100644
--- a/alignment/suffixarray/LCPTable.hpp
+++ b/alignment/suffixarray/LCPTable.hpp
@@ -3,6 +3,7 @@
#include <map>
#include <fstream>
+#include "utils.hpp"
template <typename T>
class LCPTable {
@@ -38,8 +39,8 @@ class LCPTable {
void Init(T* data, unsigned int pTableLength, unsigned int *index) {
tableLength = pTableLength;
maxPrefixLength = (PrefixLength) (SignedPrefixLength(-1));
- llcp = new PrefixLength[tableLength];
- rlcp = new PrefixLength[tableLength];
+ llcp = ProtectedNew<PrefixLength>(tableLength);
+ rlcp = ProtectedNew<PrefixLength>(tableLength);
std::fill(llcp, llcp + tableLength, 0);
std::fill(rlcp, rlcp + tableLength, 0);
FillTable(data, index);
@@ -94,14 +95,14 @@ class LCPTable {
~LCPTable() {
- /*
- if (llcp != NULL)
- delete[] llcp;
- llcp = NULL;
- if (rlcp != NULL)
- delete[] rlcp;
- rlcp = NULL;
- */
+ if (llcp != NULL) {
+ delete[] llcp;
+ llcp = NULL;
+ }
+ if (rlcp != NULL) {
+ delete[] rlcp;
+ rlcp = NULL;
+ }
// the two maps automatically go away.
}
diff --git a/alignment/suffixarray/SuffixArray.hpp b/alignment/suffixarray/SuffixArray.hpp
index 907bf1b..b816d5e 100644
--- a/alignment/suffixarray/SuffixArray.hpp
+++ b/alignment/suffixarray/SuffixArray.hpp
@@ -309,7 +309,6 @@ public:
std::fill(count.begin(), count.end(), 0);
assert(index == NULL or not deleteStructures);
index = ProtectedNew<SAIndex>(targetLength);
- //index = new SAIndex[targetLength];
deleteStructures = true;
for (a = 0; a < alphabet.size(); a++ ) {
bucket[a] = -1;
diff --git a/alignment/suffixarray/ssort.hpp b/alignment/suffixarray/ssort.hpp
index 0a6f9b8..7b451bc 100644
--- a/alignment/suffixarray/ssort.hpp
+++ b/alignment/suffixarray/ssort.hpp
@@ -234,6 +234,7 @@ Bad input
*/
#include <stdlib.h>
+#include "utils.hpp"
enum {
ORIG = ~(~0u>>1), /* sign bit */
@@ -262,8 +263,7 @@ ssort(SAIndex a[], SAIndex s[])
j = a[n]; /* and max element */
if(a[n++]<0 || j>=n)
finish(2);
- // p = malloc(n*sizeof(int));
- p = new SAIndex[n];
+ p = ProtectedNew<SAIndex>(n);
if(p == 0)
finish(1);
@@ -272,7 +272,7 @@ ssort(SAIndex a[], SAIndex s[])
if(s) { /* shared lengths */
// q = malloc(n*sizeof(int));
- q = new SAIndex[n];
+ q = ProtectedNew<SAIndex>(n);
if(q == 0)
finish(1);
}
diff --git a/alignment/tuples/TupleCountTableImpl.hpp b/alignment/tuples/TupleCountTableImpl.hpp
index 805dc66..a2d9e0e 100644
--- a/alignment/tuples/TupleCountTableImpl.hpp
+++ b/alignment/tuples/TupleCountTableImpl.hpp
@@ -1,5 +1,6 @@
#ifndef _BLASR_TUPLE_COUNT_TABLE_IMPL_HPP_
#define _BLASR_TUPLE_COUNT_TABLE_IMPL_HPP_
+#include "utils.hpp"
using namespace std;
@@ -15,7 +16,7 @@ void TupleCountTable<TSequence, TTuple>::InitCountTable(TupleMetrics &ptm) {
countTableLength = countTableLength << ((tm.tupleSize - 1)*2);
assert(countTableLength > 0);
- countTable = new int[countTableLength];
+ countTable = ProtectedNew<int>(countTableLength);
deleteStructures = true;
fill(&countTable[0], &countTable[countTableLength], 0);
nTuples = 0;
@@ -93,7 +94,7 @@ void TupleCountTable<TSequence, TTuple>::Read(ifstream &in) {
in.read((char*) &nTuples, sizeof(int));
in.read((char*) &tm.tupleSize, sizeof(int));
tm.InitializeMask();
- countTable = new int[countTableLength];
+ countTable = ProtectedNew<int>(countTableLength);
deleteStructures = true;
in.read((char*) countTable, sizeof(int) * countTableLength);
}
diff --git a/alignment/tuples/TupleListImpl.hpp b/alignment/tuples/TupleListImpl.hpp
index fdbf91f..54bfde7 100644
--- a/alignment/tuples/TupleListImpl.hpp
+++ b/alignment/tuples/TupleListImpl.hpp
@@ -43,7 +43,6 @@ int TupleList<T>::InitFromFile(std::string &fileName) {
listIn.read((char*) &listLength, sizeof(int));
listIn.read((char*) &tm.tupleSize, sizeof(int));
tm.InitializeMask();
- //list = new T[listLength];
tupleList.resize(listLength);
listIn.read((char*) &tupleList[0], sizeof(T) * listLength);
return 1;
diff --git a/alignment/utils/RegionUtils.cpp b/alignment/utils/RegionUtils.cpp
index 0448f2e..305c362 100644
--- a/alignment/utils/RegionUtils.cpp
+++ b/alignment/utils/RegionUtils.cpp
@@ -4,109 +4,18 @@
bool LookupHQRegion(int holeNumber, RegionTable &regionTable,
int &start, int &end, int &score) {
- int regionLowIndex, regionHighIndex;
- regionLowIndex = regionHighIndex = 0;
-
- regionTable.LookupRegionsByHoleNumber(holeNumber,
- regionLowIndex, regionHighIndex);
-
- bool readHasGoodRegion = true;
- int regionIndex = regionLowIndex;
- while (regionIndex < regionHighIndex and
- regionTable.GetType(regionIndex) != HQRegion) {
- regionIndex++;
- }
-
- if (regionIndex == regionHighIndex) {
- start = end = score = 0;
- return false;
- }
- else {
- start = regionTable.GetStart(regionIndex);
- end = regionTable.GetEnd(regionIndex);
- score = regionTable.GetScore(regionIndex);
- return true;
- }
-}
-
-// Functions of class CompareRegionIndicesByStart.
-int CompareRegionIndicesByStart::
-operator()(const int a, const int b) const {
- if (regionTablePtr->GetStart(a) == regionTablePtr->GetStart(b)) {
- return (regionTablePtr->GetEnd(a) < regionTablePtr->GetEnd(b));
- }
- else {
- return (regionTablePtr->GetStart(a) < regionTablePtr->GetStart(b));
- }
-}
-
-// General functions.
-int SortRegionIndicesByStart(RegionTable &regionTable,
- std::vector<int> &indices) {
-
- CompareRegionIndicesByStart cmpFctr;
- cmpFctr.regionTablePtr = &regionTable;
- std::sort(indices.begin(), indices.end(), cmpFctr);
- return indices.size();
-}
-
-
-// Functions of OrderRegionsByReadStart:
-int OrderRegionsByReadStart::
-operator()(const ReadInterval &lhs, const ReadInterval &rhs) const {
- return lhs.start < rhs.start;
-}
-
-
-// General functions.
-int FindRegionIndices(unsigned int holeNumber, RegionTable *regionTablePtr,
- int &regionLowIndex, int &regionHighIndex) {
-
- int regionIndex;
- regionLowIndex = regionHighIndex = 0;
-
- regionTablePtr->LookupRegionsByHoleNumber(holeNumber,
- regionLowIndex, regionHighIndex);
-
- return regionHighIndex - regionLowIndex;
-}
-
-
-int FindRegionIndices(SMRTSequence &read, RegionTable *regionTablePtr,
- int &regionLowIndex, int &regionHighIndex) {
- return FindRegionIndices(read.zmwData.holeNumber,
- regionTablePtr, regionLowIndex, regionHighIndex);
-}
-
-
-//
-// Collect region indices for either all region types, or just a few specific region types.
-//
-//
-int CollectRegionIndices(SMRTSequence &read, RegionTable &regionTable,
- std::vector<int> &regionIndices, RegionType *regionTypes,
- int numRegionTypes) {
-
- int regionLow, regionHigh;
- int prevNumRegionIndices = regionIndices.size();
- if (FindRegionIndices(read, &regionTable, regionLow, regionHigh)) {
- int i;
- for (i = regionLow; i < regionHigh; i++) {
- if (regionTypes == NULL) {
- regionIndices.push_back(i);
- }
- else {
- int t;
- for (t = 0; t < numRegionTypes; t++) {
- if (regionTable.GetType(i) == regionTypes[t]) {
- regionIndices.push_back(i);
- break;
- }
- }
- }
+ if (regionTable.HasHoleNumber(holeNumber)) {
+ RegionAnnotations zmwRegions = regionTable[holeNumber];
+ if (zmwRegions.HasHQRegion()) {
+ start = zmwRegions.HQStart();
+ end = zmwRegions.HQEnd();
+ score = zmwRegions.HQScore();
+ return true;
}
}
- return regionIndices.size() - prevNumRegionIndices;
+
+ start = end = score = 0;
+ return false;
}
diff --git a/alignment/utils/RegionUtils.hpp b/alignment/utils/RegionUtils.hpp
index f5fb870..e5a56d5 100644
--- a/alignment/utils/RegionUtils.hpp
+++ b/alignment/utils/RegionUtils.hpp
@@ -21,54 +21,6 @@ bool GetReadTrimCoordinates(T_Sequence &fastaRead,
ZMWGroupEntry &zmwData, RegionTable &regionTable,
DNALength &readStart ,DNALength &readEnd, int &score);
-template<typename T_Sequence>
-bool TrimRead(T_Sequence &fastaRead, ZMWGroupEntry &zmwData,
- RegionTable &regionTable, T_Sequence &trimmedRead);
-
-
-class CompareRegionIndicesByStart {
-public:
- RegionTable *regionTablePtr;
- int operator()(const int a, const int b) const;
-};
-
-
-int SortRegionIndicesByStart(RegionTable &regionTable,
- std::vector<int> &indices);
-
-class OrderRegionsByReadStart {
-public:
- int operator()(const ReadInterval &lhs, const ReadInterval &rhs) const;
-};
-
-int FindRegionIndices(unsigned int holeNumber, RegionTable *regionTablePtr,
- int &regionLowIndex, int &regionHighIndex);
-
-int FindRegionIndices(SMRTSequence &read, RegionTable *regionTablePtr,
- int &regionLowIndex, int &regionHighIndex);
-
-//
-// Collect region indices for either all region types, or just a few
-// specific region types.
-//
-int CollectRegionIndices(SMRTSequence &read, RegionTable &regionTable,
- std::vector<int> &regionIndices, RegionType *regionTypes=NULL,
- int numRegionTypes = 0);
-
-
-template<typename T_Sequence>
-void CollectSubreadIntervals(T_Sequence &read, RegionTable *regionTablePtr,
- std::vector<ReadInterval> &subreadIntervals, bool byAdapter=false);
-
-// Get all adapter intervals of a ZMW.
-// Input:
-// read - read.zmwData.holeNumber specifies the zmw.
-// regionTablePtr - a pointer to a region table.
-// Output:
-// adapterIntervals - where to assign all adapter intervals of the zmw
-template<typename T_Sequence>
-void CollectAdapterIntervals(T_Sequence &read, RegionTable *regionTablePtr,
- std::vector<ReadInterval> &adapterIntervals);
// Given a vecotr of ReadInterval objects and their corresponding
// directions, intersect each object with an interval
diff --git a/alignment/utils/RegionUtilsImpl.hpp b/alignment/utils/RegionUtilsImpl.hpp
index fc4b859..3ccfbb2 100644
--- a/alignment/utils/RegionUtilsImpl.hpp
+++ b/alignment/utils/RegionUtilsImpl.hpp
@@ -2,214 +2,51 @@
#define _BLASR_REGION_UTILS_IMPL_HPP
+//FIXME: move all functions to class SMRTSequence
template<typename T_Sequence>
-bool MaskRead(T_Sequence &fastaRead, ZMWGroupEntry &zmwData,
- RegionTable &regionTable) {
- int regionIndex;
- int regionLowIndex, regionHighIndex;
- regionLowIndex = regionHighIndex = 0;
-
- regionTable.LookupRegionsByHoleNumber(zmwData.holeNumber,
- regionLowIndex, regionHighIndex);
-
- bool readHasGoodRegion = true;
-
- DNALength readPos;
-
- regionIndex = regionLowIndex;
- int lastHQRegionIndex;
-
- int hqRegionStart=0, hqRegionEnd=0, hqRegionScore = 0;
- readHasGoodRegion = LookupHQRegion(zmwData.holeNumber,
- regionTable, hqRegionStart, hqRegionEnd, hqRegionScore);
-
- //
- // Mask off the low quality portion of this read.
- //
- for (readPos = 0; (readPos < hqRegionStart and
- readPos < fastaRead.length); readPos++) {
- fastaRead.seq[readPos] = 'N';
- }
-
- for (readPos = hqRegionEnd; readPos < fastaRead.length; readPos++) {
- fastaRead.seq[readPos] = 'N';
- }
-
- //
- // Look to see if there is region information provided, but the
- // entire read is bad.
- //
- if (hqRegionEnd == hqRegionStart) {
- //
- // This read is entirely bad, flag that.
- //
- readHasGoodRegion = false;
+bool MaskRead(T_Sequence &fastaRead,
+ ZMWGroupEntry &zmwData,
+ RegionTable &regionTable)
+{
+ if (not regionTable.HasHoleNumber(zmwData.holeNumber)) {
+ return false;
+ } else {
+ RegionAnnotations regions = regionTable[zmwData.holeNumber];
+
+ // Mask off the low quality portion of this read.
+ DNALength readPos;
+ for (readPos = 0; readPos < std::min(regions.HQStart(), fastaRead.length); readPos++) {
+ fastaRead.seq[readPos] = 'N';
+ }
+ for (readPos = regions.HQEnd(); readPos < fastaRead.length; readPos++) {
+ fastaRead.seq[readPos] = 'N';
+ }
+ return regions.HasHQRegion();
}
-
- return readHasGoodRegion;
}
-
+/// \params[in] - fastaRead, zmwData, regionTable
+/// \params[out] - readStart
+/// \params[out] - readEnd
+/// \params[out] - score
+/// \returns Whether or not read coordinate trimmed according to HQRegion
template<typename T_Sequence>
bool GetReadTrimCoordinates(T_Sequence &fastaRead,
ZMWGroupEntry &zmwData, RegionTable &regionTable,
DNALength &readStart ,DNALength &readEnd, int &score) {
- int regionIndex;
- int regionLowIndex, regionHighIndex;
- regionLowIndex = regionHighIndex = 0;
-
- regionTable.LookupRegionsByHoleNumber(zmwData.holeNumber,
- regionLowIndex, regionHighIndex);
-
- bool readHasGoodRegion = true;
-
- DNALength readPos;
-
- regionIndex = regionLowIndex;
- int lastHQRegionIndex;
-
- while (regionIndex < regionHighIndex and
- regionTable.GetType(regionIndex) != HQRegion) {
- regionIndex++;
- }
-
- if (regionIndex < regionHighIndex ) {
- readStart = regionTable.GetStart(regionIndex);
- readEnd = regionTable.GetEnd(regionIndex);
- score = regionTable.GetScore(regionIndex);
- return true;
- }
- else {
- readStart = 0;
- readEnd = fastaRead.length;
- return false;
- }
-}
-
-
-template<typename T_Sequence>
-bool TrimRead(T_Sequence &fastaRead, ZMWGroupEntry &zmwData,
- RegionTable &regionTable, T_Sequence &trimmedRead) {
-
- DNALength readStart, readEnd;
-
- GetReadTrimCoordinates(fastaRead, zmwData, regionTable,
- readStart, readEnd);
-
- if (readEnd - readStart > 0) {
- trimmedRead.CopySubsequence((FASTQSequence&)fastaRead,
- readStart, readEnd);
- // signal that the read has a good region.
- return true;
- }
- else {
- //
- // There is no information for this read. Make it skipped.
- //
- trimmedRead.seq = NULL;
- trimmedRead.CopyTitle(fastaRead.title);
- // signal this read has no good region.
- return false;
- }
-}
-
-template<typename T_Sequence>
-void CollectSubreadIntervals(T_Sequence &read, RegionTable *regionTablePtr,
- std::vector<ReadInterval> &subreadIntervals, bool byAdapter) {
-
- int regionIndex;
- int regionLowIndex, regionHighIndex;
- regionLowIndex = regionHighIndex = 0;
-
- regionTablePtr->LookupRegionsByHoleNumber(read.zmwData.holeNumber,
- regionLowIndex, regionHighIndex);
-
- if (byAdapter == false) {
- // read subreads (insert) directly from region table.
- for (regionIndex = regionLowIndex;
- regionIndex < regionHighIndex; regionIndex++) {
- if (regionTablePtr->GetType(regionIndex) == Insert) {
-
- RegionAnnotation & ra = regionTablePtr->table[regionIndex];
- subreadIntervals.push_back(ReadInterval(
- ra.row[RegionAnnotation::RegionStart],
- ra.row[RegionAnnotation::RegionEnd],
- ra.row[RegionAnnotation::RegionScore]));
- }
- }
- }
- else { // Determine subreads according to adapters only.
- std::vector<int> adapterIntervalIndices;
- for (regionIndex = regionLowIndex;
- regionIndex < regionHighIndex; regionIndex++) {
- if (regionTablePtr->GetType(regionIndex) == Adapter) {
- adapterIntervalIndices.push_back(regionIndex);
- }
- }
- // Sort indices so that the intervals appear in order on the read.
- SortRegionIndicesByStart(*regionTablePtr, adapterIntervalIndices);
- int curIntervalStart = 0;
- int i;
- if (adapterIntervalIndices.size() == 0) {
- // no adapter, this zmw has only one subread (pass)
- subreadIntervals.push_back(ReadInterval(0, read.length));
- }
- else {
- // The first subread covers [0, RegionStart of first adapter)
- subreadIntervals.push_back(ReadInterval(0,
- regionTablePtr->table[adapterIntervalIndices[0]].
- row[RegionAnnotation::RegionStart]));
-
- // The subread[i] covers (RegionEnd of i-1-th adapter, RegionStart of i-th adapter)
- for (i = 0; i + 1 < adapterIntervalIndices.size(); i++) {
- subreadIntervals.push_back(ReadInterval(
- regionTablePtr->table[adapterIntervalIndices[i]].
- row[RegionAnnotation::RegionEnd],
- regionTablePtr->table[adapterIntervalIndices[i+1]].
- row[RegionAnnotation::RegionStart]));
- }
- // The last subread covers (RegionEnd of last adapter, end of read)
- subreadIntervals.push_back(
- ReadInterval(regionTablePtr->table[
- adapterIntervalIndices[adapterIntervalIndices.size()-1]].
- row[RegionAnnotation::RegionEnd],
- read.length));
- }
- }
- sort(subreadIntervals.begin(), subreadIntervals.end(),
- OrderRegionsByReadStart());
-}
-
-
-// Get all adapter intervals of a ZMW.
-// Input:
-// read - read.zmwData.holeNumber specifies the zmw.
-// regionTablePtr - a pointer to a region table.
-// Output:
-// adapterIntervals - where to assign all adapter intervals of the zmw
-template<typename T_Sequence>
-void CollectAdapterIntervals(T_Sequence &read,
- RegionTable *regionTablePtr,
- std::vector<ReadInterval> &adapterIntervals) {
-
- assert(regionTablePtr != NULL);
- int regionLowIndex = 0, regionHighIndex = 0;
-
- regionTablePtr->LookupRegionsByHoleNumber(read.zmwData.holeNumber,
- regionLowIndex, regionHighIndex);
-
- for (int regionIndex = regionLowIndex;
- regionIndex < regionHighIndex; regionIndex++) {
-
- if (regionTablePtr->GetType(regionIndex) == Adapter) {
- RegionAnnotation & ra = regionTablePtr->table[regionIndex];
- adapterIntervals.push_back(ReadInterval(
- ra.row[RegionAnnotation::RegionStart],
- ra.row[RegionAnnotation::RegionEnd],
- ra.row[RegionAnnotation::RegionScore]));
+ if (regionTable.HasHoleNumber(zmwData.holeNumber)) {
+ RegionAnnotations regions = regionTable[zmwData.holeNumber];
+ if (regions.HasHQRegion()) {
+ readStart = regions.HQStart();
+ readEnd = regions.HQEnd();
+ return true;
}
}
+
+ readStart = 0;
+ readEnd = fastaRead.length;
+ return false;
}
#endif
diff --git a/common.mk b/common.mk
deleted file mode 100644
index 8108011..0000000
--- a/common.mk
+++ /dev/null
@@ -1,84 +0,0 @@
-SHELL = bash
-G_BUILDOS_CMD := bash -c 'set -e; set -o pipefail; id=$$(lsb_release -si | tr "[:upper:]" "[:lower:]"); rel=$$(lsb_release -sr); case $$id in ubuntu) printf "$$id-%04d\n" $${rel/./};; centos) echo "$$id-$${rel%%.*}";; *) echo "$$id-$$rel";; esac' 2>/dev/null
-OS_STRING ?= $(shell $(G_BUILDOS_CMD))
-
-ifeq ($(origin PREBUILT), undefined)
-PREBUILT := $(shell cd ../../../../prebuilt.out 2>/dev/null && pwd || echo -n notfound)
-endif
-
-THIRD_PARTY_PREFIX ?= ../..
-
-ifneq ($(COMMON_NO_THIRD_PARTY_REQD),true)
- #
- # Definitions common to all make files for library code.
- # All paths are relative from inside the subdirectories, not this file
- #
-
- # git layout vs p4 layout automagic
- THIRD_PARTY ?= $(shell cd $(abspath $(THIRD_PARTY_PREFIX)/third-party) 2>/dev/null && pwd || echo -n notfound)
- ifeq ($(THIRD_PARTY), notfound)
- THIRD_PARTY := $(shell cd $(abspath $(THIRD_PARTY_PREFIX)/../third-party/cpp) 2>/dev/null && pwd || echo -n notfound)
- endif
-
- # handle HDF5_INC HDF5_LIB
- ifeq ($(origin HDF5_INC), undefined)
- HDF5_INC := $(shell cd $(PREBUILT)/hdf5/hdf5-1.8.12/$(OS_STRING)/include 2>/dev/null && pwd || echo -n notfound)
- else
- HDF5_INC := $(shell cd $(HDF5_INC) 2>/dev/null && pwd || echo -n notfound)
- endif
- ifeq ($(HDF5_INC), notfound)
- ifeq ($(THIRD_PARTY), notfound)
- $(error cannot find third-party libraries!)
- endif
- HDF5_INC := $(shell cd $(THIRD_PARTY)/hdf5/include 2>/dev/null && pwd || echo -n notfound)
- endif
- ifeq ($(origin HDF5_LIB), undefined)
- HDF5_LIB := $(shell cd $(PREBUILT)/hdf5/hdf5-1.8.12/$(OS_STRING)/lib 2>/dev/null && pwd || echo -n notfound)
- else
- HDF5_LIB := $(shell cd $(HDF5_LIB) 2>/dev/null && pwd || echo -n notfound)
- endif
- ifeq ($(HDF5_LIB), notfound)
- ifeq ($(THIRD_PARTY), notfound)
- $(error cannot find third-party libraries!)
- endif
- HDF5_LIB := $(shell cd $(THIRD_PARTY)/hdf5/lib 2>/dev/null && pwd || echo -n notfound)
- endif
-
- # handle ZLIB_ROOT
- ifeq ($(origin ZLIB_ROOT), undefined)
- ZLIB_ROOT := $(shell cd $(PREBUILT)/zlib/zlib-1.2.5/$(OS_STRING) 2>/dev/null && pwd || echo -n notfound)
- else
- ZLIB_ROOT := $(shell cd $(ZLIB_ROOT) 2>/dev/null && pwd || echo -n notfound)
- endif
-
- ifeq ($(ZLIB_ROOT), notfound)
- # we don't need a backup ZLIB_ROOT here, because almost everybody has one in their paths
- endif
-endif
-
-# handle BOOST
-ifeq ($(origin BOOST_INCLUDE), undefined)
-ifeq ($(origin BOOST_ROOT), undefined)
-BOOST_INCLUDE := $(PREBUILT)/boost/boost_1_55_0
-else
-BOOST_INCLUDE := $(BOOST_ROOT)
-endif
-endif
-
-# handle PBBAM
-ifeq ($(origin PBBAM), undefined)
-PBBAM := $(shell cd $(THIRD_PARTY_PREFIX)/../staging/PostPrimary/pbbam 2>/dev/null && pwd || echo -n notfound)
-endif
-
-# magic for non-verbose builds
-V ?= 0
-
-CXX_0 = @echo " CXX $@"; $(CXX)
-CXX_1 = $(CXX)
-CXX_pp = $(CXX_$(V))
-
-AR_0 = @echo " AR $@"; $(AR)
-AR_1 = $(AR)
-AR_pp = $(AR_$(V))
-
-ARFLAGS := rc
diff --git a/configure.py b/configure.py
new file mode 100755
index 0000000..137160d
--- /dev/null
+++ b/configure.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python
+"""Configure the build.
+
+- Fetch HDF5 headers.
+- Create libconfig.h
+- Create defines.mk
+
+This is not used by './unittest/'.
+"""
+import commands
+import contextlib
+import os
+import sys
+
+# Absolute path of the directory that contains this script.
+thisdir = os.path.dirname(os.path.abspath(__file__))
+
+def log(msg):
+    """Write msg, followed by a newline, to stderr."""
+    sys.stderr.writelines((msg, '\n'))
+
+def shell(cmd):
+    """Log cmd, run it through the shell and return its captured output.
+
+    Raises Exception, carrying the status and the command text, when the
+    command reports a non-zero status.
+    """
+    log(cmd)
+    status, output = commands.getstatusoutput(cmd)
+    if not status:
+        return output
+    raise Exception('%d <- %r' %(status, cmd))
+
+def update_content(fn, content):
+    """Write content to fn unless the file already holds exactly that text.
+
+    The parent directory of fn is created when it is missing. A file whose
+    text already equals content is left alone and nothing is logged.
+    """
+    direc = os.path.abspath(os.path.dirname(fn))
+    if not os.path.isdir(direc):
+        # Create the directory directly rather than via 'mkdir -p' in a
+        # shell, so paths with spaces or shell metacharacters work.
+        log('mkdir -p %s' %direc)
+        os.makedirs(direc)
+    current_content = None
+    if os.path.exists(fn):
+        with open(fn) as stream:
+            current_content = stream.read()
+    if content != current_content:
+        log('writing to %r' %fn)
+        log('"""\n' + content + '"""')
+        # Close the handle explicitly instead of relying on garbage collection.
+        with open(fn, 'w') as stream:
+            stream.write(content)
+
+def compose_libconfig(pbbam=False):
+    """Return the text of libconfig.h.
+
+    The text defines USE_PBBAM when pbbam is true; otherwise it is a
+    single empty line.
+    """
+    return '\n#define USE_PBBAM\n' if pbbam else '\n'
+
+def compose_defines_with_hdf(HDF5_INC, HDF5_LIB):
+    """Return defines.mk text that pins HDF5_INC and HDF5_LIB to the given values.
+
+    We have to use := for HDF5_LIB b/c blasr
+    is using it to mean the directory, not the file,
+    and it's in the environment.
+    """
+    substitutions = dict(
+        thisdir=thisdir,
+        HDF5_INC=HDF5_INC,
+        HDF5_LIB=HDF5_LIB)
+    template = """
+HDF5_INC:=%(HDF5_INC)s
+HDF5_LIB:=%(HDF5_LIB)s
+#CPPFLAGS+= -I../pbdata -I../hdf -I../alignment
+LIBPBDATA_INC ?=../pbdata
+LIBPBIHDF_INC ?=../hdf
+LIBBLASR_INC ?=../alignment
+LIBPBDATA_LIB ?=../pbdata
+LIBPBIHDF_LIB ?=../hdf
+LIBBLASR_LIB ?=../alignment
+"""
+    return template % substitutions
+
+
+
+def compose_defines_with_hdf_headers(HDF_HEADERS):
+    """Return defines.mk text that adds the HDF_HEADERS source dirs to CPPFLAGS.
+
+    HDF_HEADERS is substituted into the HDF_HEADERS:= line; the include
+    flags refer to it through the make variable.
+    """
+    substitutions = dict(thisdir=thisdir, HDF_HEADERS=HDF_HEADERS)
+    template = """
+HDF_HEADERS:=%(HDF_HEADERS)s
+#HDF5_INC ?=${HDF_HEADERS}/src
+CPPFLAGS+= -I${HDF_HEADERS}/src -I${HDF_HEADERS}/c++/src
+CPPFLAGS+= -I../pbdata -I../hdf -I../alignment
+LIBPBDATA_LIB ?=../pbdata/
+LIBPBIHDF_LIB ?=../hdf/
+LIBBLASR_LIB ?=../alignment/
+"""
+    return template % substitutions
+
+def compose_defines():
+    """Return defines.mk text for a build without HDF5 (sets nohdf).
+
+    Note that our local 'hdf' subdir will not even build
+    in this case.
+    """
+    template = """
+LIBPBDATA_INC ?=../pbdata
+LIBPBIHDF_INC ?=../hdf
+LIBBLASR_INC ?=../alignment
+LIBPBDATA_LIB ?=%(thisdir)s/pbdata/
+LIBPBIHDF_LIB ?=%(thisdir)s/hdf/
+LIBBLASR_LIB ?=%(thisdir)s/alignment/
+nohdf ?=1
+"""
+    return template % dict(thisdir=thisdir)
+
+def get_OS_STRING():
+    """Return a distribution id string built from lsb_release output.
+
+    The shell snippet lower-cases the distributor id and appends the
+    release: zero-padded with the first '.' removed for ubuntu, the part
+    before the first '.' for centos, and the release as reported otherwise.
+    """
+    cmd = """bash -c 'set -e; set -o pipefail; id=$(lsb_release -si | tr "[:upper:]" "[:lower:]"); rel=$(lsb_release -sr); case $id in ubuntu) printf "$id-%04d\n" ${rel/./};; centos) echo "$id-${rel%%.*}";; *) echo "$id-$rel";; esac' 2>/dev/null"""
+    return shell(cmd)
+
+def get_PBBAM(env, prefix):
+ # NOTE: the whole body below is one string literal (disabled code), so
+ # this function does nothing and returns None.
+ # NOTE(review): the disabled text uses make-style $(...) with a
+ # keyword-argument '%' call, which is not valid Python as written.
+ """
+ key = 'PBBAM'
+ if key in env:
+ return env[key]
+ cmd = 'cd $(THIRD_PARTY_PREFIX)/../staging/PostPrimary/pbbam 2>/dev/null && pwd || echo -n notfound' %(
+ THIRD_PARTY_PREFIX=prefix)
+ return shell(cmd)
+ """
+def get_HTSLIB(env, prefix):
+ # NOTE: the whole body below is one string literal (disabled code), so
+ # this function does nothing and returns None.
+ # NOTE(review): the disabled text uses make-style $(...) with a
+ # keyword-argument '%' call, which is not valid Python as written.
+ """
+ key = 'HTSLIB'
+ if key in env:
+ return env[key]
+ cmd = 'cd $(THIRD_PARTY_PREFIX)/../staging/PostPrimary/htslib 2>/dev/null && pwd || echo -n notfound' %(
+ THIRD_PARTY_PREFIX=prefix)
+ return shell(cmd)
+ """
+def ifenvf(env, key, func):
+    """Return env[key] when key is present; otherwise return func().
+
+    func is only called when the key is missing.
+    """
+    return env[key] if key in env else func()
+def setifenvf(envout, envin, key, func):
+    """Set envout[key] to envin[key], or to func() when envin lacks key."""
+    if key in envin:
+        envout[key] = envin[key]
+    else:
+        envout[key] = func()
+def setifenv(envout, envin, key, val):
+    """Set envout[key] to envin[key], or to val when envin lacks key."""
+    envout[key] = envin[key] if key in envin else val
+def setenv(envout, key, val):
+    """Unconditionally set envout[key] to val."""
+    envout.update({key: val})
+def update_env_if(envout, envin, keys):
+    """Copy into envout every entry of envin whose key is listed in keys.
+
+    Keys that envin does not contain are skipped.
+    """
+    envout.update((key, envin[key]) for key in keys if key in envin)
+def compose_defs_env(env):
+    """Render env as make assignments, one per line, sorted by name.
+
+    CXX, CC and AR are emitted with ':=' after all other names, which are
+    emitted with '?='. Each line is followed by a newline.
+    """
+    # We disallow env overrides for anything with a default from GNU make.
+    fixed = ('CXX', 'CC', 'AR') # 'SHELL'?
+    overridable = []
+    pinned = []
+    for name, value in sorted(env.items()):
+        if name in fixed:
+            pinned.append('%-20s := %s' %(name, value))
+        else:
+            overridable.append('%-20s ?= %s' %(name, value))
+    return '\n'.join(overridable + pinned + [''])
+def append_common(envin, content):
+    """Wrap content with the definitions shared by every configuration.
+
+    The result is: ':=' lines for whichever of CXXFLAGS, SH_LIB_EXT,
+    EXTRA_LDFLAGS, PREFIX_LIB and PREFIX_INC are available, then content,
+    then a LIBCONFIG_H definition plus the PREFIX include/lib hooks.
+    PREFIX_INC and PREFIX_LIB are derived from envin['PREFIX'] when set.
+    Dumb way to do this, but this whole thing is evolving.
+    """
+    # This is the original libconfig.h. However, in case somebody (like
+    # pbdagcon) builds libpbdata in-place, we need to drop a copy of
+    # libconfig.h wherever pbdata is actually built, which we will not
+    # know until later. This can all be cleared up later, when we are
+    # more clear about where things are built.
+    libconfig_h = os.path.abspath(os.path.join(os.getcwd(), 'libconfig.h'))
+    trailer = """
+LIBCONFIG_H:=%s
+# Use PREFIX dir, if available.
+INCLUDES += ${PREFIX_INC}
+LIBS += ${PREFIX_LIB}
+"""%libconfig_h
+    env = dict(envin)
+    # Some extra defs.
+    if 'PREFIX' in envin:
+        prefix = envin['PREFIX']
+        env['PREFIX_INC'] = os.path.join(prefix, 'include')
+        env['PREFIX_LIB'] = os.path.join(prefix, 'lib')
+    wanted = (
+        'CXXFLAGS',
+        'SH_LIB_EXT',
+        'EXTRA_LDFLAGS',
+        'PREFIX_LIB', 'PREFIX_INC',
+    )
+    lines = ['']
+    for name, value in sorted(env.items()):
+        if name in wanted:
+            lines.append('%-20s := %s' %(name, value))
+    lines.append('')
+    return '\n'.join(lines) + content + trailer
+def compose_defines_pacbio(envin):
+    """Return defines.mk text built from envin plus built-in defaults.
+
+    This is used by mobs via buildcntl.sh.
+    """
+    env = dict()
+    env['SHELL'] = 'bash'
+    setifenvf(env, envin, 'OS_STRING', get_OS_STRING)
+    defaults = (
+        ('LIBPBDATA_INC', '../pbdata'),
+        ('LIBPBIHDF_INC', '../hdf'),
+        ('LIBBLASR_INC', '../alignment'),
+        ('LIBPBDATA_LIB', '../pbdata/'),
+        ('LIBPBIHDF_LIB', '../hdf/'),
+        ('LIBBLASR_LIB', '../alignment/'),
+    )
+    for key, fallback in defaults:
+        setifenv(env, envin, key, fallback)
+    if 'nohdf' in envin:
+        env['nohdf'] = envin['nohdf']
+    # Otherwise, do not define it at all. TODO(CD): Remove nohdf, as it is not used.
+    # These have no default here; they are copied only when envin has them.
+    nondefaults = set([
+        'CXX', 'AR',
+        'HDF5_INC', 'HDF5_LIB',
+        'PBBAM_INC', 'PBBAM_LIB',
+        'HTSLIB_INC', 'HTSLIB_LIB',
+        'BOOST_INC',
+        'ZLIB_LIB',
+        'GCC_LIB',
+        'GTEST_INC', 'GTEST_SRCDIR',
+    ])
+    update_env_if(env, envin, nondefaults)
+    return compose_defs_env(env)
+
+
+@contextlib.contextmanager
+def cd(nwd):
+    """Context manager that runs the with-body inside directory nwd.
+
+    The previous working directory is restored on exit, including when
+    the with-body raises. Both directions of the change are logged.
+    """
+    cwd = os.getcwd()
+    log('cd %r -> %r' %(cwd, nwd))
+    os.chdir(nwd)
+    try:
+        yield
+    finally:
+        # Without try/finally an exception in the body would leave the
+        # process in nwd.
+        os.chdir(cwd)
+        log('cd %r <- %r' %(cwd, nwd))
+
+def fetch_hdf5_headers():
+    """Make sure the HDF5 1.8.12 headers exist under <thisdir>/hdf.
+
+    When <thisdir>/hdf/hdf5-1.8.12-headers is not already a directory, the
+    tarball is downloaded and unpacked inside <thisdir>/hdf.
+    This should not be used when an external build-dir is needed.
+    Returns the headers directory path, built from the absolute thisdir.
+    """
+    version = 'hdf5-1.8.12-headers'
+    version_dn = os.path.join(thisdir, 'hdf', version)
+    if os.path.isdir(version_dn):
+        return version_dn
+    with cd(os.path.dirname(version_dn)):
+        # NOTE(review): 'curl -k' turns off TLS certificate verification for
+        # this download -- confirm that this is still wanted.
+        shell('curl -k -L https://www.dropbox.com/s/8971bcyy5o42rxb/hdf5-1.8.12-headers.tar.bz2\?dl\=0 | tar xjf -')
+    return version_dn # Relative path might help caching.
+
+def update(content_defines_mk, content_libconfig_h):
+    """Write libconfig.h and defines.mk into the current working directory.
+
+    When the current directory is also the directory of this script,
+    a defines.mk symlink pointing at ../defines.mk is created in each of
+    pbdata, hdf, alignment and unittest unless that name already exists.
+    """
+    update_content(os.path.join('.', 'libconfig.h'), content_libconfig_h)
+    update_content('defines.mk', content_defines_mk)
+    if thisdir != os.path.abspath('.'):
+        return
+    # This was run in the root directory, so symlink defines.mk
+    # in sub-dirs, which now include defines.mk from CURDIR
+    # in order to facilitate building in external output directories.
+    target = os.path.join('..', 'defines.mk')
+    for sub in ('pbdata', 'hdf', 'alignment', 'unittest'):
+        lname = os.path.join(sub, 'defines.mk')
+        if os.path.lexists(lname):
+            continue
+        os.symlink(target, lname)
+
+
+def configure_nopbbam(envin):
+    """Configure without pbbam, using the HDF5 locations given in envin.
+
+    HDF5_INC is used when set and non-empty, otherwise HDF5_INCLUDE;
+    HDF5_LIB is required.
+    This is the path used by blasr in a GitHub build, for now.
+    """
+    hdf5_inc = envin.get('HDF5_INC') or envin['HDF5_INCLUDE']
+    hdf5_lib = envin['HDF5_LIB']
+    defines_mk = append_common(envin, compose_defines_with_hdf(hdf5_inc, hdf5_lib))
+    libconfig_h = compose_libconfig(pbbam=False)
+    update(defines_mk, libconfig_h)
+
+
+def configure_nopbbam_skip_hdf(envin):
+    """Configure without pbbam after fetching the HDF5 headers.
+
+    We lack HDF5 libs, so we cannot build our hdf/ subdir.
+    But the others are fine.
+    """
+    headers_dir = fetch_hdf5_headers()
+    defines_mk = append_common(envin, compose_defines_with_hdf_headers(headers_dir))
+    libconfig_h = compose_libconfig(pbbam=False)
+    update(defines_mk, libconfig_h)
+
+
+def configure_nopbbam_nohdf5(envin):
+    """Configure without pbbam and without HDF5."""
+    defines_mk = append_common(envin, compose_defines())
+    libconfig_h = compose_libconfig(pbbam=False)
+    update(defines_mk, libconfig_h)
+
+
+def configure_pacbio(envin):
+    """Configure with pbbam enabled (libconfig.h defines USE_PBBAM)."""
+    defines_mk = append_common(envin, compose_defines_pacbio(envin))
+    libconfig_h = compose_libconfig(pbbam=True)
+    update(defines_mk, libconfig_h)
+
+
+def get_make_style_env(envin, args):
+    """Merge make-style KEY=VAL arguments with the mapping envin.
+
+    Arguments without '=' are ignored. Only the first '=' separates key
+    from value, so a value may itself contain '=' (CXXFLAGS=-DFOO=1).
+    Entries of envin take precedence over arguments with the same key.
+    Returns a new dict; envin is not modified.
+    """
+    envout = dict()
+    for arg in args:
+        if '=' in arg:
+            # Split once only: splitting on every '=' made the two-name
+            # unpacking raise ValueError for values containing '='.
+            k, v = arg.split('=', 1)
+            envout[k] = v
+    envout.update(envin)
+    return envout
+
+
+class OsType:
+    """Integer codes for the operating systems this script distinguishes."""
+    Unknown = 0
+    Linux = 1
+    Darwin = 2
+
+def getOsType():
+    """Return the OsType code matching the output of 'uname -s'.
+
+    'Darwin' is checked before 'Linux'; anything else is OsType.Unknown.
+    """
+    uname = shell('uname -s')
+    log('uname=%r' %uname)
+    for marker, ostype in (('Darwin', OsType.Darwin), ('Linux', OsType.Linux)):
+        if marker in uname:
+            return ostype
+    return OsType.Unknown
+
+def update_env_for_linux(env):
+    """Set the shared-library naming option and extension used on Linux."""
+    env.update({'SET_LIB_NAME': '-soname', 'SH_LIB_EXT': '.so'})
+def update_env_for_darwin(env):
+    """Set the shared-library naming option, extension and link flags used on Darwin."""
+    env.update({
+        'SET_LIB_NAME': '-install_name',
+        'SH_LIB_EXT': '.dylib',
+        # -flat_namespace makes BSD ld act like Linux ld, finding
+        # shared libs recursively.
+        'EXTRA_LDFLAGS': '-flat_namespace',
+    })
+def update_env_for_unknown(env):
+    """Set the same shared-library values as the Linux case for an unrecognized OS."""
+    env.update({'SET_LIB_NAME': '-soname', 'SH_LIB_EXT': '.so'})
+# Dispatch table: OsType code -> function that adds the OS-specific keys.
+update_env_for_os = dict((
+    (OsType.Linux, update_env_for_linux),
+    (OsType.Darwin, update_env_for_darwin),
+    (OsType.Unknown, update_env_for_unknown),
+))
+
+def main(prog, *args):
+ """Include shell environ plus KEY=VAL pairs in args.
+
+ Chooses one of the configure_* functions from the NOPBBAM, NOHDF and
+ HDF5_LIB keys of the merged environment.
+ NOTE(review): this listing has lost its indentation, so the pairing of
+ the if/else branches below should be confirmed against the real file.
+ """
+ ost = getOsType()
+ envin = get_make_style_env(os.environ, args)
+ # Applied after the merge, so the OS-specific keys overwrite any value
+ # supplied through the environment or the arguments.
+ update_env_for_os[ost](envin)
+ if 'NOPBBAM' in envin:
+ if 'NOHDF' in envin:
+ configure_nopbbam_nohdf5(envin)
+ else:
+ if 'HDF5_LIB' in envin:
+ # HDF5_INCLUDE is accepted as an alternative spelling of HDF5_INC.
+ if 'HDF5_INCLUDE' in envin:
+ if 'HDF5_INC' not in envin:
+ envin['HDF5_INC'] = envin['HDF5_INCLUDE']
+ else:
+ print("WARNING: Found both HDF5_INC and HDF5_INCLUDE in environ!")
+ assert 'HDF5_INC' in envin, 'Hey! You have HDF5_LIB but not HDF5_INC!'
+ configure_nopbbam(envin)
+ else:
+ configure_nopbbam_skip_hdf(envin)
+ else:
+ configure_pacbio(envin)
+
+
+if __name__ == "__main__":
+    # sys.argv[0] binds to main()'s 'prog'; the remaining items become args.
+    main(*sys.argv)
diff --git a/hdf/BufferedHDF2DArrayImpl.hpp b/hdf/BufferedHDF2DArrayImpl.hpp
index 49336ef..bddd762 100644
--- a/hdf/BufferedHDF2DArrayImpl.hpp
+++ b/hdf/BufferedHDF2DArrayImpl.hpp
@@ -3,6 +3,7 @@
#include <cstring>
#include <cassert>
+#include "utils.hpp"
template<typename T>
BufferedHDF2DArray<T>::BufferedHDF2DArray(H5::CommonFG *_container,
@@ -104,7 +105,7 @@ int BufferedHDF2DArray<T>::Initialize(HDFGroup &group, std::string datasetName,
if (dimSize) {
delete [] dimSize;
}
- dimSize = new hsize_t[nDims];
+ dimSize = ProtectedNew<hsize_t>(nDims);
dataspace.getSimpleExtentDims(dimSize);
rowLength = dimSize[0];
colLength = dimSize[1];
@@ -125,8 +126,6 @@ int BufferedHDF2DArray<T>::Initialize(HDFGroup &group, std::string datasetName,
template<typename T>
int BufferedHDF2DArray<T>::size() {
- // Why assert nDims == 1 for 2D Array?
- assert(nDims == 1);
dataspace.getSimpleExtentDims(dimSize);
return dimSize[0];
}
@@ -185,7 +184,7 @@ void BufferedHDF2DArray<T>::Create(H5::CommonFG *_container, string _datasetName
assert(this->writeBuffer != NULL);
delete[] this->writeBuffer;
}
- this->writeBuffer = new T[rowLength];
+ this->writeBuffer = ProtectedNew<T>(rowLength);
this->bufferSize = rowLength;
}
@@ -282,7 +281,6 @@ void BufferedHDF2DArray<T>::Flush(int destRow) {
//
// A default writeRow of -1 implies append
//
- int numRowsToCreate; // FIXME(yli): why is numRowsToCreate assigned but not used?
int numDataRows;
//
// this->bufferIndex points after the end of the last data in the
@@ -291,12 +289,6 @@ void BufferedHDF2DArray<T>::Flush(int destRow) {
//
numDataRows = this->bufferIndex / rowLength;
- if (destRow < 0) {
- numRowsToCreate = this->bufferIndex / rowLength;
- }
- else {
- numRowsToCreate = this->bufferIndex / rowLength + destRow;
- }
if (numDataRows > 0) {
assert(fileDataSpaceInitialized);
@@ -340,7 +332,6 @@ void BufferedHDF2DArray<T>::Flush(int destRow) {
// Store the newly dimensioned dataspaces.
//
fileSpace.getSimpleExtentDims(fileArraySize, fileArrayMaxSize);
- //int extendedSize = extendedSpace.getSimpleExtentNpoints(); // FIXME(yli): should this be used??
//
// Configure the proper addressing to append to the array.
//
diff --git a/hdf/BufferedHDFArrayImpl.hpp b/hdf/BufferedHDFArrayImpl.hpp
index 9f890ca..41b4486 100644
--- a/hdf/BufferedHDFArrayImpl.hpp
+++ b/hdf/BufferedHDFArrayImpl.hpp
@@ -4,6 +4,7 @@
#include <cstdlib>
#include <iostream>
#include <cstring>
+#include "utils.hpp"
#include "BufferedHDFArray.hpp"
template<typename T>
@@ -311,7 +312,7 @@ int BufferedHDFArray<T>::UpdateH5Dataspace() {
delete [] dimSize;
dimSize = NULL;
}
- dimSize = new hsize_t[nDims];
+ dimSize = ProtectedNew<hsize_t>(nDims);
dataspace.getSimpleExtentDims(dimSize);
arrayLength = dimSize[0];
diff --git a/hdf/HDFAtom.cpp b/hdf/HDFAtom.cpp
index ef23749..a70804b 100644
--- a/hdf/HDFAtom.cpp
+++ b/hdf/HDFAtom.cpp
@@ -1,17 +1,16 @@
#include "HDFAtom.hpp"
template<>
-void HDFAtom<std::string>::Create(H5::H5Location &object, std::string atomName) {
+void HDFAtom<std::string>::Create(H5::H5Location &object, const std::string & atomName) {
H5::StrType strType(0, H5T_VARIABLE);
hsize_t defaultDims[] = {1};
H5::DataSpace defaultDataSpace(1, defaultDims);
attribute = object.createAttribute(atomName.c_str(), strType, H5::DataSpace(H5S_SCALAR));
- initialized= true;
}
#define DEFINE_TYPED_CREATE_ATOM(T, predtype) template<> \
- void HDFAtom<T>::TypedCreate(H5::H5Location &object, std::string &atomName, H5::DataSpace &defaultDataSpace) { \
+ void HDFAtom<T>::TypedCreate(H5::H5Location &object, const std::string &atomName, H5::DataSpace &defaultDataSpace) { \
attribute = object.createAttribute(atomName.c_str(), (predtype), defaultDataSpace ); \
}
@@ -84,7 +83,6 @@ void HDFAtom<std::string>::Read(std::string &value) {
else {
hsize_t stsize = attribute.getStorageSize();
value.resize(stsize);
- // char *valueStr = new char[stsize+1];
attribute.read(stringType, &value[0]);
if (stsize > 0 and value[stsize-1] == '\0') {
value.resize(stsize-1);
@@ -131,7 +129,6 @@ void HDFAtom<std::vector<std::string> >::Read(std::vector<std::string> &values)
hsize_t nPoints;
nPoints = attributeSpace.getSelectNpoints();
H5::DataType attrType = attribute.getDataType(); // necessary for attr.read()
-
// Declare and initialize std::vector of pointers to std::string attribute list.
std::vector<char*> ptrsToHDFControlledMemory;
ptrsToHDFControlledMemory.resize(nPoints);
@@ -141,7 +138,7 @@ void HDFAtom<std::vector<std::string> >::Read(std::vector<std::string> &values)
unsigned int i;
for (i = 0; i < ptrsToHDFControlledMemory.size(); i++ ){
values.push_back(ptrsToHDFControlledMemory[i]);
- free(ptrsToHDFControlledMemory[i]);
+ free(ptrsToHDFControlledMemory[i]);
}
}
diff --git a/hdf/HDFAtom.hpp b/hdf/HDFAtom.hpp
index f8934df..5b921a9 100644
--- a/hdf/HDFAtom.hpp
+++ b/hdf/HDFAtom.hpp
@@ -16,12 +16,11 @@ class HDFAtom : public HDFData {
public:
H5::Attribute attribute;
- bool initialized;
HDFAtom() {
- initialized = false;
+ isInitialized = false;
}
~HDFAtom() {
- if (initialized) {
+ if (IsInitialized()) {
attribute.close();
}
}
@@ -30,21 +29,21 @@ public:
return NULL;
}
- int Initialize(H5::H5Location &object, std::string attributeName, bool createIfMissing=false) {
+ int Initialize(H5::H5Location &object, const std::string & attributeName, bool createIfMissing=false) {
attribute = object.openAttribute(attributeName.c_str());
- initialized = true;
+ isInitialized = true;
return 1;
}
- int Initialize(HDFGroup &group, std::string attributeName, bool createIfMissing=false) {
+ int Initialize(HDFGroup &group, const std::string & attributeName, bool createIfMissing=false) {
return Initialize(group.group, attributeName);
}
- int Initialize(HDFData &data, std::string attributeName, bool createIfMissing=false) {
+ int Initialize(HDFData &data, const std::string & attributeName, bool createIfMissing=false) {
return Initialize(data.dataset, attributeName);
}
- int Initialize(H5::Group &object, std::string attributeName, bool createIfMissing=false) {
+ int Initialize(H5::Group &object, const std::string & attributeName, bool createIfMissing=false) {
try {
attribute = object.openAttribute(attributeName.c_str());
}
@@ -52,15 +51,15 @@ public:
cout << "ERROR. Could not open attribute " << attributeName << endl;
exit(1);
}
- initialized = true;
+ isInitialized = true;
return 1;
}
- int Initialize(H5::H5File &hdfFile, std::string groupName, std::string attributeName) {
+ int Initialize(H5::H5File &hdfFile, const std::string & groupName, const std::string & attributeName) {
HDFGroup group;
group.Initialize(hdfFile, groupName);
attribute = group.group.openAttribute(attributeName.c_str());
- initialized = true;
+ isInitialized = true;
return 1;
}
@@ -68,28 +67,28 @@ public:
// This handles creation of all non-std::string types. A specialization
// for std::strings is provided below.
//
- void Create(H5::H5Location &object, std::string atomName) {
+ void Create(H5::H5Location &object, const std::string & atomName) {
hsize_t defaultDims[] = {1};
H5::DataSpace defaultDataSpace(1, defaultDims);
TypedCreate(object, atomName, defaultDataSpace);
}
- void Create(H5::H5Location &object, std::string name, std::string value) {
+ void Create(H5::H5Location &object, const std::string & name, const std::string & value) {
H5::StrType strType(0, value.size());
attribute = object.createAttribute(name.c_str(), strType, H5::DataSpace(0,NULL));
- initialized = true;
+ isInitialized = true;
attribute.write(strType, value.c_str());
}
- void Create(H5::H5Location &object, std::string name, std::vector<int> &vect) {
+ void Create(H5::H5Location &object, const std::string & name, std::vector<int> &vect) {
hsize_t length = vect.size();
H5::ArrayType arrayDataType(H5::PredType::NATIVE_INT, 1, &length);
attribute = object.createAttribute(name.c_str(), H5::PredType::NATIVE_INT, H5::DataSpace(1, &length));
attribute.write(H5::PredType::NATIVE_INT, &((vect)[0]));
}
- void Create(H5::H5Location &object, std::string name, std::vector<std::string> &vect) {
+ void Create(H5::H5Location &object, const std::string & name, const std::vector<std::string> &vect) {
hsize_t length = vect.size();
H5::StrType strType(0,H5T_VARIABLE);
H5::ArrayType arrayDataType(strType, 1, &length);
@@ -97,7 +96,7 @@ public:
attribute.write(strType, &((vect)[0]));
}
- void TypedCreate(H5::H5Location &object, std::string &atomName, H5::DataSpace &dataSpace) {
+ void TypedCreate(H5::H5Location &object, const std::string &atomName, H5::DataSpace &dataSpace) {
assert("Calling HDFAtom<T>::typedCreate on an unsupported type" == 0);
}
@@ -118,10 +117,10 @@ public:
//
template<>
-void HDFAtom<std::string>::Create(H5::H5Location &object, std::string atomName);
+void HDFAtom<std::string>::Create(H5::H5Location &object, const std::string & atomName);
#define DECLARE_TYPED_CREATE_ATOM(T, predtype) template<> \
- void HDFAtom<T>::TypedCreate(H5::H5Location &object, std::string &atomName, H5::DataSpace &defaultDataSpace);
+ void HDFAtom<T>::TypedCreate(H5::H5Location &object, const std::string & atomName, H5::DataSpace &defaultDataSpace);
DECLARE_TYPED_CREATE_ATOM(int, H5::PredType::NATIVE_INT)
DECLARE_TYPED_CREATE_ATOM(unsigned int, H5::PredType::NATIVE_UINT)
diff --git a/hdf/HDFAttributable.cpp b/hdf/HDFAttributable.cpp
index da57d08..b092cd8 100644
--- a/hdf/HDFAttributable.cpp
+++ b/hdf/HDFAttributable.cpp
@@ -5,11 +5,11 @@ using namespace std;
using namespace H5;
void CallStoreAttributeName(H5Location &obj, string attrName, void *attrList){
- ((vector<string>*)attrList)->push_back(attrName);
+ ((vector<string>*)attrList)->push_back(string(attrName));
}
void HDFAttributable::StoreAttributeNames(H5Location &thisobject,
- std::vector<std::string> &attributeNames) {
+ const std::vector<std::string> &attributeNames) {
int nAttr = thisobject.getNumAttrs();
unsigned int bounds[2];
bounds[0] = 0;
@@ -23,7 +23,7 @@ H5Location* HDFAttributable::GetObject() {
return NULL;
}
-int HDFAttributable::ContainsAttribute(string attributeName) {
+int HDFAttributable::ContainsAttribute(const string & attributeName) {
size_t i;
std::vector<std::string> tmpAttributeNames;
try{
diff --git a/hdf/HDFAttributable.hpp b/hdf/HDFAttributable.hpp
index 4b223c7..ba97acc 100644
--- a/hdf/HDFAttributable.hpp
+++ b/hdf/HDFAttributable.hpp
@@ -12,11 +12,11 @@ public:
std::vector<std::string> attributeNameList;
void StoreAttributeNames(H5::H5Location &thisobject,
- std::vector<std::string> &attributeNames);
+ const std::vector<std::string> &attributeNames);
virtual H5::H5Location* GetObject();
- int ContainsAttribute(std::string attributeName);
+ int ContainsAttribute(const std::string & attributeName);
};
diff --git a/hdf/HDFBasReader.hpp b/hdf/HDFBasReader.hpp
index 23fe235..ef596bd 100644
--- a/hdf/HDFBasReader.hpp
+++ b/hdf/HDFBasReader.hpp
@@ -242,7 +242,7 @@ public:
}
void GetChangeListID(std::string &changeListID) {
- if (changeListIDAtom.initialized) {
+ if (changeListIDAtom.IsInitialized()) {
changeListIDAtom.Read(changeListID);
}
else {
@@ -388,65 +388,109 @@ public:
}
}
+
int InitializeSequenceFields(HDFGroup &baseCallsGroup) {
- //
// The only field that is absoultely required is Basecall
if (baseArray.InitializeForReading(baseCallsGroup, "Basecall") == false) return 0;
- if (includedFields["QualityValue"] and qualArray.InitializeForReading(baseCallsGroup, "QualityValue") == false) return 0;
- if (includedFields["InsertionQV"] and insertionQVArray.InitializeForReading(baseCallsGroup, "InsertionQV") == false) return 0;
- if (includedFields["DeletionQV"] and deletionQVArray.InitializeForReading(baseCallsGroup, "DeletionQV") == false) return 0;
- if (includedFields["DeletionTag"] and deletionTagArray.InitializeForReading(baseCallsGroup, "DeletionTag") == false) return 0;
- if (includedFields["SubstitutionQV"] and substitutionQVArray.InitializeForReading(baseCallsGroup, "SubstitutionQV") == false) return 0;
- if (includedFields["SubstitutionTag"] and substitutionTagArray.InitializeForReading(baseCallsGroup, "SubstitutionTag") == false) return 0;
- // if (includedFields["PreBaseFrames"] and preBaseFramesArray.InitializeForReading(baseCallsGroup, "PreBaseFrames") == false) return 0;
-
- if (baseCallsGroup.ContainsObject("PreBaseFrames")) {
- if (preBaseFramesArray.InitializeForReading(baseCallsGroup, "PreBaseFrames") == false) return 0;
- } else {
- includedFields["PreBaseFrames"] = false;
- }
//
// These fields are not always present in bas.h5 files.
//
- if (baseCallsGroup.ContainsObject("PulseIndex")) {
- if (pulseIndexArray.InitializeForReading(baseCallsGroup, "PulseIndex") == false) return 0;
- }
- else {
- includedFields["PulseIndex"] = false;
- }
+ //
+ std::string fieldName = "QualityValue";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not qualArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
- if (baseCallsGroup.ContainsObject("WidthInFrames")) {
- if (basWidthInFramesArray.InitializeForReading(baseCallsGroup, "WidthInFrames") == false) return 0;
- }
- else {
- includedFields["WidthInFrames"] = false;
- }
+ fieldName = "InsertionQV";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not insertionQVArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
- if (baseCallsGroup.ContainsObject("MergeQV")) {
- if (includedFields["MergeQV"] and mergeQVArray.InitializeForReading(baseCallsGroup, "MergeQV") == false) return false;
- }
- else {
- includedFields["MergeQV"] = false;
- }
+ fieldName = "DeletionQV";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not deletionQVArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
- if ((includedFields["HQRegionSNR"] or includedFields["ReadScore"]) and
- (baseCallsGroup.ContainsObject(zmwMetricsGroupName) == 0 or
- zmwMetricsGroup.Initialize(baseCallsGroup.group, zmwMetricsGroupName) == 0)) {
- includedFields["HQRegionSNR"] = false;
- includedFields["ReadScore"] = false;
- }
- if (includedFields["HQRegionSNR"] and (zmwMetricsGroup.ContainsObject("HQRegionSNR") == 0 or
- GetDatasetNDim(zmwMetricsGroup.group, "HQRegionSNR") != 2 or
- hqRegionSNRMatrix.InitializeForReading(zmwMetricsGroup, "HQRegionSNR") == false or
- hqRegionSNRMatrix.GetNCols() != 4)) {
+ fieldName = "DeletionTag";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not deletionTagArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
+
+ fieldName = "SubstitutionQV";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not substitutionQVArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
+
+ fieldName = "SubstitutionTag";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not substitutionTagArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
+
+ fieldName = "PreBaseFrames";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not preBaseFramesArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
+
+ fieldName = "PulseIndex";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not pulseIndexArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
+
+ fieldName = "WidthInFrames";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not basWidthInFramesArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
+
+ fieldName = "MergeQV";
+ if (baseCallsGroup.ContainsObject(fieldName)) {
+ if (includedFields[fieldName] and
+ not mergeQVArray.InitializeForReading(baseCallsGroup, fieldName))
+ return 0;
+ } else includedFields[fieldName] = false;
+
+ if (not baseCallsGroup.ContainsObject(zmwMetricsGroupName) or
+ not zmwMetricsGroup.Initialize(baseCallsGroup.group, zmwMetricsGroupName)) {
includedFields["HQRegionSNR"] = false;
- }
- if (includedFields["ReadScore"] and (zmwMetricsGroup.ContainsObject("ReadScore") == 0 or
- readScoreArray.InitializeForReading(zmwMetricsGroup, "ReadScore")) == false) {
includedFields["ReadScore"] = false;
- }
+ } else {
+ if (includedFields["HQRegionSNR"]) {
+ if (not zmwMetricsGroup.ContainsObject("HQRegionSNR") or
+ not hqRegionSNRMatrix.InitializeForReading(zmwMetricsGroup, "HQRegionSNR") or
+ GetDatasetNDim(zmwMetricsGroup.group, "HQRegionSNR") != 2 or
+ hqRegionSNRMatrix.GetNCols() != 4) {
+ includedFields["HQRegionSNR"] = false;
+ } else if (not useScanData) {
+ includedFields["HQRegionSNR"] = false;
+ std::cerr << "WARNING: could not read HQRegionSNR because ScanData is absent!" << std::endl;
+ }
+ }
+
+ if (includedFields["ReadScore"] and
+ (not zmwMetricsGroup.ContainsObject("ReadScore") or
+ not readScoreArray.InitializeForReading(zmwMetricsGroup, "ReadScore"))) {
+ includedFields["ReadScore"] = false;
+ }
+ }
return 1;
}
@@ -488,8 +532,6 @@ public:
//
nReads = zmwReader.numEventArray.arrayLength;
-
-
if (scanDataReader.platformId == Astro) {
if (InitializeAstro() == 0) {
return 0;
@@ -563,7 +605,6 @@ public:
exit(1);
}
curBasePos += seqLength;
- seq.StorePlatformId(scanDataReader.platformId);
return 1;
}
@@ -581,25 +622,24 @@ public:
seq.AllocateQualitySpace(seqLength);
qualArray.Read((int)curBasePos, (int) curBasePos + seqLength, (unsigned char*) seq.qual.data);
}
- }
-
- if (includedFields["DeletionQV"]) {
- GetNextDeletionQV(seq);
- }
- if (includedFields["DeletionTag"]) {
- GetNextDeletionTag(seq);
- }
- if (includedFields["InsertionQV"]) {
- GetNextInsertionQV(seq);
- }
- if (includedFields["SubstitutionQV"]) {
- GetNextSubstitutionQV(seq);
- }
- if (includedFields["SubstitutionTag"]) {
- GetNextSubstitutionTag(seq);
- }
- if (includedFields["MergeQV"]) {
- GetNextMergeQV(seq);
+ if (includedFields["DeletionQV"]) {
+ GetNextDeletionQV(seq);
+ }
+ if (includedFields["DeletionTag"]) {
+ GetNextDeletionTag(seq);
+ }
+ if (includedFields["InsertionQV"]) {
+ GetNextInsertionQV(seq);
+ }
+ if (includedFields["SubstitutionQV"]) {
+ GetNextSubstitutionQV(seq);
+ }
+ if (includedFields["SubstitutionTag"]) {
+ GetNextSubstitutionTag(seq);
+ }
+ if (includedFields["MergeQV"]) {
+ GetNextMergeQV(seq);
+ }
}
seq.SetQVScale(qvScale);
curBasePos += seqLength;
@@ -629,7 +669,6 @@ public:
if (includedFields["ReadScore"]) {
GetNextReadScore(seq);
}
-
int seqLength = GetNextWithoutPosAdvance(seq);
seq.length = seqLength;
if(readQVs) {
@@ -646,11 +685,8 @@ public:
seq.SetQVScale(qvScale);
curBasePos += seqLength;
- seq.subreadStart = 0;
- seq.subreadEnd = seq.length;
+ seq.SubreadStart(0).SubreadEnd(seq.length);
zmwReader.GetNext(seq.zmwData);
- seq.xy[0] = seq.zmwData.x;
- seq.xy[1] = seq.zmwData.y;
} catch (H5::DataSetIException e) {
cout << "ERROR, could not read bases or QVs for SMRTSequence "
<< seq.GetName() << endl;
@@ -671,6 +707,7 @@ public:
int retVal;
DNALength curBasPosCopy = curBasePos;
+
//
// Getting next advances the curBasPos to the end of
// the current sequence.
@@ -681,21 +718,21 @@ public:
return 0;
}
- // get ZMWMetrics fields, must be done before GetNext
- // (which calls GetNextWithoutAdvancePos, which increments curRead)
+ //
+ // Bail now if the file is already done
+ //
+ if ((retVal = this->GetNext((FASTQSequence&)seq)) == 0) {
+ return 0;
+ }
+ // GetNext calls GetNextWithoutPosAdvance, which increments curRead
+ curRead--;
if (includedFields["HQRegionSNR"]) {
GetNextHQRegionSNR(seq);
}
if (includedFields["ReadScore"]) {
GetNextReadScore(seq);
}
-
- //
- // Bail now if the file is already done
- //
- if ((retVal = this->GetNext((FASTQSequence&)seq)) == 0) {
- return 0;
- }
+ curRead++;
DNALength nextBasePos = curBasePos;
curBasePos = curBasPosCopy;
@@ -716,11 +753,8 @@ public:
// By default, the subread of a read without subread information is
// the whole read.
//
- seq.subreadStart = 0;
- seq.subreadEnd = seq.length;
+ seq.SubreadStart(0).SubreadEnd(seq.length);
zmwReader.GetNext(seq.zmwData);
- seq.xy[0] = seq.zmwData.x;
- seq.xy[1] = seq.zmwData.y;
} catch(H5::DataSetIException e) {
cout << "ERROR, could not read pulse metrics for SMRTSequence "
<< seq.GetName() << endl;
@@ -728,16 +762,7 @@ public:
}
return retVal;
}
- /*
- int16_t xy[2];
- if (zmwReader.readHoleXY) {
- zmwReader.xyArray.Read(curRead, curRead+1, 0, 2, xy);
- }
- else {
- xy[0] = xy[1] = 0;
- }
- seq.StoreXY(xy);
- */
+
void GetAllPulseIndex(std::vector<int> &pulseIndex) {
CheckMemoryAllocation(pulseIndexArray.arrayLength, maxAllocNElements, "PulseIndex");
pulseIndex.resize(pulseIndexArray.arrayLength);
@@ -799,12 +824,10 @@ public:
std::string readTitle;
unsigned int holeNumber;
- unsigned char holeStatus;
zmwReader.holeNumberArray.Read(curRead, curRead+1, &holeNumber);
- seq.StoreHoleNumber(holeNumber);
+ unsigned char holeStatus;
zmwReader.holeStatusArray.Read(curRead, curRead+1, &holeStatus);
- seq.StoreHoleStatus(holeStatus);
DNALength simIndex=0, simCoordinate=0;
@@ -856,7 +879,7 @@ public:
delete [] seq.widthInFrames;
seq.widthInFrames = NULL;
}
- seq.widthInFrames = new HalfWord[seq.length];
+ seq.widthInFrames = ProtectedNew<HalfWord>(seq.length);
basWidthInFramesArray.Read((int)curBasePos, (int) curBasePos + seq.length, (HalfWord*) seq.widthInFrames);
return seq.length;
}
@@ -867,7 +890,7 @@ public:
delete [] seq.preBaseFrames;
seq.preBaseFrames = NULL;
}
- seq.preBaseFrames = new HalfWord[seq.length];
+ seq.preBaseFrames = ProtectedNew<HalfWord>(seq.length);
preBaseFramesArray.Read((int)curBasePos, (int) curBasePos + seq.length, (HalfWord*) seq.preBaseFrames);
return seq.length;
}
@@ -877,12 +900,21 @@ public:
delete [] seq.pulseIndex;
seq.pulseIndex = NULL;
}
- seq.pulseIndex = new int[seq.length];
+ seq.pulseIndex = ProtectedNew<int>(seq.length);
pulseIndexArray.Read((int)curBasePos, (int) curBasePos + seq.length, (int*) seq.pulseIndex);
return seq.length;
}
int GetNextHQRegionSNR(SMRTSequence &seq) {
- hqRegionSNRMatrix.Read(curRead, curRead + 1, seq.hqRegionSnr);
+ float snrs[4];
+ hqRegionSNRMatrix.Read(curRead, curRead + 1, snrs);
+
+ // Get BaseMap from ScanData.
+ std::map<char, size_t> baseMap = scanDataReader.BaseMap();
+ assert(ScanData::IsValidBaseMap(baseMap));
+ seq.HQRegionSnr('A', snrs[baseMap['A']])
+ .HQRegionSnr('C', snrs[baseMap['C']])
+ .HQRegionSnr('G', snrs[baseMap['G']])
+ .HQRegionSnr('T', snrs[baseMap['T']]);
return 4;
}
int GetNextReadScore(SMRTSequence &seq) {
diff --git a/hdf/HDFBaseCallsWriter.cpp b/hdf/HDFBaseCallsWriter.cpp
new file mode 100644
index 0000000..5fe5938
--- /dev/null
+++ b/hdf/HDFBaseCallsWriter.cpp
@@ -0,0 +1,326 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "HDFBaseCallsWriter.hpp"
+
+/// \brief Creates the BaseCalls group below \p parentGroup and sets up the
+///        datasets and sub-writers used to write basecalls.
+/// \param[in] filename         forwarded to HDFWriterBase and to the ZMW writers.
+/// \param[in] parentGroup      group that receives the BaseCalls child group.
+/// \param[in] baseMap          forwarded to the ZMW metrics writer.
+/// \param[in] qvsToWrite       requested QV names; filtered by SanityCheckQVs().
+/// \param[in] fakeQualityValue whether the QualityValue dataset is written.
+///
+/// Failures are reported through the error list (see Errors()); the
+/// constructor itself does not bail out early.
+HDFBaseCallsWriter::HDFBaseCallsWriter(const std::string & filename,
+                                       HDFGroup & parentGroup,
+                                       const std::map<char, size_t> & baseMap,
+                                       const std::vector<std::string> & qvsToWrite,
+                                       const bool fakeQualityValue)
+    : HDFWriterBase(filename)
+    // Members are listed in the order they are declared in the class, which
+    // is the order in which they are actually initialized.
+    , fakeQualityValue_(fakeQualityValue)
+    , parentGroup_(parentGroup)
+    , baseMap_(baseMap)
+    , qvsToWrite_() // Input qvsToWrite must be checked; filled by SanityCheckQVs().
+    , zmwWriter_(nullptr)
+    , zmwMetricsWriter_(nullptr)
+{
+    // Add BaseCalls as a child group to the parent group.
+    AddChildGroup(parentGroup_, basecallsGroup_, PacBio::GroupNames::basecalls);
+
+    // Initialize the 'basecall' group.
+    basecallArray_.Initialize(basecallsGroup_, PacBio::GroupNames::basecall);
+
+    // Sanity check QVs to write. QV groups are only initialized when every
+    // requested QV passed the check.
+    if (SanityCheckQVs(qvsToWrite)) {
+        // Initialize QV groups
+        if (not InitializeQVGroups()) {
+            AddErrorMessage("Failed to initialize QV Groups.");
+        }
+    }
+
+    // Create a zmwWriter.
+    zmwWriter_.reset(new HDFZMWWriter(Filename(), basecallsGroup_, true));
+
+    // Create a zmwMetricsWriter.
+    zmwMetricsWriter_.reset(new HDFZMWMetricsWriter(Filename(), basecallsGroup_, baseMap_));
+}
+
+/// \returns the error messages recorded by this writer, followed by those
+///          of the ZMW writer and then those of the ZMW metrics writer.
+std::vector<std::string> HDFBaseCallsWriter::Errors(void) const {
+    std::vector<std::string> allErrors(this->errors_);
+
+    const std::vector<std::string> fromZmw = zmwWriter_->Errors();
+    allErrors.insert(allErrors.end(), fromZmw.begin(), fromZmw.end());
+
+    const std::vector<std::string> fromZmwMetrics = zmwMetricsWriter_->Errors();
+    allErrors.insert(allErrors.end(), fromZmwMetrics.begin(), fromZmwMetrics.end());
+
+    return allErrors;
+}
+
+/// Closes the writer (see Close()) when the object is destroyed.
+HDFBaseCallsWriter::~HDFBaseCallsWriter(void)
+{
+    Close();
+}
+
+/// \returns the QV names kept by the last call to SanityCheckQVs().
+const std::vector<std::string> & HDFBaseCallsWriter::QVNamesToWrite(void) const
+{
+    return this->qvsToWrite_;
+}
+
+/// \returns the list PacBio::GroupNames::BaxQVNames, against which
+///          requested QV names are checked.
+const std::vector<std::string> & HDFBaseCallsWriter::ValidQVNames(void) const
+{
+    const std::vector<std::string> & names = PacBio::GroupNames::BaxQVNames;
+    return names;
+}
+
+/// Initializes the QualityValue dataset (when FakeQualityValue() is set) and
+/// one dataset per QV name accepted by SanityCheckQVs().
+/// Every applicable dataset is attempted even after an earlier one failed.
+/// \returns true only if none of the attempted Initialize() calls returned 0.
+bool HDFBaseCallsWriter::InitializeQVGroups(void) {
+    bool allInitialized = true;
+
+    // special dataset
+    if (FakeQualityValue()) {
+        allInitialized = (qualityValueArray_.Initialize(basecallsGroup_, PacBio::GroupNames::qualityvalue) != 0) and allInitialized;
+    }
+
+    // normal datasets
+    if (_HasQV(PacBio::GroupNames::deletionqv)) {
+        allInitialized = (deletionQVArray_.Initialize(basecallsGroup_, PacBio::GroupNames::deletionqv) != 0) and allInitialized;
+    }
+    if (_HasQV(PacBio::GroupNames::deletiontag)) {
+        allInitialized = (deletionTagArray_.Initialize(basecallsGroup_, PacBio::GroupNames::deletiontag) != 0) and allInitialized;
+    }
+    if (_HasQV(PacBio::GroupNames::insertionqv)) {
+        allInitialized = (insertionQVArray_.Initialize(basecallsGroup_, PacBio::GroupNames::insertionqv) != 0) and allInitialized;
+    }
+    if (_HasQV(PacBio::GroupNames::mergeqv)) {
+        allInitialized = (mergeQVArray_.Initialize(basecallsGroup_, PacBio::GroupNames::mergeqv) != 0) and allInitialized;
+    }
+    if (_HasQV(PacBio::GroupNames::substitutionqv)) {
+        allInitialized = (substitutionQVArray_.Initialize(basecallsGroup_, PacBio::GroupNames::substitutionqv) != 0) and allInitialized;
+    }
+    if (_HasQV(PacBio::GroupNames::substitutiontag)) {
+        allInitialized = (substitutionTagArray_.Initialize(basecallsGroup_, PacBio::GroupNames::substitutiontag) != 0) and allInitialized;
+    }
+    if (_HasQV(PacBio::GroupNames::prebaseframes)) {
+        allInitialized = (preBaseFramesArray_.Initialize(basecallsGroup_, PacBio::GroupNames::prebaseframes) != 0) and allInitialized;
+    }
+    if (_HasQV(PacBio::GroupNames::widthinframes)) {
+        allInitialized = (widthInFramesArray_.Initialize(basecallsGroup_, PacBio::GroupNames::widthinframes) != 0) and allInitialized;
+    }
+    return allInitialized;
+}
+
+/// Rebuilds qvsToWrite_ from \p qvsToWrite, keeping each name that appears
+/// in ValidQVNames() exactly once (first occurrence wins) and recording an
+/// error message for every name that does not.
+/// \returns true if every requested name is in ValidQVNames(), else false.
+bool HDFBaseCallsWriter::SanityCheckQVs(const std::vector<std::string> & qvsToWrite) {
+    bool everyQVSupported = true;
+    qvsToWrite_.clear();
+
+    const std::vector<std::string> & supported = ValidQVNames();
+    for (const std::string & name : qvsToWrite) {
+        const bool inSpec =
+            std::find(supported.begin(), supported.end(), name) != supported.end();
+        if (not inSpec) {
+            everyQVSupported = false;
+            AddErrorMessage(std::string("Unsupported quality value ") + name);
+            continue;
+        }
+
+        // Drop repeated names; only the first occurrence is kept.
+        const bool alreadyKept =
+            std::find(qvsToWrite_.begin(), qvsToWrite_.end(), name) != qvsToWrite_.end();
+        if (not alreadyKept) {
+            qvsToWrite_.push_back(name);
+        }
+    }
+    return everyQVSupported;
+}
+
+/// Stores \p basecallerVersion as the changelist-id attribute of the
+/// BaseCalls group.
+/// \returns always true.
+bool HDFBaseCallsWriter::WriteBaseCallerVersion(const std::string & basecallerVersion)
+{
+    changeListIDAtom_.Create(
+        basecallsGroup_.group,
+        PacBio::AttributeNames::Common::changelistid,
+        basecallerVersion);
+
+    return true;
+}
+
+/// Writes one ZMW read: ZMW data, ZMW metrics, basecalls, the QualityValue
+/// dataset (only when FakeQualityValue() is set) and then each QV dataset.
+/// Writing stops at the first step that fails.
+/// \returns true if every step succeeded.
+bool HDFBaseCallsWriter::WriteOneZmw(const SMRTSequence & read) {
+    if (not zmwWriter_->WriteOneZmw(read)) return false;
+    if (not zmwMetricsWriter_->WriteOneZmw(read)) return false;
+    if (not _WriteBasecall(read)) return false;
+
+    if (FakeQualityValue() and not _WriteQualityValue(read)) return false;
+
+    // The remaining writers run in this fixed order and short-circuit on
+    // the first failure.
+    return _WriteDeletionQV(read)
+       and _WriteDeletionTag(read)
+       and _WriteInsertionQV(read)
+       and _WriteMergeQV(read)
+       and _WriteSubstitutionTag(read)
+       and _WriteSubstitutionQV(read)
+       and _WritePreBaseFrames(read)
+       and _WriteWidthInFrames(read);
+}
+
+/// Appends the read's bases (read.length of them) to the Basecall dataset.
+/// \returns always true.
+bool HDFBaseCallsWriter::_WriteBasecall(const SMRTSequence & read)
+{
+    const unsigned char * bases = (const unsigned char*) read.seq;
+    basecallArray_.Write(bases, read.length);
+    return true;
+}
+
+/// Writes the QualityValue dataset entry for \p read.
+/// Does nothing (and succeeds) unless FakeQualityValue() is set.
+/// The values written are the read's DeletionQV when it has one, otherwise
+/// read.length copies of MAX_QUALITY_VALUE.
+/// \returns false (with an error message) if the read is empty, else true.
+bool HDFBaseCallsWriter::_WriteQualityValue(const SMRTSequence & read) {
+    if (not FakeQualityValue()) {
+        return true;
+    }
+    if (read.length <= 0) {
+        AddErrorMessage(read.GetTitle() + std::string(" is empty."));
+        return false;
+    }
+    if (not read.deletionQV.Empty()) {
+        // Use deletionQV to fake QualityValue if possible.
+        qualityValueArray_.Write(read.deletionQV.data, read.length);
+    } else {
+        // Otherwise fill with MAX_QUALITY_VALUE. The vector owns the scratch
+        // buffer, so it is released even if Write() throws.
+        std::vector<QualityValue> fakedata(read.length,
+                                           static_cast<QualityValue>(MAX_QUALITY_VALUE));
+        qualityValueArray_.Write(fakedata.data(), read.length);
+    }
+    return true;
+}
+
+/// Writes the read's DeletionQV values when HasDeletionQV() is true.
+/// \returns false (with an error message) if the read has no DeletionQV
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WriteDeletionQV(const SMRTSequence & read) {
+    if (not HasDeletionQV()) return true;
+
+    if (read.deletionQV.Empty()) {
+        AddErrorMessage(std::string(PacBio::GroupNames::deletionqv) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    deletionQVArray_.Write(read.deletionQV.data, read.length);
+    return true;
+}
+
+/// Writes the read's DeletionTag values when HasDeletionTag() is true.
+/// \returns false (with an error message) if read.deletionTag is null
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WriteDeletionTag(const SMRTSequence & read) {
+    if (not HasDeletionTag()) return true;
+
+    if (read.deletionTag == nullptr) {
+        AddErrorMessage(std::string(PacBio::GroupNames::deletiontag) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    deletionTagArray_.Write(read.deletionTag, read.length);
+    return true;
+}
+
+/// Writes the read's InsertionQV values when HasInsertionQV() is true.
+/// \returns false (with an error message) if the read has no InsertionQV
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WriteInsertionQV(const SMRTSequence & read) {
+    if (not HasInsertionQV()) return true;
+
+    if (read.insertionQV.Empty()) {
+        AddErrorMessage(std::string(PacBio::GroupNames::insertionqv) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    insertionQVArray_.Write(read.insertionQV.data, read.length);
+    return true;
+}
+
+/// Writes the read's SubstitutionTag values when HasSubstitutionTag() is true.
+/// \returns false (with an error message) if read.substitutionTag is null
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WriteSubstitutionTag(const SMRTSequence & read) {
+    if (not HasSubstitutionTag()) return true;
+
+    if (read.substitutionTag == nullptr) {
+        AddErrorMessage(std::string(PacBio::GroupNames::substitutiontag) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    substitutionTagArray_.Write(read.substitutionTag, read.length);
+    return true;
+}
+
+/// Writes the read's SubstitutionQV values when HasSubstitutionQV() is true.
+/// \returns false (with an error message) if the read has no SubstitutionQV
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WriteSubstitutionQV(const SMRTSequence & read) {
+    if (not HasSubstitutionQV()) return true;
+
+    if (read.substitutionQV.Empty()) {
+        AddErrorMessage(std::string(PacBio::GroupNames::substitutionqv) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    substitutionQVArray_.Write(read.substitutionQV.data, read.length);
+    return true;
+}
+
+/// Writes the read's MergeQV values when HasMergeQV() is true.
+/// \returns false (with an error message) if the read has no MergeQV
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WriteMergeQV(const SMRTSequence & read) {
+    if (not HasMergeQV()) return true;
+
+    if (read.mergeQV.Empty()) {
+        AddErrorMessage(std::string(PacBio::GroupNames::mergeqv) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    mergeQVArray_.Write(read.mergeQV.data, read.length);
+    return true;
+}
+
+/// Writes the read's PreBaseFrames values when HasPreBaseFrames() is true.
+/// \returns false (with an error message) if read.preBaseFrames is null
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WritePreBaseFrames(const SMRTSequence & read) {
+    if (not HasPreBaseFrames()) return true;
+
+    if (read.preBaseFrames == nullptr) {
+        AddErrorMessage(std::string(PacBio::GroupNames::prebaseframes) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    preBaseFramesArray_.Write(read.preBaseFrames, read.length);
+    return true;
+}
+
+/// Writes the read's WidthInFrames values when HasWidthInFrames() is true.
+/// \returns false (with an error message) if read.widthInFrames is null
+///          although the dataset is being written; true otherwise.
+bool HDFBaseCallsWriter::_WriteWidthInFrames(const SMRTSequence & read) {
+    if (not HasWidthInFrames()) return true;
+
+    if (read.widthInFrames == nullptr) {
+        AddErrorMessage(std::string(PacBio::GroupNames::widthinframes) + " absent in read " + read.GetTitle());
+        return false;
+    }
+    widthInFramesArray_.Write(read.widthInFrames, read.length);
+    return true;
+}
+
+
+/// Flushes the Basecall dataset, then every optional dataset whose Has*()
+/// query is true, and finally the ZMW and ZMW metrics writers.
+void HDFBaseCallsWriter::Flush(void) {
+ basecallArray_.Flush();
+
+ // Optional datasets: each Has*() query is true only when the dataset was
+ // requested and its array has been initialized.
+ if (HasQualityValue()) qualityValueArray_.Flush();
+ if (HasDeletionQV()) deletionQVArray_.Flush();
+ if (HasDeletionTag()) deletionTagArray_.Flush();
+ if (HasInsertionQV()) insertionQVArray_.Flush();
+ if (HasMergeQV()) mergeQVArray_.Flush();
+ if (HasSubstitutionQV()) substitutionQVArray_.Flush();
+ if (HasSubstitutionTag()) substitutionTagArray_.Flush();
+ if (HasPreBaseFrames()) preBaseFramesArray_.Flush();
+ if (HasWidthInFrames()) widthInFramesArray_.Flush();
+
+ zmwWriter_->Flush();
+ zmwMetricsWriter_->Flush();
+}
+
+/// Flushes everything (see Flush()) and then closes the Basecall dataset
+/// and every optional dataset whose Has*() query is true.
+/// Also invoked by the destructor.
+void HDFBaseCallsWriter::Close(void) {
+ this->Flush();
+
+ basecallArray_.Close();
+
+ // NOTE(review): zmwWriter_ and zmwMetricsWriter_ are flushed above but are
+ // not closed here; presumably their own destructors do that -- confirm.
+ if (HasQualityValue()) qualityValueArray_.Close();
+ if (HasDeletionQV()) deletionQVArray_.Close();
+ if (HasDeletionTag()) deletionTagArray_.Close();
+ if (HasInsertionQV()) insertionQVArray_.Close();
+ if (HasMergeQV()) mergeQVArray_.Close();
+ if (HasSubstitutionQV()) substitutionQVArray_.Close();
+ if (HasSubstitutionTag()) substitutionTagArray_.Close();
+ if (HasPreBaseFrames()) preBaseFramesArray_.Close();
+ if (HasWidthInFrames()) widthInFramesArray_.Close();
+}
diff --git a/hdf/HDFBaseCallsWriter.hpp b/hdf/HDFBaseCallsWriter.hpp
new file mode 100644
index 0000000..a65f5b0
--- /dev/null
+++ b/hdf/HDFBaseCallsWriter.hpp
@@ -0,0 +1,233 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#ifndef _BLASR_HDF_BASECALLS_WRITER_HPP_
+#define _BLASR_HDF_BASECALLS_WRITER_HPP_
+
+#include <memory>
+#include <algorithm>
+#include "HDFAtom.hpp"
+#include "HDFWriterBase.hpp"
+#include "HDFZMWWriter.hpp"
+#include "HDFZMWMetricsWriter.hpp"
+
+/// \brief Writes basecalls, the optional per-base QV datasets, ZMW data and
+///        ZMW metrics into a BaseCalls group created under a parent group.
+class HDFBaseCallsWriter: public HDFWriterBase {
+ /// \name HDFBaseCallsWriter members
+ /// \{
+public:
+ /// \param[in] filename output file name, passed on to HDFWriterBase.
+ /// \param[in] parentGroup group under which the BaseCalls group is added.
+ /// \param[in] baseMap base map handed to the ZMW metrics writer.
+ /// \param[in] qvsToWrite names of the QV datasets to write; checked
+ /// with SanityCheckQVs().
+ /// \param[in] fakeQualityValue whether to write the QualityValue dataset.
+ HDFBaseCallsWriter(const std::string & filename,
+ HDFGroup & parentGroup,
+ const std::map<char, size_t> & baseMap,
+ const std::vector<std::string> & qvsToWrite = {},
+ const bool fakeQualityValue = true);
+
+ ~HDFBaseCallsWriter(void);
+
+ /// \brief Write base caller version (changeListId)
+ bool WriteBaseCallerVersion(const std::string & basecallerVersion);
+
+ /// \brief Write a zmw read.
+ bool WriteOneZmw(const SMRTSequence & read);
+
+ /// \brief return a vector of QV name strings specified in file format specification.
+ const std::vector<std::string> & ValidQVNames(void) const;
+
+ /// \brief return a vector of QV name strings to write.
+ const std::vector<std::string> & QVNamesToWrite(void) const;
+
+ void Flush(void);
+
+ void Close(void);
+
+public:
+ /// \brief Sanity check QVs to add. Remove QVs which are
+ /// not included in file format specification, and
+ /// remove redundant QVs.
+ /// \returns true if every requested QV is included in the specification,
+ /// false if at least one is not (an error message is added for
+ /// each unsupported QV).
+ bool SanityCheckQVs(const std::vector<std::string> & qvsToWrite);
+
+ /// \returns true if FakeQualityValue() and qualityValueArray_
+ /// has been initialized
+ inline bool HasQualityValue(void) const;
+
+ /// \returns true if has DeletionQV dataset and deletionQVArray_
+ /// has been initialized.
+ inline bool HasDeletionQV(void) const;
+ // The remaining Has*() queries follow the same rule for their own dataset.
+ inline bool HasDeletionTag(void) const;
+ inline bool HasInsertionQV(void) const;
+ inline bool HasSubstitutionTag(void) const;
+ inline bool HasSubstitutionQV(void) const;
+ inline bool HasMergeQV(void) const;
+ inline bool HasPreBaseFrames(void) const;
+ /// Same as HasPreBaseFrames().
+ inline bool HasIPD(void) const;
+ inline bool HasWidthInFrames(void) const;
+ /// Same as HasWidthInFrames().
+ inline bool HasPulseWidth(void) const;
+
+ /// \returns errors of this writer plus those of the ZMW and ZMW metrics writers.
+ std::vector<std::string> Errors(void) const;
+
+public:
+ /// \returns whether or not to fake QualityValue.
+ bool FakeQualityValue() const;
+
+private:
+ bool fakeQualityValue_;
+
+
+private:
+ /// \returns true if qvToQuery is one of the QV names to write.
+ inline bool _HasQV(const std::string & qvToQuery) const;
+
+ bool _WriteBasecall(const SMRTSequence & read);
+
+ /// Write fake values to the 'QualityValue' dataset.
+ bool _WriteQualityValue(const SMRTSequence & read);
+
+ /// Write real data in the following.
+ bool _WriteDeletionQV(const SMRTSequence & read);
+ bool _WriteDeletionTag(const SMRTSequence & read);
+ bool _WriteInsertionQV(const SMRTSequence & read);
+ bool _WriteSubstitutionTag(const SMRTSequence & read);
+ bool _WriteSubstitutionQV(const SMRTSequence & read);
+ bool _WriteMergeQV(const SMRTSequence & read);
+ bool _WritePreBaseFrames(const SMRTSequence & read);
+ bool _WriteWidthInFrames(const SMRTSequence & read);
+
+private:
+ /// \brief Create and initialize QV groups.
+ /// \returns Whether or not QV groups initialized successfully.
+ bool InitializeQVGroups(void);
+
+private:
+ HDFGroup & parentGroup_;
+ std::map<char, size_t> baseMap_;
+ std::vector<string> qvsToWrite_;
+ std::unique_ptr<HDFZMWWriter> zmwWriter_;
+ std::unique_ptr<HDFZMWMetricsWriter> zmwMetricsWriter_;
+ HDFGroup basecallsGroup_;
+
+private:
+ HDFAtom<string> changeListIDAtom_;
+
+ /// BaseCalls/Basecall group
+ BufferedHDFArray<unsigned char> basecallArray_;
+
+ /// This is a mandatory dataset for 2.3, whose existence is
+ /// to ensure bam2bax to generate 2.3 compatible bax.h5 files.
+ BufferedHDFArray<unsigned char> qualityValueArray_;
+
+ /// \brief Define arrays for rich quality values.
+ /// DeletionQV dq --> BaseCalls/DeletionQV
+ /// DeletionTag dt --> BaseCalls/DeletionTag
+ /// InsertionQV iq --> BaseCalls/InsertionQV
+ /// MergeQV mq --> BaseCalls/MergeQV
+ /// SubstitutionQV sq --> BaseCalls/SubstitutionQV
+ /// SubstitutionTag st --> BaseCalls/SubstitutionTag
+ /// Ipd:Frames ip --> BaseCalls/PreBaseFrames
+ /// PulseWidth:Frames pw --> BaseCalls/WidthInFrames
+ BufferedHDFArray<unsigned char> deletionQVArray_;
+ BufferedHDFArray<unsigned char> deletionTagArray_;
+ BufferedHDFArray<unsigned char> insertionQVArray_;
+ BufferedHDFArray<unsigned char> mergeQVArray_;
+ BufferedHDFArray<unsigned char> substitutionQVArray_;
+ BufferedHDFArray<unsigned char> substitutionTagArray_;
+ BufferedHDFArray<HalfWord> preBaseFramesArray_;
+ BufferedHDFArray<HalfWord> widthInFramesArray_;
+
+ /// \}
+};
+
+/// \returns true if \p qvToQuery is among the QV names to write.
+inline bool HDFBaseCallsWriter::_HasQV(const std::string & qvToQuery) const
+{
+    const auto last = qvsToWrite_.end();
+    return std::find(qvsToWrite_.begin(), last, qvToQuery) != last;
+}
+
+/// \returns true if QualityValue is to be faked and its array is initialized.
+inline bool HDFBaseCallsWriter::HasQualityValue(void) const
+{
+    if (not FakeQualityValue()) return false;
+    return qualityValueArray_.IsInitialized();
+}
+
+// Each query below is true only when the QV name was requested and the
+// matching array has been initialized.
+
+inline bool HDFBaseCallsWriter::HasDeletionQV(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::deletionqv)) return false;
+    return deletionQVArray_.IsInitialized();
+}
+
+inline bool HDFBaseCallsWriter::HasDeletionTag(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::deletiontag)) return false;
+    return deletionTagArray_.IsInitialized();
+}
+
+inline bool HDFBaseCallsWriter::HasInsertionQV(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::insertionqv)) return false;
+    return insertionQVArray_.IsInitialized();
+}
+
+inline bool HDFBaseCallsWriter::HasSubstitutionTag(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::substitutiontag)) return false;
+    return substitutionTagArray_.IsInitialized();
+}
+
+inline bool HDFBaseCallsWriter::HasSubstitutionQV(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::substitutionqv)) return false;
+    return substitutionQVArray_.IsInitialized();
+}
+
+inline bool HDFBaseCallsWriter::HasMergeQV(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::mergeqv)) return false;
+    return mergeQVArray_.IsInitialized();
+}
+
+inline bool HDFBaseCallsWriter::HasPreBaseFrames(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::prebaseframes)) return false;
+    return preBaseFramesArray_.IsInitialized();
+}
+
+/// Alias of HasPreBaseFrames().
+inline bool HDFBaseCallsWriter::HasIPD(void) const
+{
+    return HasPreBaseFrames();
+}
+
+inline bool HDFBaseCallsWriter::HasWidthInFrames(void) const
+{
+    if (not _HasQV(PacBio::GroupNames::widthinframes)) return false;
+    return widthInFramesArray_.IsInitialized();
+}
+
+/// Alias of HasWidthInFrames().
+inline bool HDFBaseCallsWriter::HasPulseWidth(void) const
+{
+    return HasWidthInFrames();
+}
+
+/// \returns the fakeQualityValue flag given at construction.
+inline bool HDFBaseCallsWriter::FakeQualityValue(void) const
+{
+    return fakeQualityValue_;
+}
+
+#endif
diff --git a/hdf/HDFBaxWriter.cpp b/hdf/HDFBaxWriter.cpp
new file mode 100644
index 0000000..745c2c5
--- /dev/null
+++ b/hdf/HDFBaxWriter.cpp
@@ -0,0 +1,141 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "HDFBaxWriter.hpp"
+
+/// Opens (truncating) the output file, adds the PulseData group below the
+/// root group, writes the scan data and creates the BaseCalls and Regions
+/// writers underneath PulseData.
+HDFBaxWriter::HDFBaxWriter(const std::string & filename,
+ const ScanData & sd,
+ const std::string & basecallerVersion,
+ const std::vector<std::string> & qvsToWrite,
+ const std::vector<std::string> & regionTypes,
+ const H5::FileAccPropList & fileAccPropList)
+ : HDFWriterBase(filename)
+ , fileaccproplist_(fileAccPropList)
+ , scandataWriter_(nullptr)
+ , basecallsWriter_(nullptr)
+ , regionsWriter_(nullptr)
+{
+ // sanity check chemistry meta data.
+ // The result is not inspected here: missing values only add error
+ // messages and construction continues.
+ SanityCheckChemistry(sd.BindingKit(),
+ sd.SequencingKit(),
+ basecallerVersion);
+
+ // open file
+ outfile_.Open(filename_, H5F_ACC_TRUNC, fileaccproplist_);
+
+ // Add PulseData group to the root group '/'
+ AddChildGroup(outfile_.rootGroup, pulseDataGroup_, PacBio::GroupNames::pulsedata);
+
+ // Create a ScanData writer.
+ scandataWriter_.reset(new HDFScanDataWriter(outfile_.rootGroup));
+ scandataWriter_->Write(sd);
+
+ // Create a BaseCaller writer.
+ // fakeQualityValue is left at its default value here.
+ basecallsWriter_.reset(new HDFBaseCallsWriter(filename_, pulseDataGroup_, sd.BaseMap(), qvsToWrite));
+ basecallsWriter_->WriteBaseCallerVersion(basecallerVersion);
+
+ // Create a Regions writer.
+ regionsWriter_.reset(new HDFRegionsWriter(filename_, pulseDataGroup_, regionTypes));
+}
+
+/// Closes all sub-writers and the output file (see Close()).
+HDFBaxWriter::~HDFBaxWriter(void)
+{
+    Close();
+}
+
+/// Flushes the BaseCalls writer, then the Regions writer.
+void HDFBaxWriter::Flush(void)
+{
+    HDFBaseCallsWriter & basecalls = *basecallsWriter_;
+    basecalls.Flush();
+
+    HDFRegionsWriter & regions = *regionsWriter_;
+    regions.Flush();
+}
+
+/// \returns this writer's own errors, followed by those of the BaseCalls
+///          writer and then those of the Regions writer.
+std::vector<std::string> HDFBaxWriter::Errors(void) {
+    std::vector<std::string> collected(errors_);
+
+    // Note: errors from scandataWriter_ are not included.
+    //for (auto error: scandataWriter_->Errors())
+    //    collected.emplace_back(error);
+
+    const auto basecallErrors = basecallsWriter_->Errors();
+    collected.insert(collected.end(), basecallErrors.begin(), basecallErrors.end());
+
+    const auto regionErrors = regionsWriter_->Errors();
+    collected.insert(collected.end(), regionErrors.begin(), regionErrors.end());
+
+    return collected;
+}
+
+/// Closes the BaseCalls, ScanData and Regions writers in that order, then
+/// the output file. Also invoked by the destructor.
+void HDFBaxWriter::Close(void) {
+ basecallsWriter_->Close();
+ scandataWriter_->Close();
+ regionsWriter_->Close();
+ // The file handle is closed last, after all writers.
+ outfile_.Close();
+}
+
+/// Checks that binding kit, sequencing kit and base caller version are all
+/// non-empty; an error message is added for each one that is empty.
+/// \returns true if all three strings are non-empty.
+bool HDFBaxWriter::SanityCheckChemistry(const std::string & bindingKit,
+                                        const std::string & sequencingKit,
+                                        const std::string & basecallerVersion)
+{
+    bool allSpecified = true;
+
+    if (bindingKit.empty()) {
+        AddErrorMessage("Binding kit must be specified.");
+        allSpecified = false;
+    }
+    if (sequencingKit.empty()) {
+        AddErrorMessage("Sequencing kit must be specified.");
+        allSpecified = false;
+    }
+    if (basecallerVersion.empty()) {
+        AddErrorMessage("Base caller version must be specified.");
+        allSpecified = false;
+    }
+
+    return allSpecified;
+}
+
+/// Writes one ZMW read through the BaseCalls writer.
+/// \returns the result reported by the BaseCalls writer.
+bool HDFBaxWriter::WriteOneZmw(const SMRTSequence & seq)
+{
+    const bool written = basecallsWriter_->WriteOneZmw(seq);
+    return written;
+}
+
+/// Writes one ZMW read followed by its region annotations.
+/// When \p regions is empty, a single HQRegion annotation with start, end
+/// and score of 0 is written for the read's hole number instead.
+/// \returns false if the read could not be written; otherwise the result
+///          of writing the regions.
+bool HDFBaxWriter::WriteOneZmw(const SMRTSequence & seq,
+                               const std::vector<RegionAnnotation> & regions)
+{
+    if (not this->WriteOneZmw(seq)) {
+        return false;
+    }
+    if (not regions.empty()) {
+        return regionsWriter_->Write(regions);
+    }
+
+    // No regions supplied: write a placeholder HQ region.
+    std::vector<RegionAnnotation> fake(1, RegionAnnotation(seq.HoleNumber(), HQRegion, 0, 0, 0));
+    return regionsWriter_->Write(fake);
+}
diff --git a/hdf/HDFBaxWriter.hpp b/hdf/HDFBaxWriter.hpp
new file mode 100644
index 0000000..c390d8f
--- /dev/null
+++ b/hdf/HDFBaxWriter.hpp
@@ -0,0 +1,172 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+
+#ifndef _BLASR_HDF_BAX_WRITER_HPP_
+#define _BLASR_HDF_BAX_WRITER_HPP_
+
+#include <sstream>
+#include <memory>
+#include "Enumerations.h"
+#include "SMRTSequence.hpp"
+#include "HDFFile.hpp"
+#include "HDFWriterBase.hpp"
+#include "HDFScanDataWriter.hpp"
+#include "HDFBaseCallsWriter.hpp"
+#include "HDFRegionsWriter.hpp"
+
+using namespace H5;
+using namespace std;
+
+/// \brief Writes a bax.h5 file: scan data, basecalls (with the selected QVs)
+///        and the region table.
+class HDFBaxWriter : public HDFWriterBase {
+public:
+ /// \name Constructor & Related Methods
+ /// \{
+ /// \brief Sets output h5 file name, scan data, base caller version
+ /// QVs to write, and h5 file access property list.
+ /// \param[in] filename output h5 file name.
+ /// \param[in] sd scan data, must contain bindingKit and sequencingKit.
+ /// \param[in] basecallerVersion meta data string
+ /// \param[in] qvsToWrite Quality values to include in output h5 file.
+ /// \param[in] regionTypes, regionTypes as /Regions/RegionTypes
+ /// \param[in] fileAccPropList H5 file access property list
+ HDFBaxWriter(const std::string & filename,
+ const ScanData & sd,
+ const std::string & basecallerVersion,
+ const std::vector<std::string> & qvsToWrite,
+ const std::vector<std::string> & regionTypes = PacBio::AttributeValues::Regions::regiontypes,
+ const H5::FileAccPropList & fileAccPropList = H5::FileAccPropList::DEFAULT);
+
+ ~HDFBaxWriter(void);
+
+ /// \brief Write one zmw sequence to output h5 file.
+ /// \param[in] seq, the SMRTSequence to write
+ bool WriteOneZmw(const SMRTSequence & seq);
+
+ /// \brief Write one zmw sequence and its region table to output h5 file.
+ /// \param[in] seq, the SMRTSequence to write
+ /// \param[in] regions, region annotations of this zmw.
+ bool WriteOneZmw(const SMRTSequence & seq,
+ const std::vector<RegionAnnotation> & regions);
+
+ /// \brief Flushes buffered data.
+ void Flush(void);
+
+ /// \returns errors of this writer, the base calls writer and the regions
+ /// writer (scan data writer errors are not included).
+ std::vector<std::string> Errors(void);
+
+ /// \}
+
+private:
+ /// \name Private Variables
+ /// \{
+ HDFFile outfile_; ///< HDFFile file handler
+
+ H5::FileAccPropList fileaccproplist_; ///< H5 file access property list
+
+ HDFGroup pulseDataGroup_; ///< /PulseData group
+
+private:
+ /// Points to scan data writer.
+ std::unique_ptr<HDFScanDataWriter> scandataWriter_;
+ /// Points to base caller writer.
+ std::unique_ptr<HDFBaseCallsWriter> basecallsWriter_;
+ /// Points to region table writer.
+ std::unique_ptr<HDFRegionsWriter> regionsWriter_;
+ /// \}
+
+public:
+ /// \name Which QV will be written.
+ /// \{
+ // Each query forwards to the base calls writer.
+ inline bool HasDeletionQV(void) const;
+ inline bool HasDeletionTag(void) const;
+ inline bool HasInsertionQV(void) const;
+ inline bool HasSubstitutionTag(void) const;
+ inline bool HasSubstitutionQV(void) const;
+ inline bool HasMergeQV(void) const;
+ inline bool HasPreBaseFrames(void) const;
+ /// Same as HasPreBaseFrames().
+ inline bool HasIPD(void) const;
+ inline bool HasWidthInFrames(void) const;
+ /// Same as HasWidthInFrames().
+ inline bool HasPulseWidth(void) const;
+ /// \}
+
+private:
+ /// \name Private Methods.
+ /// \{
+ /// \brief Checks whether chemistry triple, including
+ /// binding kit, sequencing kit and base caller version
+ /// are set.
+ /// If not, add error messages.
+ /// \returns true if all three are non-empty.
+ bool SanityCheckChemistry(const std::string & bindingKit,
+ const std::string & sequencingKit,
+ const std::string & basecallerVersion);
+
+ /// \brief Closes HDFBaxWriter.
+ void Close(void);
+ /// \}
+};
+
+// The queries below forward to the BaseCalls writer, which decides whether
+// each dataset is written.
+
+inline bool HDFBaxWriter::HasDeletionQV(void) const
+{
+    return (*basecallsWriter_).HasDeletionQV();
+}
+
+inline bool HDFBaxWriter::HasDeletionTag(void) const
+{
+    return (*basecallsWriter_).HasDeletionTag();
+}
+
+inline bool HDFBaxWriter::HasInsertionQV(void) const
+{
+    return (*basecallsWriter_).HasInsertionQV();
+}
+
+inline bool HDFBaxWriter::HasSubstitutionTag(void) const
+{
+    return (*basecallsWriter_).HasSubstitutionTag();
+}
+
+inline bool HDFBaxWriter::HasSubstitutionQV(void) const
+{
+    return (*basecallsWriter_).HasSubstitutionQV();
+}
+
+inline bool HDFBaxWriter::HasMergeQV(void) const
+{
+    return (*basecallsWriter_).HasMergeQV();
+}
+
+inline bool HDFBaxWriter::HasPreBaseFrames(void) const
+{
+    return (*basecallsWriter_).HasPreBaseFrames();
+}
+
+/// Alias of HasPreBaseFrames().
+inline bool HDFBaxWriter::HasIPD(void) const
+{
+    return HasPreBaseFrames();
+}
+
+inline bool HDFBaxWriter::HasWidthInFrames(void) const
+{
+    return (*basecallsWriter_).HasWidthInFrames();
+}
+
+/// Alias of HasWidthInFrames().
+inline bool HDFBaxWriter::HasPulseWidth(void) const
+{
+    return HasWidthInFrames();
+}
+#endif
diff --git a/hdf/HDFCmpFile.hpp b/hdf/HDFCmpFile.hpp
index d4d6984..ceefb94 100644
--- a/hdf/HDFCmpFile.hpp
+++ b/hdf/HDFCmpFile.hpp
@@ -182,6 +182,10 @@ public:
// 2.
HDFCmpRefAlignmentGroup *newGroup = new HDFCmpRefAlignmentGroup;
+ if (newGroup == nullptr) {
+ cout << "ERROR, unable to allocate memory for cmp.h5 file." << endl;
+ exit(1);
+ }
newGroup->Create(rootGroup.rootGroup, refGroupName);
refAlignGroups.push_back(newGroup);
unsigned int id = refAlignGroups.size();
@@ -362,6 +366,10 @@ public:
for (refSeqIndex = 0; refSeqIndex < cmpFile.refGroup.path.size(); refSeqIndex++) {
HDFCmpRefAlignmentGroup* refAlignGroup;
refAlignGroup = new HDFCmpRefAlignmentGroup;
+ if (refAlignGroup == nullptr) {
+ cout << "ERROR, unable to allocate memory for cmp.h5 file." << endl;
+ exit(1);
+ }
refAlignGroup->Initialize(rootGroup.rootGroup.group, cmpFile.refGroup.path[refSeqIndex]);
int refAlignGroupIndex = refAlignGroups.size();
refAlignGroups.push_back(refAlignGroup);
@@ -630,7 +638,7 @@ public:
//
int queryStart = cmpAlignment.GetQueryStart();
int queryEnd = cmpAlignment.GetQueryEnd();
- read.holeNumber = cmpAlignment.GetHoleNumber();
+ read.HoleNumber(cmpAlignment.GetHoleNumber());
int refGroupId = cmpAlignment.GetRefGroupId();
int alnGroupId = cmpAlignment.GetAlnGroupId();
int refGroupIndex = refGroupIdToArrayIndex[refGroupId];
@@ -814,6 +822,10 @@ public:
// 2.
HDFCmpRefAlignmentGroup *newGroup = new HDFCmpRefAlignmentGroup;
+ if (newGroup == nullptr) {
+ cout << "ERROR, unable to allocate memory for cmp.h5 file." << endl;
+ exit(1);
+ }
newGroup->Create(rootGroup.rootGroup, refGroupName);
refAlignGroups.push_back(newGroup);
diff --git a/hdf/HDFCmpReader.hpp b/hdf/HDFCmpReader.hpp
index f12d2e6..38a2839 100644
--- a/hdf/HDFCmpReader.hpp
+++ b/hdf/HDFCmpReader.hpp
@@ -233,6 +233,7 @@ public:
for (refSeqIndex = 0; refSeqIndex < cmpFile.refGroup.path.size(); refSeqIndex++) {
HDFCmpRefAlignmentGroup* refAlignGroup;
refAlignGroup = new HDFCmpRefAlignmentGroup;
+ if (refAlignGroup == nullptr) {cout << "ERROR, unable to allocate memory for HDFCmpReader." << endl; exit(1);}
refAlignGroup->Initialize(rootGroup.rootGroup.group, cmpFile.refGroup.path[refSeqIndex]);
int refAlignGroupIndex = refAlignGroups.size();
refAlignGroups.push_back(refAlignGroup);
diff --git a/hdf/HDFCmpRefAlignmentGroup.hpp b/hdf/HDFCmpRefAlignmentGroup.hpp
index 69650ec..165dc17 100644
--- a/hdf/HDFCmpRefAlignmentGroup.hpp
+++ b/hdf/HDFCmpRefAlignmentGroup.hpp
@@ -56,6 +56,7 @@ class HDFCmpRefAlignmentGroup {
//
int newReadGroupIndex = readGroups.size();
HDFCmpExperimentGroup* readGroupPtr = new HDFCmpExperimentGroup;
+ if (readGroupPtr == nullptr) {cout << "ERROR, failed to allocate memory for HDFCmpExperimentGroup!" << endl; exit(1);}
readGroups.push_back(readGroupPtr);
experimentNameToIndex[readGroupName] = newReadGroupIndex;
@@ -77,7 +78,7 @@ class HDFCmpRefAlignmentGroup {
HDFCmpExperimentGroup* InitializeExperimentGroup(string experimentGroupName, set<string> &includedFields) {
if (refGroup.ContainsObject(experimentGroupName)) {
HDFCmpExperimentGroup* newGroup = new HDFCmpExperimentGroup;
-
+ if (newGroup == nullptr) {cout << "ERROR, failed to allocate memory for HDFCmpExperimentGroup!" << endl; exit(1);}
if (newGroup->Initialize(refGroup, experimentGroupName, includedFields) == 0) {
cout << "ERROR, could not initialize the exp group." << endl;
exit(1);
diff --git a/hdf/HDFData.cpp b/hdf/HDFData.cpp
index 0283992..1d1d090 100644
--- a/hdf/HDFData.cpp
+++ b/hdf/HDFData.cpp
@@ -7,7 +7,7 @@ H5Location* HDFData::GetObject() {
return &dataset;
}
-HDFData::HDFData(CommonFG* _container, string _datasetName) {
+HDFData::HDFData(CommonFG* _container, const string & _datasetName) {
container = _container;
datasetName = _datasetName;
fileDataSpaceInitialized = false;
@@ -20,7 +20,7 @@ HDFData::HDFData() {
isInitialized = false;
}
-bool HDFData::IsInitialized() {
+bool HDFData::IsInitialized() const {
return isInitialized;
}
@@ -32,18 +32,18 @@ int HDFData::Initialize(HDFGroup &parentGroup, const string &datasetName) {
exit(1);
}
-int HDFData::BaseInitializeDataset(CommonFG &hdfFile, string _datasetName) {
+int HDFData::BaseInitializeDataset(CommonFG &hdfFile, const string & _datasetName) {
dataset = hdfFile.openDataSet(_datasetName.c_str());
isInitialized = true;
fileDataSpaceInitialized = true;
return 1;
}
-int HDFData::InitializeDataset(HDFGroup &group, string _datasetName) {
+int HDFData::InitializeDataset(HDFGroup &group, const string & _datasetName) {
return InitializeDataset(group.group, _datasetName);
}
-int HDFData::InitializeDataset(CommonFG &hdfFile, string _datasetName) {
+int HDFData::InitializeDataset(CommonFG &hdfFile, const string & _datasetName) {
try {
datasetName = _datasetName;
dataset = hdfFile.openDataSet(_datasetName.c_str());
diff --git a/hdf/HDFData.hpp b/hdf/HDFData.hpp
index 5ca1455..f31a313 100644
--- a/hdf/HDFData.hpp
+++ b/hdf/HDFData.hpp
@@ -20,11 +20,11 @@ public:
H5::H5Location* GetObject();
- HDFData(H5::CommonFG* _container, std::string _datasetName);
+ HDFData(H5::CommonFG* _container, const std::string & _datasetName);
HDFData();
- bool IsInitialized();
+ bool IsInitialized() const;
//
// Allow derived classes to be initialized generically.
@@ -40,11 +40,11 @@ public:
//
virtual int Initialize(HDFGroup &parentGroup, const std::string &datasetName);
- int BaseInitializeDataset(H5::CommonFG &hdfFile, std::string _datasetName);
+ int BaseInitializeDataset(H5::CommonFG &hdfFile, const std::string & _datasetName);
- int InitializeDataset(HDFGroup &group, std::string _datasetName);
+ int InitializeDataset(HDFGroup &group, const std::string & _datasetName);
- int InitializeDataset(H5::CommonFG &hdfFile, std::string _datasetName);
+ int InitializeDataset(H5::CommonFG &hdfFile, const std::string & _datasetName);
void Close();
};
diff --git a/hdf/HDFPlsReader.hpp b/hdf/HDFPlsReader.hpp
index 750be08..e440674 100644
--- a/hdf/HDFPlsReader.hpp
+++ b/hdf/HDFPlsReader.hpp
@@ -311,7 +311,7 @@ class HDFPlsReader : public DatasetCollection, public HDFPulseDataFile {
Nucleotide * destSeqCopy = NULL;
if (destSequence != "") {
- destSeqCopy = new Nucleotide[destSequence.size()];
+ destSeqCopy = ProtectedNew<Nucleotide>(destSequence.size());
for(int i = 0 ; i < destSequence.size(); i++) {
destSeqCopy[i] = (Nucleotide)destSequence[i];
}
@@ -446,7 +446,7 @@ class HDFPlsReader : public DatasetCollection, public HDFPulseDataFile {
signalMatrix.Read(curPos, curPos + plsSeqLength, &signal[0]); // read off one row.
int i;
for (i = 0; i < basSeqLength; i++) {
- dest[i] = signal[basToPlsIndex[i]*4 + scanDataReader.baseMap[basSeq[i]]];
+ dest[i] = signal[basToPlsIndex[i]*4 + scanDataReader.BaseMap()[basSeq[i]]];
}
}
else {
@@ -479,7 +479,7 @@ class HDFPlsReader : public DatasetCollection, public HDFPulseDataFile {
pulseStartFrame.resize(seqLength);
startFrameArray.Read(curPos, curPos + seqLength, &pulseStartFrame[0]);
if (read.startFrame) {delete [] read.startFrame; read.startFrame = NULL;}
- read.startFrame = new unsigned int[read.length];
+ read.startFrame = ProtectedNew<unsigned int>(read.length);
StoreField(pulseStartFrame, basToPlsIndex, read.startFrame, read.length);
}
@@ -488,27 +488,27 @@ class HDFPlsReader : public DatasetCollection, public HDFPulseDataFile {
pulseWidthInFrames.resize(seqLength);
plsWidthInFramesArray.Read(curPos, curPos + seqLength, &pulseWidthInFrames[0]);
if (read.widthInFrames) {delete [] read.widthInFrames; read.widthInFrames = NULL;}
- read.widthInFrames = new HalfWord[read.length];
+ read.widthInFrames = ProtectedNew<HalfWord>(read.length);
StoreField(pulseWidthInFrames, basToPlsIndex, read.widthInFrames, read.length);
}
if (includedFields["MidSignal"]) {
if (read.midSignal) {delete [] read.midSignal; read.midSignal = NULL;}
- read.midSignal = new HalfWord[read.length];
+ read.midSignal = ProtectedNew<HalfWord>(read.length);
ReadSignal("MidSignal", midSignalArray, midSignalMatrix, seqLength, midSignalNDims,
read.seq, read.length, basToPlsIndex, read.midSignal);
}
if (includedFields["MaxSignal"]) {
if (read.maxSignal) {delete [] read.maxSignal; read.maxSignal = NULL;}
- read.maxSignal = new HalfWord[read.length];
+ read.maxSignal = ProtectedNew<HalfWord>(read.length);
ReadSignal("MaxSignal", maxSignalArray, maxSignalMatrix, seqLength, maxSignalNDims,
read.seq, read.length, basToPlsIndex, read.maxSignal);
}
if (includedFields["MeanSignal"]) {
if (read.meanSignal) {delete [] read.meanSignal; read.meanSignal = NULL;}
- read.meanSignal = new HalfWord[read.length];
+ read.meanSignal = ProtectedNew<HalfWord>(read.length);
ReadSignal("MeanSignal", meanSignalArray, meanSignalMatrix, seqLength, meanSignalNDims,
read.seq, read.length, basToPlsIndex, read.meanSignal);
}
@@ -518,7 +518,7 @@ class HDFPlsReader : public DatasetCollection, public HDFPulseDataFile {
pulseClassifierQV.resize(seqLength);
classifierQVArray.Read(curPos, curPos + seqLength, &pulseClassifierQV[0]);
if (read.classifierQV) {delete [] read.classifierQV; read.classifierQV = NULL;}
- read.classifierQV = new float[read.length];
+ read.classifierQV = ProtectedNew<float>(read.length);
StoreField(pulseClassifierQV, basToPlsIndex, read.classifierQV, read.length);
}
diff --git a/hdf/HDFRegionTableReader.cpp b/hdf/HDFRegionTableReader.cpp
index 80bc2db..4df91ea 100644
--- a/hdf/HDFRegionTableReader.cpp
+++ b/hdf/HDFRegionTableReader.cpp
@@ -1,3 +1,4 @@
+#include <cassert>
#include "HDFRegionTableReader.hpp"
using namespace std;
@@ -29,30 +30,39 @@ int HDFRegionTableReader::Initialize(string ®ionTableFileName,
return 0;
}
- nRows = regions.GetNRows();
-
- if (columnNames.Initialize(regions.dataset, "ColumnNames") == 0) {
+ if (columnNames.Initialize(regions, "ColumnNames") == 0) {
return 0;
}
- if (regionTypes.Initialize(regions.dataset, "RegionTypes") == 0) {
+ if (regionTypes.Initialize(regions, "RegionTypes") == 0) {
return 0;
}
- if (regionDescriptions.Initialize(regions.dataset, "RegionDescriptions") == 0) {
+ if (regionDescriptions.Initialize(regions, "RegionDescriptions") == 0) {
return 0;
}
- if (regionSources.Initialize(regions.dataset, "RegionSources") == 0) {
+ if (regionSources.Initialize(regions, "RegionSources") == 0) {
return 0;
}
+ nRows = regions.GetNRows();
+ isInitialized_ = true;
curRow = 0;
return 1;
}
+bool HDFRegionTableReader::IsInitialized(void) const {
+ return isInitialized_; // set to true at the end of a successful Initialize(), reset to false by Close()
+}
+
+bool HDFRegionTableReader::HasRegionTable(void) const {
+ assert(IsInitialized() && "HDFRegionTable is not initialize!"); // precondition: Initialize() succeeded (only checked when asserts are enabled)
+ return fileContainsRegionTable; // the same flag that GetNext() and ReadTable() test before reading
+}
+
int HDFRegionTableReader::GetNext(RegionAnnotation &annotation) {
+ assert(IsInitialized() && "HDFRegionTable is not initialize!");
//
// Bail with no-op if this is the last row.
//
-
if (fileContainsRegionTable == false) {
return 0;
}
@@ -66,69 +76,58 @@ int HDFRegionTableReader::GetNext(RegionAnnotation &annotation) {
return 1;
}
-void HDFRegionTableReader::RegionTypesToMap(RegionTable &table) {
- size_t i;
- table.regionTypeEnums.resize(table.regionTypes.size());
- for (i = 0;i < table.regionTypes.size(); i++) {
- if (table.regionTypes[i] == "GlobalAccuracy") {
- table.regionTypeEnums[i] = GlobalAccuracy;
- }
- else if (table.regionTypes[i] == "HQRegion") {
- table.regionTypeEnums[i] = HQRegion;
- }
- else if (table.regionTypes[i] == "Adapter") {
- table.regionTypeEnums[i] = Adapter;
- }
- else if (table.regionTypes[i] == "Insert") {
- table.regionTypeEnums[i] = Insert;
- }
- else if (table.regionTypes[i] == "Accuracy") {
- table.regionTypeEnums[i] = Insert;
- }
- else if (table.regionTypes[i] == "ArtifactRegion") {
- table.regionTypeEnums[i] = ArtifactRegion;
- }
- else {
- cout << "ERROR! Region Type " << table.regionTypes[i] << " is not supported. Check Enumerations.h" << endl;
- assert(0);
- }
- }
-}
-
-int HDFRegionTableReader::ReadTableAttributes(RegionTable &table) {
- if (fileContainsRegionTable == false) {
- return 0;
- }
- columnNames.Read(table.columnNames);
- regionTypes.Read(table.regionTypes);
- RegionTypesToMap(table);
- regionDescriptions.Read(table.regionDescriptions);
- regionSources.Read(table.regionSources);
- // All ok.
- return 1;
-}
void HDFRegionTableReader::Close() {
+ isInitialized_ = false; // from here on IsInitialized() is false, so the asserting methods must not be called
+ fileContainsRegionTable = false;
+ columnNames.Close(); // close the four attribute atoms that Initialize() opened on `regions`
+ regionTypes.Close();
+ regionDescriptions.Close();
+ regionSources.Close();
pulseDataGroup.Close();
regions.Close();
regionTableFile.Close();
}
-void HDFRegionTableReader::ReadTable(RegionTable &table) {
- if (fileContainsRegionTable == false) {
- return;
- }
- ReadTableAttributes(table);
- table.table.resize(nRows);
- int i = 0;
- while(GetNext(table.table[curRow])) {
- i++;
+// Reads the whole `Regions` dataset and its attributes into `table`.
+// Note that (1) there is NO GUARANTEE that region annotations in the hdf5
+// `Regions` dataset are sorted in any order, so we cannot iterate over
+// `Regions` to traverse zmws in order. (2) region table of a million zmws is approximately 5M.
+void HDFRegionTableReader::ReadTable(RegionTable & table) {
+ assert(IsInitialized() && "HDFRegionTable is not initialize!");
+ table.Reset(); // always called, even when the file has no region table (presumably empties `table` -- confirm in RegionTable)
+
+ if (fileContainsRegionTable) {
+ // Read attributes; every one except RegionTypes is optional.
+ std::vector<std::string> names, types, descs, sources;
+ if (columnNames.IsInitialized()) columnNames.Read(names);
+ if (regionTypes.IsInitialized()) regionTypes.Read(types);
+ else { // RegionTypes is mandatory: terminate the process when it is missing
+ cout << "ERROR MUST HAVE REGIONTYPES" << endl;
+ exit(1);
+ }
+ if (regionDescriptions.IsInitialized()) regionDescriptions.Read(descs);
+ if (regionSources.IsInitialized()) regionSources.Read(sources);
+
+ // Read region annotations, one dataset row per Read() call.
+ std::vector<RegionAnnotation> ras;
+ ras.resize(nRows);
+ assert(curRow == 0); // the member cursor doubles as the loop index, so it must still be at the first row
+ for (; curRow < nRows; curRow++) { // NOTE: leaves curRow == nRows once the loop finishes
+ regions.Read(curRow, curRow+1, ras[curRow].row);
+ }
+
+ // Reconstruct table from the rows and the attribute vectors read above.
+ table.ConstructTable(ras, types);
+ table.ColumnNames(names);
+ table.RegionDescriptions(descs);
+ table.RegionSources(sources);
}
}
-
void HDFRegionTableReader::GetMinMaxHoleNumber(UInt &minHole,
UInt &maxHole) {
+ assert(IsInitialized() && "HDFRegionTable is not initialize!");
// Hole numbers may not be sorted ascendingly, so do not
// return the first and last hole numbers as the min and max.
UInt saveCurRow = curRow;
diff --git a/hdf/HDFRegionTableReader.hpp b/hdf/HDFRegionTableReader.hpp
index 470cddf..baabfc7 100644
--- a/hdf/HDFRegionTableReader.hpp
+++ b/hdf/HDFRegionTableReader.hpp
@@ -14,7 +14,7 @@
class HDFRegionTableReader {
-public:
+private:
HDFFile regionTableFile;
HDFGroup pulseDataGroup;
HDF2DArray<int> regions;
@@ -23,24 +23,36 @@ public:
HDFAtom<std::vector<std::string> > regionDescriptions;
HDFAtom<std::vector<std::string> > regionSources;
HDFAtom<std::vector<std::string> > columnNames;
+
int curRow;
+
+ bool isInitialized_; // whether or not this reader is initialized.
+
int nRows;
+
bool fileContainsRegionTable;
- int Initialize(std::string ®ionTableFileName,
- const H5::FileAccPropList & fileAccPropList = H5::FileAccPropList::DEFAULT);
+public:
- int GetNext(RegionAnnotation &annotation);
+ HDFRegionTableReader(void)
+ : curRow(0), isInitialized_(false), nRows(0)
+ , fileContainsRegionTable(false) {}
- void RegionTypesToMap(RegionTable &table);
+ int Initialize(std::string ®ionTableFileName,
+ const H5::FileAccPropList & fileAccPropList = H5::FileAccPropList::DEFAULT);
- int ReadTableAttributes(RegionTable &table);
+ bool IsInitialized(void) const;
- void Close();
+ bool HasRegionTable(void) const;
- void ReadTable(RegionTable &table);
-
void GetMinMaxHoleNumber(UInt &minHole, UInt &maxHole);
+
+ void ReadTable(RegionTable &table);
+
+ void Close();
+
+private:
+ int GetNext(RegionAnnotation &annotation);
};
diff --git a/hdf/HDFRegionsWriter.cpp b/hdf/HDFRegionsWriter.cpp
new file mode 100644
index 0000000..05427c6
--- /dev/null
+++ b/hdf/HDFRegionsWriter.cpp
@@ -0,0 +1,99 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "HDFRegionsWriter.hpp"
+
+HDFRegionsWriter::HDFRegionsWriter(const std::string & filename,
+ HDFGroup & parentGroup,
+ const std::vector<std::string> & regionTypes)
+ : HDFWriterBase(filename)
+ , parentGroup_(parentGroup) // kept as a reference member: parentGroup has to outlive this writer
+ , regionTypes_(regionTypes) // copied; written out later by WriteAttributes()
+ , curRow_(0)
+{
+ // Set up the 2D 'regions' array under the parent group (third argument presumably the row width -- confirm in HDF2DArray).
+ regionsArray_.Initialize(parentGroup_, PacBio::GroupNames::regions, RegionAnnotation::NCOLS);
+}
+
+HDFRegionsWriter::~HDFRegionsWriter(void)
+{
+ WriteAttributes(); // result ignored here; when no row was written it only records an error message
+ Close(); // flushes and closes regionsArray_
+}
+
+bool HDFRegionsWriter::WriteAttributes(void)
+{
+    // Attach the four Regions attributes; refuse, with an error message, while no row has been written.
+    if (not (curRow_ > 0)) {
+        AddErrorMessage("Could not write attributes when Regions group is empty.");
+        return false;
+    }
+    AddAttribute(regionsArray_, PacBio::AttributeNames::Regions::columnnames, PacBio::AttributeValues::Regions::columnnames);
+    AddAttribute(regionsArray_, PacBio::AttributeNames::Regions::regiontypes, regionTypes_);
+    AddAttribute(regionsArray_, PacBio::AttributeNames::Regions::regiondescriptions, PacBio::AttributeValues::Regions::regiondescriptions);
+    AddAttribute(regionsArray_, PacBio::AttributeNames::Regions::regionsources, PacBio::AttributeValues::Regions::regionsources);
+    return true;
+}
+
+bool HDFRegionsWriter::Write(const std::vector<RegionAnnotation> &annotations) {
+    for (const auto & annotation: annotations) // bind by reference: no per-element copy of the annotation
+        if (not Write(annotation))             // append one row; give up at the first row that fails
+            return false;
+    return true;                               // also reached for an empty vector
+}
+
+bool HDFRegionsWriter::Write(const RegionAnnotation &annotation) {
+    try { // append the annotation as one NCOLS-wide row of the 'regions' array
+        regionsArray_.WriteRow(annotation.row, HDFRegionsWriter::NCOLS);
+    }
+    catch (const H5::Exception &) {
+        // std::to_string is required: `"literal" + int` offsets the literal's pointer instead of appending the number.
+        AddErrorMessage("Failed to write region annotation " + std::to_string(annotation.GetHoleNumber()));
+        return false;
+    }
+    ++curRow_; // count only rows that were actually written
+    return true;
+}
+
+void HDFRegionsWriter::Flush(void) {
+ regionsArray_.Flush(); // delegate to the underlying 2D array; also invoked by Close()
+}
+
+void HDFRegionsWriter::Close(void) {
+ Flush(); // flush first, then close the array
+ regionsArray_.Close(); // NOTE: attributes are written by the destructor, not here
+}
diff --git a/hdf/HDFRegionsWriter.hpp b/hdf/HDFRegionsWriter.hpp
new file mode 100644
index 0000000..13e2445
--- /dev/null
+++ b/hdf/HDFRegionsWriter.hpp
@@ -0,0 +1,101 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#ifndef _HDF_REGIONS_WRITER_HPP_
+#define _HDF_REGIONS_WRITER_HPP_
+
+#include <string>
+#include "Enumerations.h"
+#include "reads/RegionTable.hpp"
+#include "HDFFile.hpp"
+#include "HDFArray.hpp"
+#include "HDF2DArray.hpp"
+#include "HDFAtom.hpp"
+#include "HDFWriterBase.hpp"
+
+using namespace H5;
+using namespace std;
+
+class HDFRegionsWriter: public HDFWriterBase {
+public:
+ /// \name Constructor and destructor
+ /// \{
+ /// \param[in] filename hdf file name
+ /// \param[in] parentGroup parent hdf group in hierarchy (held by reference, must outlive the writer)
+ HDFRegionsWriter(const std::string & filename,
+ HDFGroup & parentGroup,
+ const std::vector<std::string> & regionTypes = PacBio::AttributeValues::Regions::regiontypes);
+ ~HDFRegionsWriter(void); ///< writes the attributes, then closes the writer
+ /// \}
+
+private:
+ /// \name Private variables for hdf IO.
+ /// \{
+ HDFGroup & parentGroup_; ///< parent hdf group
+
+ /// A vector of strings of region types for RegionTypeIndex to look up. Order matters!
+ std::vector<std::string> regionTypes_;
+
+ HDF2DArray<int> regionsArray_; ///< HDF2DArray for writing regions to hdf
+
+ int curRow_; ///< index of the next row to write (= number of rows written so far)
+
+ static const int NCOLS = 5; ///< number of columns in Regions table. NOTE(review): the array is initialized with RegionAnnotation::NCOLS -- confirm both agree.
+
+ /// \brief Write attributes of 'regions'; returns false (and records an error) if no row has been written.
+ bool WriteAttributes(void);
+ /// \}
+
+public:
+ /// \name Method to write region annotations.
+ /// \{
+ /// \brief Append a vector of region annotations to 'regions'
+ /// \param[in] annotations - region annotations to append.
+ /// \returns true if succeeded; stops and returns false at the first annotation that fails.
+ bool Write(const std::vector<RegionAnnotation> &annotations);
+
+ /// \brief Append a region annotation to 'regions'
+ /// \param[in] annotation - region annotation to append
+ /// \returns true if succeeded, false if the hdf write threw.
+ bool Write(const RegionAnnotation &annotation);
+
+ void Flush(void); ///< flushes the 'regions' array
+
+ void Close(void); ///< flushes, then closes the 'regions' array
+};
+
+#endif
diff --git a/hdf/HDFScanDataReader.cpp b/hdf/HDFScanDataReader.cpp
index daed4b2..3b2443d 100644
--- a/hdf/HDFScanDataReader.cpp
+++ b/hdf/HDFScanDataReader.cpp
@@ -89,7 +89,7 @@ int HDFScanDataReader::Initialize(HDFGroup *pulseDataGroup) {
// Load baseMap which maps bases (ATGC) to channel orders.
// This should always be present.
//
- if (LoadBaseMap(baseMap) == 0)
+ if (LoadBaseMap(baseMap_) == 0)
return 0;
//
@@ -126,6 +126,10 @@ int HDFScanDataReader::Read(ScanData &scanData) {
whenStartedAtom.Read(scanData.whenStarted);
}
+ ReadSequencingKit(scanData.sequencingKit_);
+
+ ReadBindingKit(scanData.bindingKit_);
+
return 1;
}
@@ -190,7 +194,7 @@ int HDFScanDataReader::LoadMovieName(string &movieNameP) {
}
}
-int HDFScanDataReader::LoadBaseMap(map<char, int> & baseMap) {
+int HDFScanDataReader::LoadBaseMap(map<char, size_t> & baseMap) {
// Map bases to channel order in hdf pls file.
if (dyeSetGroup.ContainsAttribute("BaseMap") and
baseMapAtom.Initialize(dyeSetGroup, "BaseMap")) {
@@ -204,8 +208,8 @@ int HDFScanDataReader::LoadBaseMap(map<char, int> & baseMap) {
baseMap.clear();
for(size_t i = 0; i < baseMapStr.size(); i++) {
baseMap[toupper(baseMapStr[i])] = i;
- baseMap[tolower(baseMapStr[i])] = i;
}
+ this->baseMap_ = baseMap;
return 1;
}
return 0;
@@ -213,20 +217,21 @@ int HDFScanDataReader::LoadBaseMap(map<char, int> & baseMap) {
void HDFScanDataReader::Close() {
if (useMovieName) {
- movieNameAtom.dataspace.close();
+ movieNameAtom.Close();
}
if (useRunCode) {
- runCodeAtom.dataspace.close();
+ runCodeAtom.Close();
}
if (useWhenStarted) {
- whenStartedAtom.dataspace.close();
+ whenStartedAtom.Close();
}
- baseMapAtom.dataspace.close();
- platformIdAtom.dataspace.close();
- frameRateAtom.dataspace.close();
- numFramesAtom.dataspace.close();
- sequencingKitAtom.dataspace.close();
- bindingKitAtom.dataspace.close();
+
+ baseMapAtom.Close();
+ platformIdAtom.Close();
+ frameRateAtom.Close();
+ numFramesAtom.Close();
+ sequencingKitAtom.Close();
+ bindingKitAtom.Close();
scanDataGroup.Close();
dyeSetGroup.Close();
diff --git a/hdf/HDFScanDataReader.hpp b/hdf/HDFScanDataReader.hpp
index 7da6649..b9ec164 100644
--- a/hdf/HDFScanDataReader.hpp
+++ b/hdf/HDFScanDataReader.hpp
@@ -38,7 +38,6 @@ public:
//
bool useMovieName;
std::string movieName, runCode;
- std::map<char, int> baseMap;
PlatformId platformId;
HDFScanDataReader();
@@ -81,11 +80,15 @@ public:
int LoadMovieName(std::string &movieName);
- int LoadBaseMap(map<char, int> & baseMap);
+ int LoadBaseMap(map<char, size_t> & baseMap);
+
+ std::map<char, size_t> BaseMap(void) const {return baseMap_;}
void Close();
private:
+ std::map<char, size_t> baseMap_;
+
/// Reads value of a string attribute within a HDFGroup.
/// \returns 1 if succesfully read value of the string attribute, 0 otherwise.
/// \param[out] attributeValue, value of a string attribute.
diff --git a/hdf/HDFScanDataWriter.cpp b/hdf/HDFScanDataWriter.cpp
index 874aba0..34e75c7 100644
--- a/hdf/HDFScanDataWriter.cpp
+++ b/hdf/HDFScanDataWriter.cpp
@@ -1,3 +1,40 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
#include "HDFScanDataWriter.hpp"
void HDFScanDataWriter::CreateAcqParamsGroup() {
@@ -15,7 +52,7 @@ void HDFScanDataWriter::CreateDyeSetGroup(){
std::cout << "ERROR could not create /ScanData/DyeSet." << std::endl;
exit(1);
}
- baseMapAtom.Create(dyeSetGroup.group, "BaseMap");
+ baseMapAtom.Create(dyeSetGroup.group, PacBio::AttributeNames::ScanData::DyeSet::basemap);
numAnalogAtom.Create(dyeSetGroup.group, "NumAnalog");
}
@@ -28,6 +65,8 @@ void HDFScanDataWriter::CreateRunInfoGroup(){
platformIdAtom.Create(runInfoGroup.group, "PlatformId");
platformNameAtom.Create(runInfoGroup.group, "PlatformName");
runCodeAtom.Create(runInfoGroup.group, "RunCode");
+ bindingKitAtom.Create(runInfoGroup.group, "BindingKit");
+ sequencingKitAtom.Create(runInfoGroup.group, "SequencingKit");
}
HDFScanDataWriter::HDFScanDataWriter(HDFFile & _outFile) {
@@ -39,8 +78,7 @@ HDFScanDataWriter::HDFScanDataWriter(HDFGroup & _rootGroup) {
}
HDFScanDataWriter::~HDFScanDataWriter() {
- // Assume that closing the hdf file must be done
- // manually and not in a destructor.
+ this->Close();
}
int HDFScanDataWriter::Initialize(HDFGroup & _rootGroup) {
@@ -61,31 +99,45 @@ int HDFScanDataWriter::Initialize(HDFGroup & _rootGroup) {
return 1;
}
-void HDFScanDataWriter::Write(ScanData & scanData) {
+void HDFScanDataWriter::Write(const ScanData & scanData) {
+ const float DEFAULT_FRAMERATE = 75.0; // fallbacks substituted below when the matching ScanData field is 0 / empty
+ const unsigned int DEFAULT_NUMFRAMES = 1000000;
+ const std::string DEFAULT_DATE = "2013-01-01T01:01:01";
+ const int DEFAULT_NUMANALOG = 4;
+ const std::string DEFAULT_MOVIENAME = "simulated_movie";
+ const std::string DEFAULT_RUNCODE = "simulated_runcode";
+
WriteFrameRate((scanData.frameRate==0)?
- (75):(scanData.frameRate));
+ (DEFAULT_FRAMERATE):(scanData.frameRate));
WriteNumFrames((scanData.numFrames==0)?
- (1000000):(scanData.numFrames));
+ (DEFAULT_NUMFRAMES):(scanData.numFrames));
WriteWhenStarted((scanData.whenStarted.empty())?
- ("2013-01-01T01:01:01"):(scanData.whenStarted));
- std::string baseMapStr = BaseMapToStr(scanData.baseMap);
- WriteBaseMap((baseMapStr == "")?("TGAC"):baseMapStr);
- WriteNumAnalog(4);
+ (DEFAULT_DATE):(scanData.whenStarted));
+
+ // Base map is VITAL, must be specified; unlike the fields above it has no fallback value.
+ if (scanData.BaseMapStr().empty()) {
+ assert("ScanData/DyeSet attribute BaseMap MUST be specified." == 0); // NOTE: compiled out under NDEBUG, in which case the empty string is written below
+ }
+ WriteBaseMap(scanData.BaseMapStr());
+ WriteNumAnalog(DEFAULT_NUMANALOG); // always the default; scanData is not consulted for this value
WriteMovieName((scanData.movieName.empty()?
- ("simulated_movie"):scanData.movieName));
+ (DEFAULT_MOVIENAME):scanData.movieName));
WriteRunCode((scanData.runCode.empty())?
- "simulated_runcode":(scanData.runCode));
+ (DEFAULT_RUNCODE):(scanData.runCode));
WritePlatformId((scanData.platformId==NoPlatform)?
(Springfield):(scanData.platformId));
+
+ WriteBindingKit(scanData.BindingKit()); // written as-is, no fallback when empty
+ WriteSequencingKit(scanData.SequencingKit());
}
-void HDFScanDataWriter::WriteFrameRate(float frameRate) {
+void HDFScanDataWriter::WriteFrameRate(const float frameRate) {
// Write /ScanData/AcqParams/FrameRate attribute.
frameRateAtom.Write(frameRate);
}
-void HDFScanDataWriter::WriteNumFrames(unsigned int numFrames) {
+void HDFScanDataWriter::WriteNumFrames(const unsigned int numFrames) {
// Write /ScanData/AcqParams/NumFrames attribute.
numFramesAtom.Write(numFrames);
}
@@ -95,23 +147,6 @@ void HDFScanDataWriter::WriteWhenStarted(const std::string whenStarted) {
whenStartedAtom.Write(whenStarted);
}
-std::string HDFScanDataWriter::BaseMapToStr(std::map<char, int> & baseMap) {
- std::string baseMapStr = ""; //4 dye channels.
- if (not baseMap.empty()) {
- baseMapStr = " ";
- map<char, int>::iterator it;
- for (it = baseMap.begin(); it != baseMap.end(); ++it){
- if (it->second > 4 or it->second < 0) {
- std::cout << "ERROR, there are more than four dye channels."
- << std::endl;
- exit(1);
- }
- baseMapStr[it->second]= it->first;
- }
- }
- return baseMapStr;
-}
-
void HDFScanDataWriter::WriteBaseMap(const std::string baseMapStr) {
//Write /ScanData/DyeSet/BaseMap attribute.
baseMapAtom.Write(baseMapStr);
@@ -123,9 +158,9 @@ void HDFScanDataWriter::WriteNumAnalog(const unsigned int numAnalog) {
}
void HDFScanDataWriter::WritePlatformId(const PlatformId id) {
- //Write /ScanData/RunInfo/Flatform attribute.
- platformIdAtom.Write(id);
+ //Write /ScanData/RunInfo/Platform attribute.
std::string name = (id == Springfield)?"Springfield":"Astro";
+ platformIdAtom.Write(id);
platformNameAtom.Write(name);
}
@@ -139,20 +174,30 @@ void HDFScanDataWriter::WriteRunCode(const std::string runCode) {
runCodeAtom.Write(runCode);
}
+// Write the binding kit string through bindingKitAtom.
+// NOTE(review): presumably the /ScanData/RunInfo/BindingKit attribute - confirm in Initialize().
+void HDFScanDataWriter::WriteBindingKit(const std::string & bindingKit) {
+ bindingKitAtom.Write(bindingKit);
+}
+
+// Write the sequencing kit string through sequencingKitAtom.
+// NOTE(review): presumably the /ScanData/RunInfo/SequencingKit attribute - confirm in Initialize().
+void HDFScanDataWriter::WriteSequencingKit(const std::string & sequencingKit) {
+ sequencingKitAtom.Write(sequencingKit);
+}
+
void HDFScanDataWriter::Close() {
// Close /ScanData/AcqParams attributes.
- whenStartedAtom.dataspace.close();
- frameRateAtom.dataspace.close();
- numFramesAtom.dataspace.close();
+ whenStartedAtom.Close();
+ frameRateAtom.Close();
+ numFramesAtom.Close();
// Close /ScanData/DyeSet attributes.
- baseMapAtom.dataspace.close();
+ baseMapAtom.Close();
// Close /ScanData/RunInfo attributes.
- movieNameAtom.dataspace.close();
- runCodeAtom.dataspace.close();
- platformIdAtom.dataspace.close();
- platformNameAtom.dataspace.close();
+ movieNameAtom.Close();
+ runCodeAtom.Close();
+ platformIdAtom.Close();
+ platformNameAtom.Close();
+ bindingKitAtom.Close();
+ sequencingKitAtom.Close();
// Close /ScanData/AcqParams|DyeSet|RunInfo.
acqParamsGroup.Close();
diff --git a/hdf/HDFScanDataWriter.hpp b/hdf/HDFScanDataWriter.hpp
index f92d8e6..136e416 100644
--- a/hdf/HDFScanDataWriter.hpp
+++ b/hdf/HDFScanDataWriter.hpp
@@ -1,3 +1,40 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
#ifndef DATA_HDF_HDF_SCAN_DATA_WRITER_H_
#define DATA_HDF_HDF_SCAN_DATA_WRITER_H_
@@ -27,6 +64,9 @@ private:
HDFAtom<std::string> movieNameAtom;
HDFAtom<std::string> runCodeAtom;
+ HDFAtom<std::string> bindingKitAtom;
+ HDFAtom<std::string> sequencingKitAtom;
+
HDFAtom<unsigned int> platformIdAtom;
HDFAtom<std::string> platformNameAtom;
@@ -45,16 +85,17 @@ public:
int Initialize(HDFGroup & _rootGroup);
- void Write(ScanData & scanData);
+ void Write(const ScanData & scanData);
- void WriteFrameRate(float frameRate);
+ void WriteFrameRate(const float frameRate);
- void WriteNumFrames(unsigned int numFrames);
+ void WriteNumFrames(const unsigned int numFrames);
void WriteWhenStarted(const std::string whenStarted);
- std::string BaseMapToStr(std::map<char, int> & baseMap);
-
+ void Close();
+
+private:
void WriteBaseMap(const std::string baseMapStr);
void WriteNumAnalog(const unsigned int numAnalog);
@@ -65,7 +106,9 @@ public:
void WriteRunCode(const std::string runCode);
- void Close();
+ void WriteBindingKit(const std::string & bindingKit);
+
+ void WriteSequencingKit(const std::string & sequencingKit);
};
#endif
diff --git a/hdf/HDFWriteBuffer.hpp b/hdf/HDFWriteBuffer.hpp
index 3600c5a..db812df 100644
--- a/hdf/HDFWriteBuffer.hpp
+++ b/hdf/HDFWriteBuffer.hpp
@@ -2,6 +2,7 @@
#define _BLASR_HDF_WRITE_BUFFER_HPP_
#include <cstddef>
+#include "utils.hpp"
template<typename T>
class HDFWriteBuffer {
@@ -20,7 +21,7 @@ public:
Free(); // Free before reusing the buffer.
bufferSize = pBufferSize;
if (bufferSize > 0) {
- writeBuffer = new T[bufferSize];
+ writeBuffer = ProtectedNew<T>(bufferSize);
}
else {
writeBuffer = NULL;
diff --git a/hdf/HDFWriterBase.cpp b/hdf/HDFWriterBase.cpp
new file mode 100644
index 0000000..856dfb1
--- /dev/null
+++ b/hdf/HDFWriterBase.cpp
@@ -0,0 +1,98 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. //
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "HDFWriterBase.hpp"
+
+/// \returns A copy of the error messages recorded so far via AddErrorMessage().
+std::vector<std::string> HDFWriterBase::Errors(void) const {
+ return errors_;
+}
+
+/// Calls parentGroup.AddGroup(childGroupName) and then initializes
+/// childGroup from parentGroup under that name.
+/// \param parentGroup     group that receives the new child.
+/// \param childGroup      handle to initialize for the new child.
+/// \param childGroupName  name of the child group.
+/// \returns true if childGroup was initialized; otherwise an error message
+///          is recorded and false is returned.
+bool HDFWriterBase::AddChildGroup(HDFGroup & parentGroup,
+                                  HDFGroup & childGroup,
+                                  const std::string & childGroupName) {
+    parentGroup.AddGroup(childGroupName);
+
+    // Initialize() reports failure by returning 0.
+    const bool initialized =
+        (childGroup.Initialize(parentGroup, childGroupName) != 0);
+    if (!initialized) {
+        FAILED_TO_CREATE_GROUP_ERROR(childGroupName);
+    }
+    return initialized;
+}
+
+/// Convenience overload: attaches a single string value by forwarding to
+/// the vector overload with a one-element vector.
+/// \returns whatever the vector overload returns.
+bool HDFWriterBase::AddAttribute(HDFData & group,
+                                 const std::string & attributeName,
+                                 const std::string & attributeValue)
+{
+    const std::vector<std::string> singleValue(1, attributeValue);
+    return this->AddAttribute(group, attributeName, singleValue);
+}
+
+/// Creates a string-vector attribute named attributeName on group.dataset,
+/// writes attributeValues into it and closes the attribute again.
+/// \returns true on success; if H5::Exception is thrown, an error message
+///          is recorded and false is returned.
+bool HDFWriterBase::AddAttribute(HDFData & group,
+                                 const std::string & attributeName,
+                                 const std::vector<std::string> & attributeValues)
+{
+    bool created = true;
+    try {
+        HDFAtom<std::vector<std::string> > atom;
+        atom.Create(group.dataset, std::string(attributeName), attributeValues);
+        atom.Close();
+    }
+    catch (H5::Exception &) {
+        FAILED_TO_CREATE_ATTRIBUTE_ERROR(attributeName);
+        created = false;
+    }
+    return created;
+}
+
+/// Appends errmsg to the list of errors returned by Errors().
+void HDFWriterBase::AddErrorMessage(const std::string & errmsg) {
+ errors_.push_back(errmsg);
+}
+
+/// Records "Failed to create group <groupName> in <filename_>" as an error.
+void HDFWriterBase::FAILED_TO_CREATE_GROUP_ERROR(const std::string & groupName) {
+    const std::string message =
+        "Failed to create group " + groupName + " in " + filename_;
+    AddErrorMessage(message);
+}
+
+/// Records "Failed to create attribute <attributeName> in <filename_>" as an error.
+void HDFWriterBase::FAILED_TO_CREATE_ATTRIBUTE_ERROR(const std::string & attributeName) {
+    const std::string message =
+        "Failed to create attribute " + attributeName + " in " + filename_;
+    AddErrorMessage(message);
+}
+
+/// Records an error stating that the parent HDF group of groupName in
+/// filename_ has not been initialized.
+void HDFWriterBase::PARENT_GROUP_NOT_INITIALIZED_ERROR(const std::string & groupName) {
+    const std::string message =
+        "Parent hdf group of " + groupName + " in file " + filename_
+        + " is not initialized.";
+    AddErrorMessage(message);
+}
diff --git a/hdf/HDFWriterBase.hpp b/hdf/HDFWriterBase.hpp
new file mode 100644
index 0000000..39b33c2
--- /dev/null
+++ b/hdf/HDFWriterBase.hpp
@@ -0,0 +1,88 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#ifndef _BLASR_HDFWRITERBASE_HPP_
+#define _BLASR_HDFWRITERBASE_HPP_
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include "HDFGroup.hpp"
+#include "HDFAtom.hpp"
+
+/// Abstract base for HDF writer classes. It stores the target file name,
+/// collects error messages, and offers helpers for adding child groups and
+/// string attributes. Derived classes must implement Close().
+class HDFWriterBase {
+public:
+    /// \param filename Target H5 filename, used in error messages.
+    HDFWriterBase(const std::string & filename)
+    : filename_(filename)
+    {}
+
+    /// Virtual because this class is a polymorphic base (Close() is pure
+    /// virtual); deleting a derived writer through a base pointer would
+    /// otherwise be undefined behaviour.
+    virtual ~HDFWriterBase() {}
+
+public:
+    /// \returns Target H5 filename.
+    std::string Filename(void) const {return filename_;}
+
+    /// \returns All error messages recorded so far.
+    std::vector<std::string> Errors(void) const;
+
+protected:
+    std::string filename_;
+    std::vector<std::string> errors_;
+
+    /// Adds childGroupName under parentGroup and initializes childGroup.
+    /// \returns false (after recording an error) if initialization fails.
+    bool AddChildGroup(HDFGroup & parentGroup,
+                       HDFGroup & childGroup,
+                       const std::string & childGroupName);
+
+    /// Adds a single-string attribute to group.
+    /// \returns false (after recording an error) on failure.
+    bool AddAttribute(HDFData & group,
+                      const std::string & attributeName,
+                      const std::string & attributeValue);
+
+    /// Adds a string-vector attribute to group.
+    /// \returns false (after recording an error) on failure.
+    bool AddAttribute(HDFData & group,
+                      const std::string & attributeName,
+                      const std::vector<std::string> & attributeValues);
+
+    /// Appends errmsg to the error list.
+    void AddErrorMessage(const std::string & errmsg);
+
+    /// Records a "failed to create group" error for groupName.
+    void FAILED_TO_CREATE_GROUP_ERROR(const std::string & groupName);
+
+    /// Records a "failed to create attribute" error for attributeName.
+    void FAILED_TO_CREATE_ATTRIBUTE_ERROR(const std::string & attributeName);
+
+    /// Records a "parent group not initialized" error for groupName.
+    void PARENT_GROUP_NOT_INITIALIZED_ERROR(const std::string & groupName);
+
+    /// Releases the HDF resources owned by the derived writer.
+    virtual void Close(void) = 0;
+};
+
+#endif
diff --git a/hdf/HDFZMWMetricsWriter.cpp b/hdf/HDFZMWMetricsWriter.cpp
new file mode 100644
index 0000000..f05a0b6
--- /dev/null
+++ b/hdf/HDFZMWMetricsWriter.cpp
@@ -0,0 +1,142 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "HDFZMWMetricsWriter.hpp"
+#include "reads/ScanData.hpp"
+
+/// Creates the ZMWMetrics group under parentGroup and initializes its child
+/// datasets. If parentGroup is not initialized, only an error is recorded.
+/// \param filename    target file name, used in error messages.
+/// \param parentGroup group that will contain ZMWMetrics.
+/// \param baseMap     maps each base to its SNR column index.
+HDFZMWMetricsWriter::HDFZMWMetricsWriter(const std::string & filename,
+        HDFGroup & parentGroup, const std::map<char, size_t> & baseMap)
+    : HDFWriterBase(filename)
+    , parentGroup_(parentGroup)
+    , baseMap_(baseMap)
+    , curRow_(0)
+{
+    if (parentGroup.groupIsInitialized) {
+        parentGroup_.AddGroup(PacBio::GroupNames::zmwmetrics);
+
+        const bool groupCreated =
+            zmwMetricsGroup_.Initialize(parentGroup_, PacBio::GroupNames::zmwmetrics) != 0;
+        if (!groupCreated) {
+            FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::zmwmetrics);
+        }
+
+        InitializeChildHDFGroups();
+    } else {
+        PARENT_GROUP_NOT_INITIALIZED_ERROR(PacBio::GroupNames::zmwmetrics);
+    }
+
+    // Sanity Check BaseMap
+    assert(ScanData::IsValidBaseMap(baseMap));
+}
+
+/// Flushes buffered rows, writes the dataset attributes when at least one
+/// row was written, and closes all HDF handles.
+HDFZMWMetricsWriter::~HDFZMWMetricsWriter() {
+    Flush(); // Must flush in case group is empty.
+
+    // WriteAttributes() must not live inside assert(): with NDEBUG the call
+    // would be compiled out and the attributes never written, and without
+    // NDEBUG an empty group (WriteAttributes() returns false when curRow_ is
+    // 0) would abort the process from a destructor.
+    if (curRow_ > 0) {
+        WriteAttributes(); // failures are recorded via AddErrorMessage()
+    }
+    Close();
+}
+
+/// Appends one row for read: the HQ-region SNR of A, C, G and T (placed in
+/// the column given by baseMap_), the read score, and the hole status (into
+/// the Productivity dataset).
+/// \returns true on success; false after recording an error when baseMap_
+///          lacks a base or maps it outside [0, SNRNCOLS), or when the HDF
+///          layer throws H5::Exception.
+bool HDFZMWMetricsWriter::WriteOneZmw(const SMRTSequence & read) {
+    try {
+        float snrs[SNRNCOLS] = {};
+        for (char base: {'A', 'C', 'G', 'T'}) {
+            // Use find() rather than operator[]: operator[] would silently
+            // insert a missing base with channel 0, and an unchecked channel
+            // could index past the end of snrs when the constructor's
+            // assert is compiled out.
+            const auto channel = baseMap_.find(base);
+            if (channel == baseMap_.end() or
+                channel->second >= static_cast<size_t>(SNRNCOLS)) {
+                AddErrorMessage("Failed to write HQRegionSNR: BaseMap does not map A, C, G, T to valid channels.");
+                return false;
+            }
+            snrs[channel->second] = read.HQRegionSnr(base);
+        }
+        hqRegionSNRArray_.WriteRow(snrs, SNRNCOLS);
+        readScoreArray_.Write(&read.readScore, 1);
+        productivityArray_.Write(&read.zmwData.holeStatus, 1);
+    }
+    catch (H5::Exception & e) {
+        AddErrorMessage("Failed to write HQRegionSNR or ReadScore or Productivity.");
+        return false;
+    }
+    ++curRow_;
+
+    return true;
+}
+
+/// Flushes the HQRegionSNR, ReadScore and Productivity buffers.
+void HDFZMWMetricsWriter::Flush(void) {
+ hqRegionSNRArray_.Flush();
+ readScoreArray_.Flush();
+ productivityArray_.Flush();
+}
+
+/// Closes the three child datasets first, then the ZMWMetrics group itself.
+/// Does not flush; the destructor calls Flush() before Close().
+void HDFZMWMetricsWriter::Close(void) {
+ hqRegionSNRArray_.Close();
+ readScoreArray_.Close();
+ productivityArray_.Close();
+
+ zmwMetricsGroup_.Close();
+}
+
+/// Initializes the HQRegionSNR (SNRNCOLS columns), ReadScore and
+/// Productivity datasets under the ZMWMetrics group. All three are
+/// attempted even if an earlier one fails; each failure records an error.
+/// \returns true only if every dataset was initialized.
+bool HDFZMWMetricsWriter::InitializeChildHDFGroups(void) {
+    const bool snrOK =
+        hqRegionSNRArray_.Initialize(zmwMetricsGroup_, PacBio::GroupNames::hqregionsnr, SNRNCOLS) != 0;
+    if (!snrOK) {
+        FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::hqregionsnr);
+    }
+
+    const bool readScoreOK =
+        readScoreArray_.Initialize(zmwMetricsGroup_, PacBio::GroupNames::readscore) != 0;
+    if (!readScoreOK) {
+        FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::readscore);
+    }
+
+    const bool productivityOK =
+        productivityArray_.Initialize(zmwMetricsGroup_, PacBio::GroupNames::productivity) != 0;
+    if (!productivityOK) {
+        FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::productivity);
+    }
+
+    return snrOK and readScoreOK and productivityOK;
+}
+
+/// Adds the description attribute to HQRegionSNR, ReadScore and
+/// Productivity, stopping at the first failure.
+/// \returns false (after recording an error) when no row has been written,
+///          or when any attribute could not be added; true otherwise.
+bool HDFZMWMetricsWriter::WriteAttributes(void) {
+    if (curRow_ <= 0) {
+        AddErrorMessage("Could not write attributes when ZMWMetrics group is empty.");
+        return false;
+    }
+
+    if (not AddAttribute(hqRegionSNRArray_,
+                         PacBio::AttributeNames::Common::description,
+                         PacBio::AttributeValues::ZMWMetrics::HQRegionSNR::description)) {
+        return false;
+    }
+
+    if (not AddAttribute(readScoreArray_,
+                         PacBio::AttributeNames::Common::description,
+                         PacBio::AttributeValues::ZMWMetrics::ReadScore::description)) {
+        return false;
+    }
+
+    return AddAttribute(productivityArray_,
+                        PacBio::AttributeNames::Common::description,
+                        PacBio::AttributeValues::ZMWMetrics::Productivity::description);
+}
diff --git a/hdf/HDFZMWMetricsWriter.hpp b/hdf/HDFZMWMetricsWriter.hpp
new file mode 100644
index 0000000..61234e2
--- /dev/null
+++ b/hdf/HDFZMWMetricsWriter.hpp
@@ -0,0 +1,117 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+
+#ifndef _BLASR_HDF_HDFZMWMETRICSWriter_HPP_
+#define _BLASR_HDF_HDFZMWMETRICSWriter_HPP_
+
+#include "SMRTSequence.hpp"
+#include "HDFWriterBase.hpp"
+#include "BufferedHDFArray.hpp"
+#include "BufferedHDF2DArray.hpp"
+
+
+class HDFBaseCallerWriter;
+
+/// Writer for the ZMWMetrics group: one row per ZMW in each of the
+/// HQRegionSNR, ReadScore and Productivity datasets.
+class HDFZMWMetricsWriter: public HDFWriterBase {
+
+friend class HDFBaseCallerWriter;
+private:
+ /// \name Private variables
+ /// \{
+ /// Group under which the ZMWMetrics group is created (held by reference).
+ HDFGroup & parentGroup_;
+
+ /// The ZMWMetrics group itself.
+ HDFGroup zmwMetricsGroup_;
+
+ /// HDF2DArray for writing average SNR within HQRegion.
+ BufferedHDF2DArray<float> hqRegionSNRArray_;
+
+ /// HDFArray for writing read raw accuracy prediction.
+ BufferedHDFArray<float> readScoreArray_;
+
+ /// HDFArray for writing Productivity
+ BufferedHDFArray<unsigned char> productivityArray_;
+
+ /// Map bases (e.g., ACGT) to indices
+ std::map<char, size_t> baseMap_;
+
+ /// Number of ZMWs successfully written so far.
+ int curRow_;
+
+ /// Number of columns in HQRegionSNR (one per base).
+ static const int SNRNCOLS = 4;
+ /// \}
+
+public:
+ /// \name Constructors and Destructors
+ /// \{
+ HDFZMWMetricsWriter(const std::string & filename,
+ HDFGroup & parentGroup,
+ const std::map<char, size_t> & baseMap);
+
+ ~HDFZMWMetricsWriter(void) ;
+ /// \}
+
+ /// \name Public Methods
+ /// \{
+
+ /// \note Write info of a SMRTSequence to ZMWMetrics,
+ /// (1) add average signal to noise ratio in HQRegion to HQRegionSNR
+ /// (2) add read raw accuracy prediction to ReadScore
+ /// (3) add hole status to Productivity
+ /// \returns bool, whether or not the ZMW was written successfully.
+ bool WriteOneZmw(const SMRTSequence & read);
+
+
+ /// \note Flushes all data from cache to disc.
+ void Flush(void);
+
+ /// \note Closes this zmw group as well as child hdf groups.
+ void Close(void);
+ /// \}
+
+private:
+ /// \name Private Methods
+ /// \{
+
+ /// \note Initialize child hdf groups under ZMWMetrics, including
+ /// HQRegionSNR, ReadScore and Productivity
+ /// \returns bool, whether or not child hdf groups successfully initialized.
+ bool InitializeChildHDFGroups(void);
+
+ /// \note Write Attributes.
+ /// \returns bool, false if no ZMW has been written or an attribute failed.
+ bool WriteAttributes(void);
+ /// \}
+};
+
+#endif
diff --git a/hdf/HDFZMWWriter.cpp b/hdf/HDFZMWWriter.cpp
new file mode 100644
index 0000000..447d832
--- /dev/null
+++ b/hdf/HDFZMWWriter.cpp
@@ -0,0 +1,144 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "HDFZMWWriter.hpp"
+
+/// Creates the ZMW group under parentGroup and initializes its child
+/// datasets. If parentGroup is not initialized, only an error is recorded.
+/// \param filename    target file name, used in error messages.
+/// \param parentGroup group that will contain ZMW.
+/// \param hasHoleXY   whether the HoleXY dataset should be written.
+HDFZMWWriter::HDFZMWWriter(const std::string & filename,
+                           HDFGroup & parentGroup,
+                           bool hasHoleXY)
+    : HDFWriterBase(filename)
+    , parentGroup_(parentGroup)
+    , hasHoleXY_(hasHoleXY)
+{
+    if (parentGroup.groupIsInitialized) {
+        parentGroup_.AddGroup(PacBio::GroupNames::zmw);
+
+        const bool groupCreated =
+            zmwGroup_.Initialize(parentGroup_, PacBio::GroupNames::zmw) != 0;
+        if (!groupCreated) {
+            FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::zmw);
+        }
+
+        this->InitializeChildHDFGroups();
+    } else {
+        PARENT_GROUP_NOT_INITIALIZED_ERROR(PacBio::GroupNames::zmw);
+    }
+}
+
+/// Writes the dataset attributes, then calls Close(), which flushes and
+/// closes every dataset and the ZMW group.
+HDFZMWWriter::~HDFZMWWriter() {
+ this->_WriteAttributes();
+ this->Close();
+}
+
+/// Appends one entry for read to NumEvent (sequence length), HoleNumber and
+/// HoleStatus, plus an (x, y) row to HoleXY when that dataset is enabled.
+/// \returns always true.
+bool HDFZMWWriter::WriteOneZmw(const SMRTSequence & read) {
+    int numEvents = static_cast<int> (read.length);
+    numEventArray_.Write(&numEvents, 1);
+
+    UInt holeNumber = read.HoleNumber();
+    holeNumberArray_.Write(&holeNumber, 1);
+
+    unsigned char holeStatus = read.HoleStatus();
+    holeStatusArray_.Write(&holeStatus, 1);
+
+    if (not HasHoleXY()) {
+        return true;
+    }
+
+    int16_t coordinates[2];
+    coordinates[0] = static_cast<int16_t>(read.HoleX());
+    coordinates[1] = static_cast<int16_t>(read.HoleY());
+    holeXYArray_.WriteRow(coordinates, 2);
+    return true;
+}
+
+/// Flushes the NumEvent, HoleNumber and HoleStatus buffers, and the HoleXY
+/// buffer when that dataset is enabled.
+void HDFZMWWriter::Flush(void) {
+ numEventArray_.Flush();
+ holeNumberArray_.Flush();
+ holeStatusArray_.Flush();
+ if (HasHoleXY())
+ holeXYArray_.Flush();
+}
+
+/// Flushes pending data, closes each child dataset (HoleXY only when
+/// enabled) and finally closes the ZMW group.
+void HDFZMWWriter::Close(void) {
+ this->Flush();
+
+ numEventArray_.Close();
+ holeNumberArray_.Close();
+ holeStatusArray_.Close();
+ if (HasHoleXY())
+ holeXYArray_.Close();
+ zmwGroup_.Close();
+}
+
+/// Initializes NumEvent, HoleNumber and HoleStatus under the ZMW group, and
+/// the two-column HoleXY dataset when enabled. Every dataset is attempted
+/// even if an earlier one fails; each failure records an error.
+/// \returns true only if every attempted dataset was initialized.
+bool HDFZMWWriter::InitializeChildHDFGroups(void) {
+    const bool numEventOK =
+        numEventArray_.Initialize(zmwGroup_, PacBio::GroupNames::numevent) != 0;
+    if (!numEventOK) {
+        FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::numevent);
+    }
+
+    const bool holeNumberOK =
+        holeNumberArray_.Initialize(zmwGroup_, PacBio::GroupNames::holenumber) != 0;
+    if (!holeNumberOK) {
+        FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::holenumber);
+    }
+
+    const bool holeStatusOK =
+        holeStatusArray_.Initialize(zmwGroup_, PacBio::GroupNames::holestatus) != 0;
+    if (!holeStatusOK) {
+        FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::holestatus);
+    }
+
+    // HoleXY is optional; when disabled it does not affect the result.
+    bool holeXYOK = true;
+    if (HasHoleXY()) {
+        holeXYOK =
+            holeXYArray_.Initialize(zmwGroup_, PacBio::GroupNames::holexy, 2) != 0;
+        if (!holeXYOK) {
+            FAILED_TO_CREATE_GROUP_ERROR(PacBio::GroupNames::holexy);
+        }
+    }
+
+    return numEventOK and holeNumberOK and holeStatusOK and holeXYOK;
+}
+
+/// Adds description attributes to HoleNumber and HoleStatus (only when the
+/// dataset is initialized and non-empty), the HoleStatus lookup table, and
+/// the HoleXY description (whenever HoleXY is initialized). Results of
+/// AddAttribute are not checked here; failures are recorded as errors.
+void HDFZMWWriter::_WriteAttributes(void)
+{
+    namespace Names = PacBio::AttributeNames;
+    namespace Values = PacBio::AttributeValues::ZMW;
+
+    if (holeNumberArray_.IsInitialized() and holeNumberArray_.size() > 0) {
+        AddAttribute(holeNumberArray_,
+                     Names::Common::description,
+                     Values::HoleNumber::description);
+    }
+
+    if (holeStatusArray_.IsInitialized() and holeStatusArray_.size() > 0) {
+        AddAttribute(holeStatusArray_,
+                     Names::Common::description,
+                     Values::HoleStatus::description);
+        AddAttribute(holeStatusArray_,
+                     Names::ZMW::HoleStatus::lookuptable,
+                     Values::HoleStatus::lookuptable);
+    }
+
+    if (holeXYArray_.IsInitialized()) {
+        AddAttribute(holeXYArray_,
+                     Names::Common::description,
+                     Values::HoleXY::description);
+    }
+}
+
diff --git a/hdf/HDFZMWWriter.hpp b/hdf/HDFZMWWriter.hpp
new file mode 100644
index 0000000..ea2d4f1
--- /dev/null
+++ b/hdf/HDFZMWWriter.hpp
@@ -0,0 +1,120 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+
+#ifndef _BLASR_HDF_HDFZMWWriter_HPP_
+#define _BLASR_HDF_HDFZMWWriter_HPP_
+
+#include "HDFWriterBase.hpp"
+#include "BufferedHDFArray.hpp"
+#include "BufferedHDF2DArray.hpp"
+#include "SMRTSequence.hpp"
+
+class HDFBaseCallerWriter;
+
+/// Writer for the ZMW group: one entry per ZMW in NumEvent, HoleNumber and
+/// HoleStatus, and optionally an (x, y) row in HoleXY.
+class HDFZMWWriter: public HDFWriterBase {
+
+friend class HDFBaseCallerWriter;
+
+private:
+ /// ZMW/NumEvent
+ BufferedHDFArray<int> numEventArray_;
+
+ /// ZMW/HoleNumber
+ BufferedHDFArray<unsigned int> holeNumberArray_;
+
+ /// ZMW/HoleStatus
+ BufferedHDFArray<unsigned char> holeStatusArray_;
+
+ /// ZMW/HoleXY
+ BufferedHDF2DArray<int16_t> holeXYArray_;
+
+private:
+ /// The ZMW group itself.
+ HDFGroup zmwGroup_;
+ /// Group under which the ZMW group is created (held by reference).
+ HDFGroup & parentGroup_;
+ /// Whether the HoleXY dataset is written.
+ bool hasHoleXY_;
+
+public:
+ /// \name Constructors and Destructors
+ /// \{
+ HDFZMWWriter(const std::string & filename,
+ HDFGroup & parentGroup,
+ bool hasHoleXY = true);
+
+ ~HDFZMWWriter() ;
+ /// \}
+
+ /// \name Public Methods
+ /// \{
+
+ /// \note Write info of a SMRTSequence to ZMW,
+ /// (1) add length (int) of the sequence to NumEvent,
+ /// (2) add zmw hole number (UInt) of the sequence as a UInt to HoleNumber,
+ /// (3) add hole status (unsigned char) to HoleStatus,
+ /// (4) add hole coordinate xy as (int16_t, int16_t) to HoleXY
+ bool WriteOneZmw(const SMRTSequence & read);
+
+ /// \returns Whether or not ZMW contains the HoleXY dataset.
+ inline bool HasHoleXY(void) const;
+
+ /// \note Flushes all data from cache to disc.
+ void Flush(void);
+
+ /// \note Closes this zmw group as well as child hdf groups.
+ void Close(void);
+
+ /// \}
+
+private:
+ /// \name Private Methods
+ /// \{
+
+ /// \note Initialize child hdf groups under ZMW, including
+ /// NumEvent, HoleNumber, HoleStatus, HoleXY
+ /// \returns bool, whether or not child hdf groups successfully initialized.
+ bool InitializeChildHDFGroups(void);
+
+ /// \note Add attributes to HoleNumber, HoleXY, HoleStatus.
+ void _WriteAttributes(void);
+
+ /// \}
+};
+
+/// \returns the hasHoleXY flag passed to the constructor.
+inline bool HDFZMWWriter::HasHoleXY(void) const
+{return hasHoleXY_;}
+
+#endif
diff --git a/hdf/Makefile b/hdf/Makefile
deleted file mode 100644
index 25247e3..0000000
--- a/hdf/Makefile
+++ /dev/null
@@ -1,94 +0,0 @@
-
-include ../common.mk
-
-# To enable building a shared library, invoke as "make SHARED_LIB=true ..."
-ifneq ($(SHARED_LIB),)
- # Generating shared library
- CXX_SHAREDFLAGS := -fPIC
- LD_SHAREDFLAGS := -shared -fPIC
- TARGET_LIB := libpbihdf.so
- # Developers should set these to appropriate defaults (other systems
- # will override these on the command line):
- HDF5_LIB := ../../../../prebuilt.out/prebuilt.out/hdf5/hdf5-1.8.12/centos-5/lib/libhdf5.so
- ZLIB_LIB := ../../../../prebuilt.tmpsrc/zlib/zlib_1.2.8/_output/install/lib/libz.so
- HTSLIB_LIB := ../../../staging/PostPrimary/pbbam/_output/install-build/lib/libpbbam.so
- PBBAM_LIB := ../../../staging/PostPrimary/pbbam/third-party/htslib/_output/install-build/lib/libhts.so
- LIBPBDATA_LIB := ../../../staging/PostPrimary/pbbam/third-party/htslib/_output/install-build/lib/libhts.so
-else
- # Generating shared library
- CXX_SHAREDFLAGS :=
- LD_SHAREDFLAGS :=
- TARGET_LIB := libpbihdf.a
- HDF5_LIB :=
- ZLIB_LIB :=
- HTSLIB_LIB :=
- PBBAM_LIB :=
- LIBPBDATA_LIB :=
-endif
-
-DEP_LIBS := $(HDF5_LIB) $(ZLIB_LIB) $(HTSLIB_LIB) $(PBBAM_LIB) $(PBDATA_LIB)
-
-# FIXME: remove PBDATA_INCLUDE and assign directly to LIBPBDATA_INCLUDE.
-# Use only LIBPBDATA_INCLUDE to be consistent with the libblasr Makefile
-PBDATA_INCLUDE := ../pbdata
-LIBPBDATA_INCLUDE := $(PBDATA_INCLUDE)
-PBBAM_INCLUDE := $(PBBAM)/include
-HTSLIB_INCLUDE ?= $(PBBAM)/third-party/htslib
-
-INCLUDES = -I$(LIBPBDATA_INCLUDE)
-
-
-ifeq ($(origin nopbbam), undefined)
- INCLUDES += -I$(PBBAM_INCLUDE) -I$(HTSLIB_INCLUDE) -I$(BOOST_INCLUDE)
-endif
-
-ifneq ($(ZLIB_ROOT), notfound)
- INCLUDES += -I$(ZLIB_ROOT)/include
-endif
-
-CXXOPTS := -std=c++11 -pedantic -MMD -MP
-sources := $(wildcard *.cpp)
-objects := $(sources:.cpp=.o)
-shared_objects := $(sources:.cpp=.shared.o)
-dependencies := $(objects:.o=.d) $(shared_objects:.o=.d)
-
-ifneq ($(HDF5_INC),)
-HDF_HEADERS :=
-INCLUDES += -I$(HDF5_INC)
-else
-HDF_HEADERS := hdf5-1.8.12-headers
-INCLUDES += -I./$(HDF_HEADERS)/src -I./$(HDF_HEADERS)/c++/src
-endif
-
-all : CXXFLAGS ?= -O3
-
-debug : CXXFLAGS ?= -g -ggdb -fno-inline
-
-profile : CXXFLAGS ?= -Os -pg
-
-g : CXXFLAGS ?= -g -ggdb -fno-inline -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer
-
-all debug profile g: $(HDF_HEADERS) $(TARGET_LIB)
-
-libpbihdf.a: $(objects)
- $(AR_pp) $(ARFLAGS) $@ $^
-
-libpbihdf.so: $(shared_objects) $(DEP_LIBS)
- $(CXX) $(LD_SHAREDFLAGS) -o $@ $^
-
-%.o: %.cpp
- $(CXX) $(CXXOPTS) $(CXXFLAGS) $(INCLUDES) -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.o) $(@:%.o=%.d)" -c $< -o $@
-
-%.shared.o: %.cpp
- $(CXX) $(CXX_SHAREDFLAGS) $(CXXOPTS) $(CXXFLAGS) $(INCLUDES) -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.o) $(@:%.o=%.d)" -c $< -o $@
-
-$(HDF_HEADERS):
- curl -k -L https://www.dropbox.com/s/8971bcyy5o42rxb/hdf5-1.8.12-headers.tar.bz2\?dl\=0 | tar xjf -
-
-# .INTERMEDIATE: $(objects)
-
-clean:
- @rm -f libpbihdf.a libpbihdf.so
- @rm -f $(objects) $(shared_objects) $(dependencies)
-
--include $(dependencies)
diff --git a/hdf/build.mk b/hdf/build.mk
new file mode 120000
index 0000000..2247f36
--- /dev/null
+++ b/hdf/build.mk
@@ -0,0 +1 @@
+makefile
\ No newline at end of file
diff --git a/hdf/makefile b/hdf/makefile
new file mode 100644
index 0000000..8bf9ad5
--- /dev/null
+++ b/hdf/makefile
@@ -0,0 +1,33 @@
+all:
+
+THISDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+-include ${CURDIR}/defines.mk
+include ${THISDIR}/../rules.mk
+
+CXXOPTS += -std=c++11 -pedantic
+INCLUDES += ${LIBPBDATA_INC} ${HDF5_INC} ${PBBAM_INC} ${HTSLIB_INC} ${BOOST_INC}
+LIBS += ${LIBPBDATA_LIB} ${HDF5_LIB} ${PBBAM_LIB} ${HTSLIB_LIB} ${ZLIB_LIB}
+LDFLAGS += $(patsubst %,-L%,${LIBS})
+LDLIBS += -lpbdata -lhdf5 -lhdf5_cpp
+
+all: libpbihdf.a libpbihdf${SH_LIB_EXT}
+
+paths := ${THISDIR}
+sources := $(wildcard ${THISDIR}*.cpp)
+sources := $(notdir ${sources})
+objects := $(sources:.cpp=.o)
+shared_objects := $(sources:.cpp=.shared.o)
+dependencies := $(objects:.o=.d) $(shared_objects:.o=.d)
+
+vpath %.cpp ${paths}
+
+libpbihdf.a: $(objects)
+ $(AR) $(ARFLAGS) $@ $^
+
+libpbihdf${SH_LIB_EXT}: $(shared_objects)
+
+clean:
+ rm -f libpbihdf.a libpbihdf${SH_LIB_EXT} libpbihdf.so *.o *.d
+
+-include $(dependencies)
+depend: $(dependencies:.d=.depend)
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..c9d37af
--- /dev/null
+++ b/makefile
@@ -0,0 +1,31 @@
+SHELL=/bin/bash
+
+THISDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+
+.PHONY: all all-debug all-opt all-depend libpbdata libpbihdf libblasr gtest clean cleanall
+
+all:
+ ${MAKE} libpbdata
+ ${MAKE} libpbihdf
+ ${MAKE} libblasr
+all-debug:
+ ${MAKE} CXXFLAGS=-g all
+all-opt:
+ ${MAKE} CXXFLAGS=-O3 all
+all-depend:
+ ${MAKE} -C ${THISDIR}/pbdata depend
+libpbdata:
+ ${MAKE} -C ${THISDIR}/pbdata libconfig.h
+ ${MAKE} -C ${THISDIR}/pbdata all
+libpbihdf:
+ ${MAKE} -C ${THISDIR}/hdf all
+libblasr:
+ ${MAKE} -C ${THISDIR}/alignment all
+gtest:
+ ${MAKE} -C ${THISDIR}/unittest gtest
+clean:
+ ${MAKE} -C ${THISDIR}/pbdata clean
+ ${MAKE} -C ${THISDIR}/hdf clean
+ ${MAKE} -C ${THISDIR}/alignment clean
+ ${MAKE} -C ${THISDIR}/unittest clean
+cleanall: clean
diff --git a/pbdata/.gitignore b/pbdata/.gitignore
new file mode 100644
index 0000000..4c3cf0b
--- /dev/null
+++ b/pbdata/.gitignore
@@ -0,0 +1,2 @@
+/libconfig.h
+/defines.mk
diff --git a/pbdata/CCSSequence.cpp b/pbdata/CCSSequence.cpp
index 1f8e5b9..85239db 100644
--- a/pbdata/CCSSequence.cpp
+++ b/pbdata/CCSSequence.cpp
@@ -5,22 +5,22 @@ void CCSSequence::Free() {
numConsensusBases = 0;
SMRTSequence::Free();
unrolledRead.Free();
- /*
- ClearMemory(passStartPulse);
- ClearMemory(passNumPulses);
- ClearMemory(passStartBase);
- ClearMemory(passNumBases);
- ClearMemory(passDirection);
- ClearMemory(adapterHitBefore);
- ClearMemory(adapterHitAfter);
- ClearMemory(adapterHitConfidence);
- */
}
int CCSSequence::GetStorageSize() {
return SMRTSequence::GetStorageSize() + unrolledRead.GetStorageSize();
}
+UInt CCSSequence::HoleNumber(void) const {
+ return SMRTSequence::HoleNumber();
+}
+
+CCSSequence & CCSSequence::HoleNumber(const UInt holeNumber) {
+ SMRTSequence::HoleNumber(holeNumber);
+ unrolledRead.HoleNumber(holeNumber);
+ return *this;
+}
+
//
// In the first iteration, Explode simply pulls the subreads out
// that are used in the ccs. Eventually, it will pull out all
@@ -31,5 +31,6 @@ void CCSSequence::Explode(std::vector<SMRTSequence> &subreads) {
int subreadIndex;
for (subreadIndex = 0; subreadIndex < numPasses; subreadIndex++) {
subreads[subreadIndex].ReferenceSubstring(this->unrolledRead, passStartBase[subreadIndex], passNumBases[subreadIndex]);
+ subreads[subreadIndex].zmwData = unrolledRead.zmwData;
}
}
diff --git a/pbdata/CCSSequence.hpp b/pbdata/CCSSequence.hpp
index 15e89d6..a415aca 100644
--- a/pbdata/CCSSequence.hpp
+++ b/pbdata/CCSSequence.hpp
@@ -5,11 +5,12 @@
#include "SMRTSequence.hpp"
#include "VectorUtils.hpp"
+
//
// A CCS Sequence is both a SMRTSequence itself, and contains a list of SMRTSequences.
//
class CCSSequence : public SMRTSequence {
- public:
+public:
UInt numPasses;
UInt numConsensusBases;
std::vector<DNALength> passStartPulse, passNumPulses, passStartBase, passNumBases;
@@ -23,17 +24,24 @@ class CCSSequence : public SMRTSequence {
//
SMRTSequence unrolledRead;
+public:
inline ~CCSSequence();
void Free();
+ UInt HoleNumber(void) const;
+
+ CCSSequence & HoleNumber(const UInt holeNumber);
+
int GetStorageSize();
- //
- //
- // In the first iteration, Explode simply pulls the subreads out
- // that are used in the ccs. Eventually, it will pull out all
- // high-quality subreads.
- //
+
+ /// \name
+ /// \{
+ /// In the first iteration, Explode simply pulls the subreads out
+ /// that are used in the ccs. Eventually, it will pull out all
+ /// high-quality subreads.
+ ///
void Explode(std::vector<SMRTSequence> &subreads);
+ /// \}
};
inline CCSSequence::~CCSSequence() {
diff --git a/pbdata/CompressedDNASequence.hpp b/pbdata/CompressedDNASequence.hpp
index a3fa6d1..2613f07 100644
--- a/pbdata/CompressedDNASequence.hpp
+++ b/pbdata/CompressedDNASequence.hpp
@@ -29,7 +29,7 @@ class CompressedDNASequence: public DNASequence {
CompressedDNASequence() {
const char t[] = "Compressed sequence\0";
titleLength = strlen(t);
- title = new char[titleLength+1];
+ title = ProtectedNew<char>(titleLength+1);
strcpy(title, t);
title[titleLength] = '\0';
}
@@ -62,13 +62,13 @@ class CompressedDNASequence: public DNASequence {
}
void Copy(FASTASequence &rhs) {
- seq = new CompressedNucleotide[rhs.length];
+ seq = ProtectedNew<CompressedNucleotide>(rhs.length);
memcpy(seq, rhs.seq, rhs.length);
length = rhs.length;
if (title != NULL) {
delete[] title;
}
- title = new char[rhs.titleLength+1];
+ title = ProtectedNew<char>(rhs.titleLength+1);
memcpy(title, rhs.title, rhs.titleLength);
titleLength = rhs.titleLength;
title[titleLength] = '\0';
diff --git a/pbdata/CompressedSequenceImpl.hpp b/pbdata/CompressedSequenceImpl.hpp
index 7759541..2144bcd 100644
--- a/pbdata/CompressedSequenceImpl.hpp
+++ b/pbdata/CompressedSequenceImpl.hpp
@@ -1,5 +1,6 @@
#ifndef _BLASR_COMPRESSED_SEQUENCES_IMPL_HPP_
#define _BLASR_COMPRESSED_SEQUENCES_IMPL_HPP_
+#include "utils.hpp"
template<typename T_Sequence>
void CompressedSequence<T_Sequence>::CopyConfiguration(CompressedSequence<T_Sequence> &rhs) {
@@ -54,13 +55,13 @@ char* CompressedSequence<T_Sequence>::GetName() {
template<typename T_Sequence>
void CompressedSequence<T_Sequence>::Copy(FASTASequence &rhs) {
- seq = new CompressedNucleotide[rhs.length];
+ seq = ProtectedNew<CompressedNucleotide>(rhs.length);
memcpy(seq, rhs.seq, rhs.length);
length = rhs.length;
if (title != NULL) {
delete[] title;
}
- title = new char[rhs.titleLength+1];
+ title = ProtectedNew<char>(rhs.titleLength+1);
memcpy(title, rhs.title, rhs.titleLength);
titleLength = rhs.titleLength;
title[titleLength] = '\0';
@@ -140,14 +141,14 @@ void CompressedSequence<T_Sequence>::Read(std::string inFileName) {
if (hasTitle) {
int inTitleLength;
in.read((char*) &inTitleLength, sizeof(int));
- char * inTitle = new char[inTitleLength+1];
+ char * inTitle = ProtectedNew<char>(inTitleLength+1);
in.read((char*) inTitle, inTitleLength);
inTitle[titleLength] = '\0';
CopyTitle(inTitle, inTitleLength);
delete [] inTitle;
}
in.read((char*) &length, sizeof(DNALength));
- seq = new Nucleotide[length];
+ seq = ProtectedNew<Nucleotide>(length);
in.read((char*) seq, length * sizeof(Nucleotide));
if (hasIndex) {
index.Read(in);
@@ -190,7 +191,7 @@ int CompressedSequence<T_Sequence>::BuildReverseIndex(int maxRun, int binSize) {
//
index.Free();
index.indexLength = hpi/index.binSize + 1;
- index.index = new int[index.indexLength];
+ index.index = ProtectedNew<int>(index.indexLength);
hpi = 0;
int ii = 0;
for (i = 0; i < length; i++) {
@@ -307,7 +308,7 @@ DNALength CompressedSequence<T_Sequence>::FourBitDecompressHomopolymers(int star
count >>= 4;
decompSeq.length += count;
}
- decompSeq.seq = new Nucleotide[decompSeq.length];
+ decompSeq.seq = ProtectedNew<Nucleotide>(decompSeq.length);
//
// Now store the actual decompressed seq.
diff --git a/pbdata/DNASequence.cpp b/pbdata/DNASequence.cpp
index cec3fab..eef5f0a 100644
--- a/pbdata/DNASequence.cpp
+++ b/pbdata/DNASequence.cpp
@@ -33,7 +33,7 @@ void DNASequence::Append(const DNASequence &rhs, DNALength appendPos) {
//
if (appendPos == 0) {
DNALength newSeqLength = length + rhs.length;
- newSeq = new Nucleotide[newSeqLength];
+ newSeq = ProtectedNew<Nucleotide>(newSeqLength);
memcpy(newSeq, seq, length);
memcpy(&newSeq[length], rhs.seq, rhs.length);
@@ -53,7 +53,7 @@ void DNASequence::Append(const DNASequence &rhs, DNALength appendPos) {
length = appendPos;
DNALength newSeqLength;
newSeqLength = length + rhs.length;
- newSeq = new Nucleotide[newSeqLength];
+ newSeq = ProtectedNew<Nucleotide>(newSeqLength);
memcpy(newSeq, seq, length);
memcpy(&newSeq[length], rhs.seq, rhs.length);
if (deleteOnExit and lengthCopy != 0) {
@@ -114,7 +114,7 @@ DNASequence& DNASequence::Copy(const DNASequence &rhs, DNALength rhsPos, DNALeng
seq = NULL;
}
else {
- seq = new Nucleotide [rhsLength];
+ seq = ProtectedNew<Nucleotide>(rhsLength);
memcpy(seq, &rhs.seq[rhsPos], rhsLength);
}
length = rhsLength;
@@ -139,7 +139,7 @@ void DNASequence::ShallowCopy(const DNASequence &rhs) {
deleteOnExit = false;
}
-int DNASequence::GetStorageSize() {
+int DNASequence::GetStorageSize() const {
return (length * sizeof(Nucleotide));
}
@@ -151,11 +151,11 @@ DNASequence &DNASequence::operator=(const DNASequence &rhs){
//
// synonym for printseq
//
-void DNASequence::Print(std::ostream &out, int lineLength) {
+void DNASequence::Print(std::ostream &out, int lineLength) const {
PrintSeq(out, lineLength);
}
-void DNASequence::PrintSeq(std::ostream &out, int lineLength) {
+void DNASequence::PrintSeq(std::ostream &out, int lineLength) const {
if (lineLength == 0) {
std::string line;
line.assign((char*)seq, length);
@@ -181,8 +181,7 @@ void DNASequence::PrintSeq(std::ostream &out, int lineLength) {
void DNASequence::Allocate(DNALength plength) {
DNASequence::Free();
-
- seq = new Nucleotide [plength];
+ seq = ProtectedNew<Nucleotide> (plength);
length = plength;
deleteOnExit = true;
}
@@ -266,12 +265,12 @@ void DNASequence::Assign(DNASequence &ref, DNALength start, DNALength plength) {
if (plength) {
length = plength;
- seq = new Nucleotide[length];
+ seq = ProtectedNew<Nucleotide> (length);
memcpy(seq, &ref.seq[start], length);
}
else if (start) {
length = ref.length - start;
- seq = new Nucleotide[length];
+ seq = ProtectedNew<Nucleotide> (length);
memcpy(seq, &ref.seq[start], length);
}
else {
@@ -298,7 +297,7 @@ void DNASequence::Concatenate(const Nucleotide *moreSeq, DNALength moreSeqLength
DNALength prevLength = length;
length += moreSeqLength;
Nucleotide *prev = seq;
- seq = new Nucleotide[length];
+ seq = ProtectedNew<Nucleotide> (length);
if (prev != NULL) {
memcpy(seq, prev, prevLength);
delete[] prev;
@@ -351,11 +350,11 @@ void DNASequence::CleanupASCII() {
}
}
-Nucleotide DNASequence::GetNuc(DNALength i) {
+Nucleotide DNASequence::GetNuc(DNALength i) const {
return seq[i];
}
-DNALength DNASequence::GetRepeatContent() {
+DNALength DNASequence::GetRepeatContent() const {
DNALength i;
DNALength nRepeat = 0;
for (i =0 ; i < length;i++) {
@@ -385,12 +384,12 @@ void DNASequence::Free() {
void DNASequence::Resize(DNALength newLength) {
DNASequence::Free();
- seq = new Nucleotide[newLength];
+ seq = ProtectedNew<Nucleotide>(newLength);
length = newLength;
deleteOnExit = true;
}
-DNALength DNASequence::GetSeqStorage() {
+DNALength DNASequence::GetSeqStorage() const{
return length;
}
@@ -400,3 +399,4 @@ DNASequence & DNASequence::Copy(const PacBio::BAM::BamRecord & record) {
}
#endif
+
diff --git a/pbdata/DNASequence.hpp b/pbdata/DNASequence.hpp
index 286e7d8..5aed371 100644
--- a/pbdata/DNASequence.hpp
+++ b/pbdata/DNASequence.hpp
@@ -8,6 +8,7 @@
#include <cassert>
#include "Types.h"
#include "NucConversion.hpp"
+#include "utils.hpp"
#include "libconfig.h"
#ifdef USE_PBBAM
@@ -41,15 +42,15 @@ public:
DNASequence & Copy(const std::string & rhs);
- int GetStorageSize();
+ int GetStorageSize() const;
DNASequence &operator=(const DNASequence &rhs);
DNASequence &operator=(const std::string &rhs);
- void Print(std::ostream &out, int lineLength = 50);
+ void Print(std::ostream &out, int lineLength = 50) const;
- void PrintSeq(std::ostream &out, int lineLength = 50);
+ void PrintSeq(std::ostream &out, int lineLength = 50) const;
void Allocate(DNALength plength);
@@ -99,9 +100,9 @@ public:
return seq[i];
}
- Nucleotide GetNuc(DNALength i);
+ Nucleotide GetNuc(DNALength i) const;
- DNALength GetRepeatContent();
+ DNALength GetRepeatContent() const;
void CleanupOnFree();
@@ -109,7 +110,7 @@ public:
void Resize(DNALength newLength);
- DNALength GetSeqStorage();
+ DNALength GetSeqStorage() const;
#ifdef USE_PBBAM
/// Copies a BamRecord as a DNASequence.
@@ -174,7 +175,7 @@ template<typename T>
DNALength ResizeSequence(T &dnaseq, DNALength newLength) {
assert(newLength > 0);
((T&)dnaseq).Free();
- dnaseq.seq = new Nucleotide[newLength];
+ dnaseq.seq = ProtectedNew<Nucleotide>(newLength);
dnaseq.length = newLength;
dnaseq.deleteOnExit = true;
return newLength;
diff --git a/pbdata/Enumerations.h b/pbdata/Enumerations.h
index b51575b..0fbd6fe 100644
--- a/pbdata/Enumerations.h
+++ b/pbdata/Enumerations.h
@@ -1,3 +1,40 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#ifndef _BLASR_ENUMERATIONS_HPP_
#define _BLASR_ENUMERATIONS_HPP_
@@ -15,17 +52,17 @@ typedef enum T_FileType {Fasta,
typedef enum T_Strand {Forward, Reverse} Strand;
typedef enum T_PlatformType {
- Springfield,
- Astro,
+ Astro=1,
+ Springfield=2,
NoPlatform
} PlatformId;
typedef enum T_RegionType {
- GlobalAccuracy,
- HQRegion,
Adapter,
Insert,
- ArtifactRegion
+ HQRegion,
+ BarCode,
+ UnknownRegionType
} RegionType;
typedef enum T_PulseMetricType {
diff --git a/pbdata/FASTAReader.cpp b/pbdata/FASTAReader.cpp
index 6477fc0..ba10d21 100644
--- a/pbdata/FASTAReader.cpp
+++ b/pbdata/FASTAReader.cpp
@@ -81,7 +81,7 @@ int FASTAReader::Init(string &seqInName, int passive) {
}
SetFileSize();
filePtr = (char*) mmap(0, fileSize, PROT_READ, MAP_PRIVATE, fileDes, 0);
- if (filePtr == NULL) {
+ if (filePtr == MAP_FAILED) {
cout << "ERROR, Fail to load FASTA file " << seqInName
<< " to virtual memory." << endl;
exit(1);
@@ -217,7 +217,7 @@ void FASTAReader::ReadTitle(long &p, FASTASequence & seq) {
int seqTitleLen;
ReadTitle(p, seqTitle, seqTitleLen);
seq.CopyTitle(seqTitle, seqTitleLen);
- if (seqTitle) {delete seqTitle;}
+ if (seqTitle) {delete [] seqTitle;}
}
void FASTAReader::ReadTitle(long &p, char *&title, int &titleLength) {
@@ -233,7 +233,10 @@ void FASTAReader::ReadTitle(long &p, char *&title, int &titleLength) {
titleLength = p - curPos;
if (titleLength > 0) {
if (title) {delete [] title; title = NULL;}
- title = new char[titleLength+1];
+ title = ProtectedNew<char> (titleLength + 1);
+ if (title == nullptr) {
+ cout << "ERROR, unable to read FASTA file to memory. " << endl; exit(1);
+ }
int t = 0;
for (p = curPos; p < curPos + titleLength; p++, t++) {
title[t] = filePtr[p];
@@ -294,7 +297,7 @@ int FASTAReader::GetNext(FASTASequence &seq) {
seq.length = 0;
if (seqLength > 0) {
seq.length = seqLength;
- seq.seq = new Nucleotide[seqLength+padding+1];
+ seq.seq = ProtectedNew <Nucleotide>(seqLength+padding+1);
p = curPos;
seq.deleteOnExit = true;
long s = 0;
diff --git a/pbdata/FASTASequence.cpp b/pbdata/FASTASequence.cpp
index 4b579cc..67cf81b 100644
--- a/pbdata/FASTASequence.cpp
+++ b/pbdata/FASTASequence.cpp
@@ -12,14 +12,14 @@ FASTASequence::FASTASequence() : DNASequence() {
// regardless of deleteOnExit.
}
-void FASTASequence::PrintSeq(ostream &out, int lineLength, char delim) {
+void FASTASequence::PrintSeq(ostream &out, int lineLength, char delim) const {
out << delim;
if (title) out << title;
out << endl;
- static_cast<DNASequence*>(this)->PrintSeq(out, lineLength);
+ static_cast<const DNASequence*>(this)->PrintSeq(out, lineLength);
}
-int FASTASequence::GetStorageSize() {
+int FASTASequence::GetStorageSize() const {
if (!title)
return DNASequence::GetStorageSize();
return strlen(title) + DNASequence::GetStorageSize();
@@ -42,36 +42,6 @@ string FASTASequence::GetName() const {
return name;
}
-//
-// Define some no-ops to satisfy instantiating templates that
-// expect these to exist.
-//
-bool FASTASequence::StoreHoleNumber(int holeNumber) {return false;}
-bool FASTASequence::StoreHoleStatus(unsigned char holeStatus) {return false;}
-bool FASTASequence::StorePlatformId(PlatformId platformId) { return false;}
-bool FASTASequence::StoreZMWData(ZMWGroupEntry &data) { return false;}
-bool GetHoleNumber (int &holeNumberP) {
- //
- // There is no notion of a hole number for a fasta sequence.
- //
- return false;
-}
-
-bool FASTASequence::StoreXY(int16_t xy[]) {return false;}
-
-bool FASTASequence::GetXY(int xyP[]) {
- //
- // Although the xyP is stored in the fasta title for astro reads
- // this class is more general than an astro read, so do not assume
- // that it may be found in the title.
- //
- // So, this function is effectively a noop.
- //
- xyP[0] = xyP[1] = 0;
- return false;
-}
-
-
void FASTASequence::ShallowCopy(const FASTASequence &rhs) {
CheckBeforeCopyOrReference(rhs, "FASTASequence");
FASTASequence::Free();
@@ -124,7 +94,7 @@ void FASTASequence::CopyTitle(string str) {
FASTASequence::CopyTitle(str.c_str(), str.size());
}
-void FASTASequence::GetFASTATitle(string& fastaTitle) {
+void FASTASequence::GetFASTATitle(string& fastaTitle) const {
// look for the first space, and return the string until there.
int i;
for (i = 0; i < titleLength; i++ ){
diff --git a/pbdata/FASTASequence.hpp b/pbdata/FASTASequence.hpp
index bfadb58..24b1633 100644
--- a/pbdata/FASTASequence.hpp
+++ b/pbdata/FASTASequence.hpp
@@ -29,22 +29,12 @@ public:
FASTASequence();
inline ~FASTASequence();
- void PrintSeq(std::ostream &out, int lineLength = 50, char delim='>');
+ void PrintSeq(std::ostream &out, int lineLength = 50, char delim='>') const;
- int GetStorageSize();
+ int GetStorageSize() const;
std::string GetName() const;
- virtual bool StoreHoleNumber(int holeNumber);
- virtual bool StoreHoleStatus(unsigned char holeStatus);
- virtual bool StorePlatformId(PlatformId platformId);
- virtual bool StoreZMWData(ZMWGroupEntry &data);
- virtual bool StoreXY(int16_t xy[]);
-
- bool GetHoleNumber (int &holeNumberP);
-
- bool GetXY(int xyP[]);
-
void ShallowCopy(const FASTASequence &rhs);
std::string GetTitle() const;
@@ -55,7 +45,7 @@ public:
void CopyTitle(std::string str);
- void GetFASTATitle(std::string& fastaTitle);
+ void GetFASTATitle(std::string& fastaTitle) const;
void CopySubsequence(FASTASequence &rhs, int readStart, int readEnd=-1);
diff --git a/pbdata/FASTQReader.cpp b/pbdata/FASTQReader.cpp
index 324d57a..73128e5 100644
--- a/pbdata/FASTQReader.cpp
+++ b/pbdata/FASTQReader.cpp
@@ -45,7 +45,7 @@ int FASTQReader::GetNext(FASTQSequence &seq) {
seq.length = p2 - p;
long seqPos;
if (seq.length > 0) {
- seq.seq = new Nucleotide[seq.length];
+ seq.seq = ProtectedNew<Nucleotide>(seq.length);
p2 = p;
seqPos = 0;
while(p2 < fileSize and filePtr[p2] != '\n') { seq.seq[seqPos] = filePtr[p2]; p2++; seqPos++;}
diff --git a/pbdata/FASTQSequence.cpp b/pbdata/FASTQSequence.cpp
index 26789ac..aa9c59a 100644
--- a/pbdata/FASTQSequence.cpp
+++ b/pbdata/FASTQSequence.cpp
@@ -18,7 +18,7 @@ using namespace std;
//
int FASTQSequence::charToQuality = FASTQ_CHAR_TO_QUALITY;
-QVScale FASTQSequence::GetQVScale() {
+QVScale FASTQSequence::GetQVScale() const {
return qvScale;
}
@@ -42,7 +42,7 @@ FASTQSequence::GetQVPointerByIndex(int index) {
return NULL;
}
-int FASTQSequence::GetStorageSize() {
+int FASTQSequence::GetStorageSize() const {
int total = 0;
int nQV = 0;
int nTag =0;
@@ -88,12 +88,10 @@ FASTQSequence::FASTQSequence() : FASTASequence() {
insertionQVPrior = 0;
substitutionQVPrior = 0;
preBaseDeletionQVPrior = 0;
-
- subreadStart = subreadEnd = 0;
qvScale = PHRED;
}
-QualityValue FASTQSequence::GetDeletionQV(DNALength pos) {
+QualityValue FASTQSequence::GetDeletionQV(DNALength pos) const {
assert(pos < ((unsigned int)-1));
assert(pos < length);
if (deletionQV.Empty()) {
@@ -104,7 +102,7 @@ QualityValue FASTQSequence::GetDeletionQV(DNALength pos) {
}
}
-QualityValue FASTQSequence::GetMergeQV(DNALength pos) {
+QualityValue FASTQSequence::GetMergeQV(DNALength pos) const {
assert(pos < ((unsigned int)-1));
assert(pos < length);
if (mergeQV.Empty()) {
@@ -115,7 +113,7 @@ QualityValue FASTQSequence::GetMergeQV(DNALength pos) {
}
}
-Nucleotide FASTQSequence::GetSubstitutionTag(DNALength pos) {
+Nucleotide FASTQSequence::GetSubstitutionTag(DNALength pos) const {
if (substitutionTag == NULL) {
return 'N';
}
@@ -124,7 +122,7 @@ Nucleotide FASTQSequence::GetSubstitutionTag(DNALength pos) {
return substitutionTag[pos];
}
-Nucleotide FASTQSequence::GetDeletionTag(DNALength pos) {
+Nucleotide FASTQSequence::GetDeletionTag(DNALength pos) const {
if (deletionTag == NULL) {
return 'N';
}
@@ -133,7 +131,7 @@ Nucleotide FASTQSequence::GetDeletionTag(DNALength pos) {
return deletionTag[pos];
}
-QualityValue FASTQSequence::GetInsertionQV(DNALength pos) {
+QualityValue FASTQSequence::GetInsertionQV(DNALength pos) const {
if (insertionQV.Empty()) {
return insertionQVPrior;
}
@@ -142,7 +140,7 @@ QualityValue FASTQSequence::GetInsertionQV(DNALength pos) {
return insertionQV[pos];
}
-QualityValue FASTQSequence::GetSubstitutionQV(DNALength pos) {
+QualityValue FASTQSequence::GetSubstitutionQV(DNALength pos) const {
if (substitutionQV.Empty()) {
return substitutionQVPrior;
}
@@ -151,7 +149,7 @@ QualityValue FASTQSequence::GetSubstitutionQV(DNALength pos) {
return substitutionQV[pos];
}
-QualityValue FASTQSequence::GetPreBaseDeletionQV(DNALength pos, Nucleotide nuc) {
+QualityValue FASTQSequence::GetPreBaseDeletionQV(DNALength pos, Nucleotide nuc) const {
if (preBaseDeletionQV.Empty()) {
return preBaseDeletionQVPrior;
}
@@ -270,7 +268,7 @@ void FASTQSequence::AllocateMergeQVSpace(DNALength len) {
void FASTQSequence::AllocateDeletionTagSpace(DNALength qualLength) {
if (deletionTag != NULL) delete[] deletionTag;
- deletionTag = new Nucleotide[qualLength];
+ deletionTag = ProtectedNew<Nucleotide>(qualLength);
}
void FASTQSequence::AllocatePreBaseDeletionQVSpace(DNALength qualLength) {
@@ -287,7 +285,7 @@ void FASTQSequence::AllocateSubstitutionQVSpace(DNALength qualLength ){
void FASTQSequence::AllocateSubstitutionTagSpace(DNALength qualLength ){
if (substitutionTag != NULL) delete[] substitutionTag;
- substitutionTag = new Nucleotide[qualLength];
+ substitutionTag = ProtectedNew<Nucleotide>(qualLength);
}
void FASTQSequence::AllocateRichQualityValues(DNALength qualLength) {
@@ -335,7 +333,7 @@ void FASTQSequence::Assign(FASTQSequence &rhs) {
FASTQSequence::CopyQualityValues(rhs);
}
-void FASTQSequence::PrintFastq(ostream &out, int lineLength) {
+void FASTQSequence::PrintFastq(ostream &out, int lineLength) const {
PrintSeq(out, lineLength, '@');
if (lineLength == 0) {
out << endl;
@@ -346,12 +344,12 @@ void FASTQSequence::PrintFastq(ostream &out, int lineLength) {
}
}
-void FASTQSequence::PrintFastqQuality(ostream &out, int lineLength) {
+void FASTQSequence::PrintFastqQuality(ostream &out, int lineLength) const {
out << "+" << endl;
PrintAsciiQual(out, lineLength);
}
-bool FASTQSequence::GetQVs(const QVIndex & qvIndex, std::vector<uint8_t> & qvs, bool reverse) {
+bool FASTQSequence::GetQVs(const QVIndex & qvIndex, std::vector<uint8_t> & qvs, bool reverse) const {
qvs.clear();
uint8_t * qualPtr;
int charOffset = charToQuality;
@@ -391,7 +389,7 @@ bool FASTQSequence::GetQVs(const QVIndex & qvIndex, std::vector<uint8_t> & qvs,
return true;
}
-QVIndex FASTQSequence::GetQVIndex(const std::string & qvName) {
+QVIndex FASTQSequence::GetQVIndex(const std::string & qvName) const {
if (qvName == "QualityValue") {
return I_QualityValue;
} else if (qvName == "InsertionQV") {
@@ -412,11 +410,11 @@ QVIndex FASTQSequence::GetQVIndex(const std::string & qvName) {
}
}
-bool FASTQSequence::GetQVs(const std::string & qvName, std::vector<uint8_t> & qvs, bool reverse){
+bool FASTQSequence::GetQVs(const std::string & qvName, std::vector<uint8_t> & qvs, bool reverse) const {
return GetQVs(GetQVIndex(qvName), qvs, reverse);
}
-bool FASTQSequence::GetQVs(const std::string & qvName, std::string & qvsStr, bool reverse) {
+bool FASTQSequence::GetQVs(const std::string & qvName, std::string & qvsStr, bool reverse) const {
std::vector<uint8_t> qvs;
bool OK = GetQVs(qvName, qvs, reverse);
qvsStr = string(qvs.begin(), qvs.end());
@@ -424,7 +422,7 @@ bool FASTQSequence::GetQVs(const std::string & qvName, std::string & qvsStr, boo
}
void FASTQSequence::PrintAsciiRichQuality(ostream &out,
- int whichQuality, int lineLength) {
+ int whichQuality, int lineLength) const {
vector<uint8_t> qvs;
bool OK = GetQVs(static_cast<QVIndex>(whichQuality), qvs);
@@ -460,11 +458,11 @@ void FASTQSequence::PrintAsciiRichQuality(ostream &out,
}
}
-void FASTQSequence::PrintAsciiQual(ostream &out, int lineLength) {
+void FASTQSequence::PrintAsciiQual(ostream &out, int lineLength) const {
PrintAsciiRichQuality(out, 0, lineLength);
}
-void FASTQSequence::PrintQual(ostream &out, int lineLength) {
+void FASTQSequence::PrintQual(ostream &out, int lineLength) const {
out << ">" << this->title << endl;
DNALength i;
for (i = 0; i < length; i++ ){
@@ -479,7 +477,7 @@ void FASTQSequence::PrintQual(ostream &out, int lineLength) {
}
}
-void FASTQSequence::PrintQualSeq(ostream &out, int lineLength) {
+void FASTQSequence::PrintQualSeq(ostream &out, int lineLength) const {
FASTASequence::PrintSeq(out, lineLength);
lineLength /= 4;
PrintQual(out, lineLength);
@@ -593,7 +591,7 @@ void FASTQSequence::LowerCaseMask(int qThreshold) {
}
}
-float FASTQSequence::GetAverageQuality() {
+float FASTQSequence::GetAverageQuality() const {
DNALength p;
float totalQ;
if (qual.Empty() == true) { return 0.0; }
@@ -643,9 +641,5 @@ void FASTQSequence::Copy(const PacBio::BAM::BamRecord & record) {
AllocateDeletionTagSpace(static_cast<DNALength>(qvs.size()));
std::memcpy(deletionTag, qvs.c_str(), qvs.size() * sizeof(char));
}
- // preBaseQVs are not included in BamRecord, and will not be copied.
-
- subreadStart = static_cast<int>(record.QueryStart());
- subreadEnd = static_cast<int>(record.QueryEnd());
}
#endif
diff --git a/pbdata/FASTQSequence.hpp b/pbdata/FASTQSequence.hpp
index f94b2b0..334d5e6 100644
--- a/pbdata/FASTQSequence.hpp
+++ b/pbdata/FASTQSequence.hpp
@@ -22,35 +22,34 @@ public:
QualityValueVector<QualityValue> mergeQV;
Nucleotide *deletionTag;
Nucleotide *substitutionTag;
- int subreadStart, subreadEnd;
QualityValue deletionQVPrior, insertionQVPrior, substitutionQVPrior, preBaseDeletionQVPrior;
QVScale qvScale;
- QVScale GetQVScale();
+ QVScale GetQVScale() const;
void SetQVScale(QVScale qvScaleP);
QualityValueVector<QualityValue>* GetQVPointerByIndex(int index);
- int GetStorageSize();
+ int GetStorageSize() const;
FASTQSequence();
inline ~FASTQSequence();
- QualityValue GetDeletionQV(DNALength pos);
+ QualityValue GetDeletionQV(DNALength pos) const;
- QualityValue GetMergeQV(DNALength pos);
+ QualityValue GetMergeQV(DNALength pos) const;
- Nucleotide GetSubstitutionTag(DNALength pos);
+ Nucleotide GetSubstitutionTag(DNALength pos) const;
- Nucleotide GetDeletionTag(DNALength pos);
+ Nucleotide GetDeletionTag(DNALength pos) const;
- QualityValue GetInsertionQV(DNALength pos);
+ QualityValue GetInsertionQV(DNALength pos) const;
- QualityValue GetSubstitutionQV(DNALength pos);
+ QualityValue GetSubstitutionQV(DNALength pos) const;
- QualityValue GetPreBaseDeletionQV(DNALength pos, Nucleotide nuc);
+ QualityValue GetPreBaseDeletionQV(DNALength pos, Nucleotide nuc) const;
void ShallowCopy(const FASTQSequence &rhs);
@@ -90,40 +89,40 @@ public:
void Assign(FASTQSequence &rhs);
- void PrintFastq(std::ostream &out, int lineLength=50);
+ void PrintFastq(std::ostream &out, int lineLength=50) const;
- void PrintFastqQuality(std::ostream &out, int lineLength=50);
+ void PrintFastqQuality(std::ostream &out, int lineLength=50) const;
- QVIndex GetQVIndex(const std::string & qvName);
+ QVIndex GetQVIndex(const std::string & qvName) const;
/// Get QVs in vector<uint8_t> associated with the given QVIndex.
/// \returns true if qvs are available, false otherwise
/// \param [in] qvIndex - enum QVIndex
/// \param [out] qvs - obtained QVs.
/// \param [in] reverse - reverse orders of QVs or not
- bool GetQVs(const QVIndex & qvIndex, std::vector<uint8_t> & qvs, bool reverse=false);
+ bool GetQVs(const QVIndex & qvIndex, std::vector<uint8_t> & qvs, bool reverse=false) const;
/// Get QVs in vector<uint8_t>, given with QV Name.
/// \returns true if qvs are available, false, otherwise
/// \param [in] qvName - InsertionQV, DeletionQV, SubstitionQV, MergeQV, SubstitutionTag, DeletionTag
/// \param [out] qvs - obtians QVs.
/// \param [in] reverse - reverse orders of QVs or not.
- bool GetQVs(const std::string & qvName, std::vector<uint8_t> & qvs, bool reverse=false);
+ bool GetQVs(const std::string & qvName, std::vector<uint8_t> & qvs, bool reverse=false) const;
/// Get QVs in string, given with QV Name.
/// \returns true if qvs are available, false, otherwise
/// \param [in] qvName - InsertionQV, DeletionQV, SubstitionQV, MergeQV, SubstitutionTag, DeletionTag
/// \param [out] qvs - obtians QVs.
/// \param [in] reverse - reverse order of QVs or not
- bool GetQVs(const std::string & qvName, std::string & qvs, bool reverse=false);
+ bool GetQVs(const std::string & qvName, std::string & qvs, bool reverse=false) const;
- void PrintAsciiRichQuality(std::ostream &out, int whichQuality, int lineLength=50);
+ void PrintAsciiRichQuality(std::ostream &out, int whichQuality, int lineLength=50) const;
- void PrintAsciiQual(std::ostream &out, int lineLength=50) ;
+ void PrintAsciiQual(std::ostream &out, int lineLength=50) const;
- void PrintQual(std::ostream &out, int lineLength = 50);
+ void PrintQual(std::ostream &out, int lineLength = 50) const;
- void PrintQualSeq(std::ostream &out, int lineLength = 50);
+ void PrintQualSeq(std::ostream &out, int lineLength = 50) const;
void MakeRC(FASTQSequence &rc);
@@ -131,7 +130,7 @@ public:
void LowerCaseMask(int qThreshold);
- float GetAverageQuality();
+ float GetAverageQuality() const;
#ifdef USE_PBBAM
/// Copy name, sequence, and QVs from BamRecord.
diff --git a/pbdata/MD5Utils.cpp b/pbdata/MD5Utils.cpp
index dd1f45a..02f2e67 100644
--- a/pbdata/MD5Utils.cpp
+++ b/pbdata/MD5Utils.cpp
@@ -44,6 +44,7 @@ documentation and/or software.
#include <assert.h>
#include <strings.h>
#include <iostream>
+#include "utils.hpp"
#include "MD5Utils.hpp"
using namespace std;
@@ -234,7 +235,7 @@ MD5::MD5(ifstream& stream){
unsigned char *MD5::raw_digest(){
- uint1 *s = new uint1[16];
+ uint1 *s = ProtectedNew<uint1>(16);
if (!finalized){
cerr << "MD5::raw_digest: Can't get digest if you haven't "<<
@@ -253,7 +254,7 @@ unsigned char *MD5::raw_digest(){
char *MD5::hex_digest(){
int i;
- char *s= new char[33];
+ char *s= ProtectedNew<char>(33);
if (!finalized){
cerr << "MD5::hex_digest: Can't get digest if you haven't "<<
diff --git a/pbdata/Makefile b/pbdata/Makefile
deleted file mode 100644
index 4d3a688..0000000
--- a/pbdata/Makefile
+++ /dev/null
@@ -1,85 +0,0 @@
-# Let common.mk know that that the third party checks should be ignored
-COMMON_NO_THIRD_PARTY_REQD := true
-
-include ../common.mk
-
-CXXOPTS := -std=c++11 -pedantic -MMD -MP
-INCLUDES := -I. -Imatrix -Ireads -Iqvs -Imetagenome -Isaf -Iutils -Ialignment
-
-# To enable building a shared library, invoke as "make SHARED_LIB=true ..."
-ifneq ($(SHARED_LIB),)
- # Generating shared library
- CXX_SHAREDFLAGS := -fPIC
- LD_SHAREDFLAGS := -shared -fPIC
- TARGET_LIB := libpbdata.so
- # Developers should set these to appropriate defaults (other systems
- # will override these on the command line):
- PBBAM_LIB := ../../../staging/PostPrimary/pbbam/_output/install-build/lib/libpbbam.so
- HTSLIB_LIB := ../../../staging/PostPrimary/htslib/_output/install-build/lib/libhts.so
-else
- # Generating shared library
- CXX_SHAREDFLAGS :=
- TARGET_LIB := libpbdata.a
- LD_SHAREDFLAGS :=
- HTSLIB_LIB :=
- PBBAM_LIB :=
-endif
-
-DEP_LIBS := $(HTSLIB_LIB) $(PBBAM_LIB)
-
-PBBAM_INCLUDE := $(PBBAM)/include
-HTSLIB_INCLUDE ?= $(PBBAM)/third-party/htslib
-
-sources := $(wildcard *.cpp) \
- $(wildcard matrix/*.cpp) \
- $(wildcard reads/*.cpp) \
- $(wildcard metagenome/*.cpp) \
- $(wildcard qvs/*.cpp) \
- $(wildcard saf/*.cpp) \
- $(wildcard utils/*.cpp) \
- $(wildcard loadpulses/*.cpp) \
- $(wildcard alignment/*.cpp) \
- $(wildcard amos/*.cpp) \
- $(wildcard sam/*.cpp)
-
-objects := $(sources:.cpp=.o)
-shared_objects := $(sources:.cpp=.shared.o)
-dependencies := $(objects:.o=.d) $(shared_objects:.o=.d)
-
-all : CXXFLAGS ?= -O3
-
-debug : CXXFLAGS ?= -g -ggdb -fno-inline
-
-profile : CXXFLAGS ?= -Os -pg
-
-g: CXXFLAGS = -g -ggdb -fno-inline -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer
-
-all debug profile g: mklibconfig $(TARGET_LIB)
-
-mklibconfig:
-ifeq ($(origin nopbbam), undefined)
- @grep "USE_PBBAM" libconfig.h 2>/dev/null 1>/dev/null || echo "#define USE_PBBAM" > libconfig.h
- INCLUDES += -I$(PBBAM_INCLUDE) -I$(HTSLIB_INCLUDE) -I$(BOOST_INCLUDE)
-else
- @rm -f libconfig.h && echo "" > libconfig.h && echo "no use libpbbam"
-endif
-
-libpbdata.a: $(objects)
- $(AR_pp) $(ARFLAGS) $@ $^
-
-libpbdata.so: $(shared_objects) $(DEP_LIBS)
- $(CXX_pp) $(LD_SHAREDFLAGS) -o $@ $^
-
-%.o: %.cpp
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) $(INCLUDES) -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.o) $(@:%.o=%.d)" -c $< -o $@
-
-%.shared.o: %.cpp
- $(CXX_pp) $(CXX_SHAREDFLAGS) $(CXXOPTS) $(CXXFLAGS) $(INCLUDES) -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.o) $(@:%.o=%.d)" -c $< -o $@
-
-# .INTERMEDIATE: $(objects)
-
-clean:
- @rm -f libpbdata.a libpbdata.so
- @rm -f $(objects) $(shared_objects) $(dependencies)
-
--include $(dependencies)
diff --git a/pbdata/PacBioDefs.h b/pbdata/PacBioDefs.h
new file mode 100644
index 0000000..2f80381
--- /dev/null
+++ b/pbdata/PacBioDefs.h
@@ -0,0 +1,180 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#ifndef _PBDATA_PACBIO_DEFS_H_
+#define _PBDATA_PACBIO_DEFS_H_
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace GroupNames {
+ ///PulseData
+ static const std::string pulsedata = "PulseData";
+
+ ///PulseData/BaseCalls
+ static const std::string basecalls = "BaseCalls";
+ ///PulseData/Regions
+ static const std::string regions = "Regions";
+
+ ///PulseData/BaseCalls/BaseCall
+ static const std::string basecall = "Basecall";
+
+ static const std::string qualityvalue = "QualityValue";
+ static const std::string deletionqv = "DeletionQV";
+ static const std::string deletiontag = "DeletionTag";
+ static const std::string insertionqv = "InsertionQV";
+ static const std::string mergeqv = "MergeQV";
+ static const std::string substitutionqv = "SubstitutionQV";
+ static const std::string substitutiontag = "SubstitutionTag";
+ static const std::string prebaseframes = "PreBaseFrames";
+ static const std::string widthinframes = "WidthInFrames";
+
+ ///PulseData/BaseCalls/ZMW
+ static const std::string zmw = "ZMW";
+ static const std::string zmwmetrics = "ZMWMetrics";
+
+ ///PulseData/BaseCalls/ZMW/HoleNumber
+ static const std::string holenumber = "HoleNumber";
+ static const std::string holestatus = "HoleStatus";
+ static const std::string holexy = "HoleXY";
+ static const std::string numevent = "NumEvent";
+
+ ///PulseData/BaseCalls/ZMWMetrics/HQRegionSNR
+ static const std::string hqregionsnr = "HQRegionSNR";
+ static const std::string readscore = "ReadScore";
+ static const std::string productivity = "Productivity";
+
+ static const std::vector<std::string> BaxQVNames ({
+ deletionqv,
+ deletiontag,
+ insertionqv,
+ mergeqv,
+ substitutionqv,
+ substitutiontag,
+ prebaseframes,
+ widthinframes,
+ hqregionsnr,
+ readscore});
+} // namespace GroupNames
+
+namespace AttributeNames {
+ namespace Common {
+ static const std::string changelistid = "ChangeListID";
+ static const std::string description = "Description";
+ } // Common
+
+ namespace ZMW {
+ namespace HoleStatus {
+ static const std::string lookuptable = "LookupTable";
+ }
+ }
+
+ namespace Regions {
+ static const std::string columnnames = "ColumnNames";
+ static const std::string regiontypes = "RegionTypes";
+ static const std::string regiondescriptions = "RegionDescriptions";
+ static const std::string regionsources = "RegionSources";
+ } // Regions
+
+ namespace ScanData {
+ namespace DyeSet {
+ static const std::string basemap = "BaseMap";
+ }
+ } // ScanData
+
+
+} // namespace AttributeNames
+
+namespace AttributeValues {
+
+ namespace ZMW {
+ namespace HoleNumber {
+ static const std::string description = "Hole number on chip array";
+ } //namespace HoleNumber
+
+ namespace HoleStatus {
+ static const std::string description = "Type of data coming from ZMW";
+ static const std::vector<std::string> lookuptable =
+ {"SEQUENCING", "ANTIHOLE", "FIDUCIAL", "SUSPECT",
+ "ANTIMIRROR", "FDZMW", "FBZMW", "ANTIBEAMLET", "OUTSIDEFOV"};
+
+ static const unsigned char sequencingzmw = 0; // not '0'
+ static const unsigned char outsidefov = 8; // not '8'
+ } // namespace HoleStatus
+
+ namespace HoleXY {
+ static const std::string description = "Coordinates of ZMW on Chip";
+ } // namespace HoleXY
+
+ } // namespace ZMW
+
+ namespace Regions {
+ static const std::vector<std::string> columnnames =
+ {"HoleNumber", "Region type index", "Region start in bases",
+ "Region end in bases", "Region score"};
+ static const std::vector<std::string> regiontypes =
+ {"Adapter", "Insert", "HQRegion"};
+ static const std::vector<std::string> regiondescriptions =
+ {"Adapter Hit", "Insert Region",
+ "High Quality bases region. Score is 1000 * predicted accuracy, where predicted accuary is 0 to 1.0"};
+ static const std::vector<std::string> regionsources =
+ {"AdapterFinding", "AdapterFinding", "PulseToBase Region classifer"};
+ }
+
+ namespace ZMWMetrics {
+ namespace HQRegionSNR {
+ static const std::string description = "HQRegion average signal to noise ratio";
+ }
+ namespace ReadScore {
+ static const std::string description = "Read raw accuracy prediction";
+ }
+ namespace Productivity {
+ static const std::string description = "ZMW productivity classification";
+ }
+ } // ZMWMetrics
+
+ namespace ScanData {
+ namespace DyeSet {
+ static const std::string basemap = "ACGT"; // default, order matters!
+ }
+ } // ScanData
+
+} // namespace AttributeValues
+
+} // namespace PacBio
+
+#endif
diff --git a/pbdata/PackedDNASequence.cpp b/pbdata/PackedDNASequence.cpp
index 957839b..33a556f 100644
--- a/pbdata/PackedDNASequence.cpp
+++ b/pbdata/PackedDNASequence.cpp
@@ -69,7 +69,7 @@ void PackedDNASequence::Allocate(DNALength numberOfNucleotides) {
length = numberOfNucleotides;
if (seq) {delete [] seq; seq = NULL;}
if (arrayLength > 0) {
- seq = new PackedDNAWord[arrayLength];
+ seq = ProtectedNew<PackedDNAWord>(arrayLength);
std::fill(seq, seq + arrayLength, 0);
}
}
@@ -79,7 +79,7 @@ void PackedDNASequence::CreateFromDNASequence(DNASequence &dnaSeq) {
length = dnaSeq.length;
if (seq) {delete [] seq; seq = NULL;}
if (arrayLength > 0) {
- seq = new PackedDNAWord[arrayLength];
+ seq = ProtectedNew<PackedDNAWord>(arrayLength);
DNALength pos;
for (pos = 0; pos < dnaSeq.length; pos++) {
Set(pos, ThreeBit[dnaSeq[pos]]);
@@ -214,7 +214,7 @@ void PackedDNASequence::Read(std::istream &in) {
in.read((char*)&length, sizeof(length));
if (seq) {delete [] seq; seq = NULL;}
if (arrayLength > 0) {
- seq = new PackedDNAWord[arrayLength];
+ seq = ProtectedNew<PackedDNAWord>(arrayLength);
in.read((char*)seq, sizeof(PackedDNAWord)*arrayLength);
}
}
diff --git a/pbdata/ReverseCompressIndex.cpp b/pbdata/ReverseCompressIndex.cpp
index 5179735..d96a20f 100644
--- a/pbdata/ReverseCompressIndex.cpp
+++ b/pbdata/ReverseCompressIndex.cpp
@@ -1,5 +1,6 @@
#include <iostream>
#include <fstream>
+#include "utils.hpp"
#include "ReverseCompressIndex.hpp"
ReverseCompressIndex::ReverseCompressIndex() {
@@ -27,7 +28,7 @@ void ReverseCompressIndex::Read(std::ifstream &in) {
in.read((char*) &indexLength, sizeof(int));
in.read((char*) &binSize, sizeof(int));
in.read((char*) &maxRun, sizeof(int));
- index = new int[indexLength];
+ index = ProtectedNew<int>(indexLength);
in.read((char*) index, sizeof(int) *indexLength);
}
diff --git a/pbdata/SMRTSequence.cpp b/pbdata/SMRTSequence.cpp
index baabf57..6fc8f7b 100644
--- a/pbdata/SMRTSequence.cpp
+++ b/pbdata/SMRTSequence.cpp
@@ -1,36 +1,72 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#include <stdlib.h>
+#include "utils/SMRTTitle.hpp"
#include "SMRTSequence.hpp"
using namespace std;
-void SMRTSequence::SetNull() {
- pulseIndex = NULL;
- preBaseFrames = NULL;
- widthInFrames = NULL;
- xy[0] = 0; xy[1] = 0;
- // These are not allocted by default.
- meanSignal = maxSignal = midSignal = NULL;
- classifierQV = NULL;
- startFrame = NULL;
- platform = NoPlatform;
- // By default, allow the entire read.
- lowQualityPrefix = lowQualitySuffix = 0;
- highQualityRegionScore = 0;
+SMRTSequence::SMRTSequence()
+ : FASTQSequence()
+ , subreadStart_(0) // subread start
+ , subreadEnd_(0) // subread end
+ , preBaseFrames(nullptr)
+ , widthInFrames(nullptr)
+ , pulseIndex(nullptr)
+ , startFrame(nullptr) // not allocated by default
+ , meanSignal(nullptr) // not allocated by default
+ , maxSignal(nullptr) // not allocated by default
+ , midSignal(nullptr) // not allocated by default
+ , classifierQV(nullptr) // not allocated by default
+ , lowQualityPrefix(0) // By default, allow the entire read.
+ , lowQualitySuffix(0) // By default, allow the entire read.
+ , highQualityRegionScore(0) // HQ read score
+ , readScore(0) // read score
+ , readGroupId_("") // read group id
+ , copiedFromBam(false)
+#ifdef USE_PBBAM
+ , bamRecord(PacBio::BAM::BamRecord())
+#endif
+{
// ZMWMetrics
for (size_t i = 0; i < 4; i++) {
- hqRegionSnr[i] = -1;
+ hqRegionSnr_[i] = -1;
}
- readScore = -1;
- holeNumber = static_cast<UInt>(-1);
- readGroupId = "";
- copiedFromBam = false;
-#ifdef USE_PBBAM
- bamRecord = PacBio::BAM::BamRecord();
-#endif
-}
-
-SMRTSequence::SMRTSequence() : FASTQSequence() {
- SetNull();
}
void SMRTSequence::Allocate(DNALength length) {
@@ -42,12 +78,13 @@ void SMRTSequence::Allocate(DNALength length) {
}
FASTQSequence::AllocateRichQualityValues(length);
- seq = new Nucleotide[length];
+ seq = ProtectedNew<Nucleotide>(length);
+ this->length = length;
qual.Allocate(length);
- preBaseFrames = new HalfWord[length];
- widthInFrames = new HalfWord[length];
- pulseIndex = new int[length];
- subreadEnd = length;
+ preBaseFrames = ProtectedNew<HalfWord>(length);
+ widthInFrames = ProtectedNew<HalfWord>(length);
+ pulseIndex = ProtectedNew<int>(length);
+ subreadEnd_ = length;
deleteOnExit = true;
}
@@ -62,8 +99,8 @@ void SMRTSequence::SetSubreadBoundaries(SMRTSequence &subread, DNALength subread
subreadEnd = length;
}
assert(subreadEnd - subreadStart <= length);
- subread.subreadStart= subreadStart;
- subread.subreadEnd = subreadEnd;
+ subread.subreadStart_ = subreadStart;
+ subread.subreadEnd_ = subreadEnd;
SetSubreadTitle(subread, subreadStart, subreadEnd);
}
@@ -128,22 +165,22 @@ void SMRTSequence::Copy(const SMRTSequence &rhs, int rhsPos, int rhsLength) {
// Copy SMRT QVs
if (rhs.preBaseFrames != NULL) {
- preBaseFrames = new HalfWord[length];
+ preBaseFrames = ProtectedNew<HalfWord>(length);
memcpy(preBaseFrames, rhs.preBaseFrames, length*sizeof(HalfWord));
}
if (rhs.widthInFrames != NULL) {
- widthInFrames = new HalfWord[length];
+ widthInFrames = ProtectedNew<HalfWord>(length);
memcpy(widthInFrames, rhs.widthInFrames, length*sizeof(HalfWord));
}
if (rhs.pulseIndex != NULL) {
- pulseIndex = new int[length];
+ pulseIndex = ProtectedNew <int>(length);
memcpy(pulseIndex, rhs.pulseIndex, sizeof(int) * length);
}
}
// Copy other member variables from rhs
- subreadStart = rhs.subreadStart;
- subreadEnd = rhs.subreadEnd;
+ subreadStart_ = rhs.subreadStart_;
+ subreadEnd_ = rhs.subreadEnd_;
lowQualityPrefix = rhs.lowQualityPrefix;
lowQualitySuffix = rhs.lowQualitySuffix;
highQualityRegionScore = rhs.highQualityRegionScore;
@@ -158,9 +195,9 @@ void SMRTSequence::Copy(const SMRTSequence &rhs, int rhsPos, int rhsLength) {
#endif
}
-void SMRTSequence::Print(ostream &out) {
- out << "SMRTSequence for zmw " << zmwData.holeNumber
- << ", [" << subreadStart << ", " << subreadEnd << ")" << endl;
+void SMRTSequence::Print(ostream &out) const {
+ out << "SMRTSequence for zmw " << HoleNumber()
+ << ", [" << SubreadStart() << ", " << SubreadEnd() << ")" << endl;
DNASequence::Print(out);
}
@@ -183,6 +220,9 @@ void SMRTSequence::Free() {
if (startFrame) {
delete[] startFrame;
}
+ // FIXME: memory of QVs should be handled within class
+ // in a consistent way.
+ // Comments from Mark Chaisson:
// meanSignal, maxSignal, midSignal and classifierQV
// need to be handled separatedly.
}
@@ -194,68 +234,113 @@ void SMRTSequence::Free() {
startFrame = NULL;
// Reset member variables
- xy[0] = 0; xy[1] = 0;
+ subreadStart_ = subreadEnd_ = 0;
lowQualityPrefix = lowQualitySuffix = 0;
+ readScore = 0;
highQualityRegionScore = 0;
- holeNumber = static_cast<UInt>(-1);
- readGroupId = "";
+ readGroupId_ = "";
copiedFromBam = false;
#ifdef USE_PBBAM
bamRecord = PacBio::BAM::BamRecord();
#endif
+ // ZMWMetrics
+ for (size_t i = 0; i < 4; i++) {
+ hqRegionSnr_[i] = -1;
+ }
+
// Free seq, title and FASTQ QVs, also reset deleteOnExit.
// Don't call FASTQSequence::Free() before freeing SMRT QVs.
FASTQSequence::Free();
}
-bool SMRTSequence::StoreXY(int16_t xyP[]) {
- xy[0] = xyP[0];
- xy[1] = xyP[1];
- return true;
+SMRTSequence & SMRTSequence::HoleNumber(UInt holeNumber) {
+ zmwData.holeNumber = holeNumber;
+ return *this;
+}
+
+UInt SMRTSequence::HoleNumber(void) const {
+ return zmwData.holeNumber;
+}
+
+SMRTSequence & SMRTSequence::HoleXY(const int x, const int y) {
+ zmwData.x = x;
+ zmwData.y = y;
+ return *this;
+}
+
+UInt SMRTSequence::HoleX(void) const {
+ return zmwData.x;
}
-bool SMRTSequence::StorePlatformId(PlatformId pid) {
- platform = pid;
- return true;
+UInt SMRTSequence::HoleY(void) const {
+ return zmwData.y;
}
-bool SMRTSequence::StoreHoleNumber(UInt holeNumberP){
- zmwData.holeNumber = holeNumber = holeNumberP;
- return true;
+SMRTSequence & SMRTSequence::HoleStatus(const unsigned char holeStatus) {
+ zmwData.holeStatus = holeStatus;
+ return *this;
+}
+
+unsigned char SMRTSequence::HoleStatus(void) const {
+ return zmwData.holeStatus;
}
-bool SMRTSequence::StoreHoleStatus(unsigned char s) {
- zmwData.holeStatus = s;
- return true;
+std::string SMRTSequence::MovieName(void) const {
+ return SMRTTitle(GetTitle()).MovieName();
}
-bool SMRTSequence::StoreZMWData(ZMWGroupEntry &data) {
- zmwData = data;
- return true;
+DNALength SMRTSequence::SubreadStart(void) const {
+ return subreadStart_;
}
-bool SMRTSequence::GetXY(int xyP[]) {
- xyP[0] = xy[0];
- xyP[1] = xy[1];
- return true;
+SMRTSequence & SMRTSequence::SubreadStart(const DNALength start) {
+ subreadStart_ = start;
+ return *this;
}
-bool SMRTSequence::GetHoleNumber(UInt & holeNumberP) {
- holeNumberP = holeNumber;
- return true;
+DNALength SMRTSequence::SubreadEnd(void) const {
+ return subreadEnd_;
+}
+
+SMRTSequence & SMRTSequence::SubreadEnd(const DNALength end) {
+ subreadEnd_ = end;
+ return *this;
}
-std::string SMRTSequence::GetReadGroupId() {
- return readGroupId;
+DNALength SMRTSequence::SubreadLength(void) const {
+ return subreadEnd_ - subreadStart_;
}
-void SMRTSequence::SetReadGroupId(const std::string & rid) {
- readGroupId = rid;
+std::string SMRTSequence::ReadGroupId() const {
+ return readGroupId_;
+}
+
+SMRTSequence & SMRTSequence::ReadGroupId(const std::string & rid) {
+ readGroupId_ = rid;
+ return *this;
+}
+
+float SMRTSequence::HQRegionSnr(const char base) const {
+ if (::toupper(base) == 'A') return hqRegionSnr_[SMRTSequence::SnrIndex4Base::A];
+ else if (::toupper(base) == 'C') return hqRegionSnr_[SMRTSequence::SnrIndex4Base::C];
+ else if (::toupper(base) == 'G') return hqRegionSnr_[SMRTSequence::SnrIndex4Base::G];
+ else if (::toupper(base) == 'T') return hqRegionSnr_[SMRTSequence::SnrIndex4Base::T];
+ else assert("Base must be in A, C, G, T" == 0);
+}
+
+SMRTSequence & SMRTSequence::HQRegionSnr(const char base, float v) {
+ if (::toupper(base) == 'A') hqRegionSnr_[SMRTSequence::SnrIndex4Base::A] = v;
+ else if (::toupper(base) == 'C') hqRegionSnr_[SMRTSequence::SnrIndex4Base::C] = v;
+ else if (::toupper(base) == 'G') hqRegionSnr_[SMRTSequence::SnrIndex4Base::G] = v;
+ else if (::toupper(base) == 'T') hqRegionSnr_[SMRTSequence::SnrIndex4Base::T] = v;
+ else assert("Base must be in A, C, G, T" == 0);
+ return *this;
}
#ifdef USE_PBBAM
-void SMRTSequence::Copy(const PacBio::BAM::BamRecord & record) {
+void SMRTSequence::Copy(const PacBio::BAM::BamRecord & record,
+ bool copyAllQVs) {
Free();
copiedFromBam = true;
@@ -267,17 +352,71 @@ void SMRTSequence::Copy(const PacBio::BAM::BamRecord & record) {
// Do NOT copy other SMRTQVs such as startFrame, meanSignal...
(static_cast<FASTQSequence*>(this))->Copy(record);
+ // Set subread start, subread end in coordinate of zmw.
+ if (record.Type() != PacBio::BAM::RecordType::CCS) {
+ subreadStart_ = static_cast<int>(record.QueryStart());
+ subreadEnd_ = static_cast<int>(record.QueryEnd());
+ } else {
+ subreadStart_ = 0;
+ subreadEnd_ = static_cast<int>(record.Sequence().length());;
+ }
+
+ // Shall we copy all pulse QVs including ipd and pw?
+ if (copyAllQVs) {
+ if (record.HasPreBaseFrames()) {
+ std::vector<uint16_t> qvs = record.PreBaseFrames().DataRaw();
+ assert(preBaseFrames == nullptr);
+ preBaseFrames = ProtectedNew<HalfWord>(qvs.size());
+ std::memcpy(preBaseFrames, &qvs[0], qvs.size() * sizeof(HalfWord));
+ }
+ if (record.HasIPD()) {
+ std::vector<uint16_t> qvs = record.IPD().DataRaw();
+ assert(widthInFrames == nullptr);
+ widthInFrames = ProtectedNew<HalfWord>(qvs.size());
+ std::memcpy(widthInFrames, &qvs[0], qvs.size() * sizeof(HalfWord));
+ }
+ }
+
+ // preBaseQVs are not included in BamRecord, and will not be copied.
// Copy read group id from BamRecord.
- SetReadGroupId(record.ReadGroupId());
+ ReadGroupId(record.ReadGroupId());
// PacBio bam for secondary analysis does NOT carry zmw
// info other than holeNumber, including holeStatus, holeX,
// holeY, numEvents.
- zmwData.holeNumber = static_cast<UInt> (record.HoleNumber());
+ UInt hn = static_cast<UInt> (record.HoleNumber());
+ this->HoleNumber(hn).
+ // Assumption: holeStatus of a bam record must be 'SEQUENCING'
+ HoleStatus(static_cast<unsigned char> (PacBio::AttributeValues::ZMW::HoleStatus::sequencingzmw)).
+ // x = lower 16 bit, y = upper 16 bit
+ HoleXY(hn & 0x0000FFFF, hn >> 16);
// Set hq region read score
- if (record.Impl().HasTag("rq"))
- highQualityRegionScore = record.Impl().TagValue("rq").ToInt32();
+ if (record.HasReadAccuracy()) {
+ // In pre 3.0.1 BAM, ReadAccuracy is in [0, 1000],
+ // in post 3.0.1 BAM, ReadAccuracy is a float in [0, 1]
+ // In blasr_libcpp, which supports both HDF5 and BAM,
+ // readScore should always be a float in [0, 1],
+ // and highQualityRegionScore should always be an int in [0, 1000]
+ readScore = float(record.ReadAccuracy());
+ if (readScore <= 1.0) {
+ highQualityRegionScore = int(readScore * 1000);
+ } else {
+ highQualityRegionScore = int(readScore);
+ readScore /= 1000.0;
+ }
+ }
+ // Set HQRegionSNR if record has the 'sn' tag
+ if (record.HasSignalToNoise()) {
+ // Signal to noise ratio of ACGT (in that particular ORDER) over
+ // HQRegion from BAM: record.SignalToNoise()
+ std::vector<float> snrs = record.SignalToNoise();
+ this->HQRegionSnr('A', snrs[0])
+ .HQRegionSnr('C', snrs[1])
+ .HQRegionSnr('G', snrs[2])
+ .HQRegionSnr('T', snrs[3]);
+ }
}
+
#endif
diff --git a/pbdata/SMRTSequence.hpp b/pbdata/SMRTSequence.hpp
index f7cff1c..151e809 100644
--- a/pbdata/SMRTSequence.hpp
+++ b/pbdata/SMRTSequence.hpp
@@ -1,3 +1,40 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#ifndef _BLASR_SMRT_SEQUENCE_HPP_
#define _BLASR_SMRT_SEQUENCE_HPP_
@@ -12,16 +49,28 @@
#include "reads/RegionTable.hpp"
#include "reads/ZMWGroupEntry.hpp"
+
class SMRTSequence : public FASTQSequence {
+friend class HDFZMWReader;
+friend class HDFZMWWriter;
+friend class HDFZMWMetricsWriter;
+
+private:
+ enum SnrIndex4Base {A=0, C=1, G=2, T=3};
+ float hqRegionSnr_[4]; // Always saved as 'ACGT'
+
+ DNALength subreadStart_;
+ DNALength subreadEnd_;
+
+ // read group id associated with each SMRTSequence
+ std::string readGroupId_;
+
public:
- int16_t xy[2];
- UInt holeNumber;
- float hqRegionSnr[4];
- float readScore;
ZMWGroupEntry zmwData;
- PlatformId platform;
+
HalfWord *preBaseFrames;
HalfWord *widthInFrames;
+
//
// The following are fields that are read in from the pulse file.
// Because they are not standard in bas.h5 files, these fields
@@ -34,22 +83,83 @@ public:
float *classifierQV;
unsigned int *startFrame;
int *pulseIndex;
+
DNALength lowQualityPrefix, lowQualitySuffix;
int highQualityRegionScore; // High quality region score in region table.
+ float readScore;
-protected:
- // read group id associated with each SMRTSequence
- std::string readGroupId;
-
-public:
// Whether or not this is originally copied from a BamRecord.
bool copiedFromBam;
- void SetNull();
-
+public:
SMRTSequence();
+
inline ~SMRTSequence();
+ /// \name Sets and gets attributes.
+ /// \{
+ /// Set HoleNumber.
+ /// \returns this SMRTSequence
+ SMRTSequence & HoleNumber(UInt holeNumber);
+
+ /// \returns HoleNumber
+ UInt HoleNumber(void) const;
+
+ /// Set HoleXY
+ SMRTSequence & HoleXY(const int x, const int y);
+
+ /// \returns HoleX
+ UInt HoleX(void) const;
+
+ /// \returns HoleY
+ UInt HoleY(void) const;
+
+ /// Set HoleStatus
+ SMRTSequence & HoleStatus(const unsigned char);
+
+ /// \returns HoleStatus
+ unsigned char HoleStatus(void) const;
+
+ /// \returns movie name parsed from sequence title
+ std::string MovieName(void) const;
+
+ /// \returns start pos of this sequence in coordinate of zmw polymerase sequence
+ DNALength SubreadStart(void) const;
+
+ /// Sets subreadStart.
+ SMRTSequence & SubreadStart(const DNALength start);
+
+ /// \returns subread end pos of this sequence in coordinate of zmw polymerase sequence
+ DNALength SubreadEnd(void) const;
+
+ /// Set subread end pos in coordinate of polymerase sequence.
+ SMRTSequence & SubreadEnd(const DNALength end);
+
+ /// A SMRTSequence's this->seq may point to sequence of a whole
+ /// polymerase read, but only represents a subread [subreadStart_, subreadEnd_).
+ /// \returns subread length (SubreadEnd() - SubreadStart())
+ DNALength SubreadLength(void) const;
+
+ /// \returns read group id for this sequence.
+ std::string ReadGroupId(void) const;
+
+ /// Set readGroup Id for this sequence.
+ SMRTSequence & ReadGroupId(const std::string & rid);
+
+ /// Access to HQRegion SNRs must be done via public API.
+ float HQRegionSnr(const char base) const;
+
+ /// Set HQRegion SNR of base as v.
+ SMRTSequence & HQRegionSnr(const char base, float v);
+
+ /// \}
+
+public:
+ /// \name Clip subread
+ /// \{
+ SMRTSequence & Clip(const DNALength subreadStart, const DNALength subreadEnd);
+ /// \}
+
void Allocate(DNALength length);
void SetSubreadTitle(SMRTSequence &subread, DNALength subreadStart,
@@ -68,37 +178,19 @@ public:
void Copy(const SMRTSequence &rhs, int rhsPos, int rhsLength);
- void Print(std::ostream &out);
+ void Print(std::ostream &out) const;
SMRTSequence& operator=(const SMRTSequence &rhs);
void Free();
-
- bool StoreXY(int16_t xyP[]);
-
- bool StorePlatformId(PlatformId pid);
-
- bool StoreHoleNumber(UInt holeNumberP);
-
- bool StoreHoleStatus(unsigned char s);
-
- bool StoreZMWData(ZMWGroupEntry &data);
-
- bool GetXY(int xyP[]);
-
- bool GetHoleNumber(UInt & holeNumberP);
-
- // Get read group id for this sequence.
- std::string GetReadGroupId();
-
- // Set readGroup Id for this sequence.
- void SetReadGroupId(const std::string & rid);
#ifdef USE_PBBAM
public:
// Copy read sequence, title, holeNumber, readGroupId, and QVs
// (iq, dq, sq, mq, st, dt) from BamRecord to this SMRTSequence.
- void Copy(const PacBio::BAM::BamRecord & record);
+ // If copyAllQVs is true, also copy all QVs.
+ void Copy(const PacBio::BAM::BamRecord & record,
+ bool copyAllQVs = false);
// Keep track of BamRecord from which this SMRTSequence is
// originally copied. However, one should NOT assume
@@ -112,4 +204,5 @@ public:
inline SMRTSequence::~SMRTSequence(){
SMRTSequence::Free();
}
+
#endif // _BLASR_SMRT_SEQUENCE_HPP_
diff --git a/pbdata/StringUtils.cpp b/pbdata/StringUtils.cpp
index e21a675..2b008d4 100644
--- a/pbdata/StringUtils.cpp
+++ b/pbdata/StringUtils.cpp
@@ -62,49 +62,23 @@ int ToWords(string &orig, vector<string> &words) {
return words.size();
}
-int Tokenize(string orig, string pattern, vector<string> &tokens) {
- VectorIndex tokenStart, tokenEnd;
- int patternLength = pattern.size();
- int origLength = orig.size();
- if (origLength == 0) {
- return 0;
- }
- bool prevWasToken = false;
- tokenEnd = 0;
- tokenStart = 0;
- //for (tokenEnd = 0; tokenEnd < origLength-patternLength; tokenEnd) {
- while(tokenEnd < origLength - patternLength) {
- while (tokenStart < origLength - patternLength and
- orig.compare(tokenStart, patternLength, pattern, 0, patternLength) == 0) {
- tokenStart++;
- }
- tokenEnd = tokenStart + 1;
-
- prevWasToken = false;
- while (tokenEnd < origLength - patternLength) {
- if (orig.compare(tokenEnd, patternLength, pattern, 0, patternLength) == 0) {
- // add this token to the vector of tokens
- if (tokenEnd - tokenStart >= 1) {
- prevWasToken = true;
- tokens.push_back(orig.substr(tokenStart, tokenEnd - tokenStart));
- }
- tokenEnd+=patternLength;
- tokenStart = tokenEnd;
- break;
- }
- else {
- prevWasToken = false;
- ++tokenEnd;
- }
- }
- }
- if (tokenEnd - tokenStart > 1) {
- tokens.push_back(orig.substr(tokenStart, tokenEnd - tokenStart+1));
+// Splice a string by pattern and save the pieces to a vector of token strings.
+// Returns the token count; an empty pattern yields orig as the single token.
+int Splice(const string & orig, const string & pattern, vector<string> & tokens) {
+    assert(pattern.size() > 0);
+    tokens.clear();
+    // Release builds drop the assert; an empty pattern would then match forever.
+    if (pattern.empty()) { tokens.push_back(orig); return tokens.size(); }
+    size_t search_start = 0, find_pos = orig.find(pattern);
+    while(find_pos != string::npos) {
+        tokens.push_back(orig.substr(search_start, find_pos - search_start));
+        search_start = find_pos + pattern.size();
+        find_pos = orig.find(pattern, search_start);
}
+    tokens.push_back(orig.substr(search_start));
return tokens.size();
}
-
void ParseSeparatedList(const string &csl, vector<string> &values, char delim) {
stringstream cslStrm(csl);
string valString;
diff --git a/pbdata/StringUtils.hpp b/pbdata/StringUtils.hpp
index 578f0c2..ea895a2 100644
--- a/pbdata/StringUtils.hpp
+++ b/pbdata/StringUtils.hpp
@@ -23,7 +23,7 @@ int IsSpace(char c);
int ToWords(string &orig, vector<string> &words);
-int Tokenize(string orig, string pattern, vector<string> &tokens);
+int Splice(const string & orig, const string & pattern, vector<string> &tokens);
void ParseSeparatedList(const string &csl, vector<string> &values, char delim=',');
diff --git a/pbdata/build.mk b/pbdata/build.mk
new file mode 120000
index 0000000..2247f36
--- /dev/null
+++ b/pbdata/build.mk
@@ -0,0 +1 @@
+makefile
\ No newline at end of file
diff --git a/pbdata/makefile b/pbdata/makefile
new file mode 100644
index 0000000..034ee60
--- /dev/null
+++ b/pbdata/makefile
@@ -0,0 +1,39 @@
+all:
+
+THISDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+-include ${CURDIR}/defines.mk
+include ${THISDIR}/../rules.mk
+
+CXXOPTS += -std=c++11 -pedantic
+# CURDIR should have libconfig.h
+INCLUDES += ${CURDIR}
+INCLUDES += ${THISDIR} matrix reads qvs metagenome saf utils
+INCLUDES += ${LIBBLASR_INC} ${PBBAM_INC} ${HTSLIB_INC} ${BOOST_INC}
+LIBS += ${PBBAM_LIB} ${HTSLIB_LIB}
+LDFLAGS += $(patsubst %,-L%,${LIBS})
+
+all: libpbdata.a libpbdata${SH_LIB_EXT}
+
+paths := . matrix reads metagenome qvs saf utils loadpulses alignment amos sam
+paths := $(patsubst %,${THISDIR}%,${paths})
+sources := $(shell find ${THISDIR} -name '*.cpp')
+sources := $(notdir ${sources})
+objects := $(sources:.cpp=.o)
+shared_objects := $(sources:.cpp=.shared.o)
+dependencies := $(objects:.o=.d) $(shared_objects:.o=.d)
+
+vpath %.cpp ${paths}
+
+libpbdata.a: $(objects)
+ $(AR) $(ARFLAGS) $@ $^
+
+libpbdata${SH_LIB_EXT}: $(shared_objects)
+
+libconfig.h:
+ cp -af ${LIBCONFIG_H} $@
+
+clean:
+ rm -f libpbdata.a libpbdata.so libpbdata${SH_LIB_EXT} *.o *.d
+
+-include $(dependencies)
+depend: $(dependencies:.d=.depend)
diff --git a/pbdata/matrix/FlatMatrixImpl.hpp b/pbdata/matrix/FlatMatrixImpl.hpp
index 5536cb7..7fe77c7 100644
--- a/pbdata/matrix/FlatMatrixImpl.hpp
+++ b/pbdata/matrix/FlatMatrixImpl.hpp
@@ -3,6 +3,7 @@
#include <iostream>
#include <assert.h>
#include "Types.h"
+#include "utils.hpp"
#include "FlatMatrix.hpp"
template<typename T>
@@ -61,7 +62,7 @@ void FlatMatrix2D<T>::Resize(unsigned int _totalSize) {
if (matrix != NULL) {
delete[] matrix;
}
- matrix = new T[_totalSize];
+ matrix = ProtectedNew<T>(_totalSize);
totalSize = _totalSize;
}
@@ -87,7 +88,7 @@ void FlatMatrix2D<T>::Grow(int _nRows, int _nCols) {
if (totalSize != 0 && matrix)
delete[] matrix;
totalSize = nRows * nCols;
- matrix = new T[totalSize];
+ matrix = ProtectedNew<T>(totalSize);
}
}
@@ -115,7 +116,7 @@ template<typename T>
void FlatMatrix2D<T>::Allocate(UInt _nRows, UInt _nCols) {
nRows = _nRows;
nCols = _nCols;
- matrix = new T[nRows * nCols];
+ matrix = ProtectedNew<T>(nRows * nCols);
}
template<typename T>
@@ -163,7 +164,7 @@ void FlatMatrix3D<T>::Grow(int _nx, int _ny, int _nz) {
delete[] matrix;
}
totalSize = nx*ny*nz;
- matrix = new T[totalSize];
+ matrix = ProtectedNew<T>(totalSize);
}
xy = nx*ny;
}
diff --git a/pbdata/matrix/MatrixImpl.hpp b/pbdata/matrix/MatrixImpl.hpp
index c66aaf4..d08974c 100644
--- a/pbdata/matrix/MatrixImpl.hpp
+++ b/pbdata/matrix/MatrixImpl.hpp
@@ -5,13 +5,14 @@
#include <iostream>
#include <fstream>
#include <stdint.h>
+#include "utils.hpp"
#include "Types.h"
template<typename T>
void CreateMatrix(int rows, int cols, std::vector<T*> matrix) {
matrix.resize(rows);
if (matrix[0]) {delete [] matrix[0]; matrix[0] = NULL;}
- matrix[0] = new T[rows*cols];
+ matrix[0] = ProtectedNew<T>(rows*cols);
VectorIndex r = 1;
for (r = 1; r < rows; r++) {
matrix[r] = &matrix[cols * r];
@@ -56,14 +57,14 @@ void Matrix<T>::Resize(VectorIndex nRowsP, VectorIndex nColsP) {
}
}
if (matrix == NULL) {
- matrix = new T*[nRows];
+ matrix = ProtectedNew<T*>(nRows);
}
else {
if (matrix[0] != NULL) {
delete[] matrix[0]; matrix[0] = NULL;
}
}
- matrix[0] = new T[matrixBufferSize];
+ matrix[0] = ProtectedNew<T>(matrixBufferSize);
VectorIndex rowIndex;
for (rowIndex = 1; rowIndex < nRows; rowIndex++ ){
matrix[rowIndex] = &matrix[0][nCols * rowIndex];
diff --git a/pbdata/metagenome/SequenceIndexDatabaseImpl.hpp b/pbdata/metagenome/SequenceIndexDatabaseImpl.hpp
index e26f61e..6ad9456 100644
--- a/pbdata/metagenome/SequenceIndexDatabaseImpl.hpp
+++ b/pbdata/metagenome/SequenceIndexDatabaseImpl.hpp
@@ -191,25 +191,25 @@ ReadDatabase(std::ifstream &in) {
in.read((char*) &nSeqPos, sizeof(int));
assert(seqStartPos == NULL);
- seqStartPos = new DNALength[nSeqPos];
+ seqStartPos = ProtectedNew<DNALength>(nSeqPos);
deleteSeqStartPos = true;
in.read((char*) seqStartPos, sizeof(DNALength) * nSeqPos);
int nSeq = nSeqPos - 1;
// Get the lengths of the strings to read.
assert(nameLengths == NULL);
- nameLengths = new int[nSeq];
+ nameLengths = ProtectedNew<int>(nSeq);
deleteNameLengths = true;
in.read((char*)nameLengths, sizeof(int) * nSeq);
// Get the titles of the sequences.
assert(names == NULL); // Otherwise need to delete names;
- names = new char*[nSeq];
+ names = ProtectedNew<char*>(nSeq);
deleteNames = true;
char *namePtr;
int i;
for (i = 0; i < nSeq; i++) {
- namePtr = new char[nameLengths[i]];
+ namePtr = ProtectedNew<char>(nameLengths[i]);
if (nameLengths[i] > 0) {
in.read(namePtr, nameLengths[i]);
}
@@ -227,7 +227,7 @@ SequenceTitleLinesToNames() {
std::string tmpName;
AssignUntilFirstSpace(names[seqIndex], nameLengths[seqIndex], tmpName);
if (names[seqIndex]) {delete[] names[seqIndex];}
- names[seqIndex] = new char[tmpName.size()+1];
+ names[seqIndex] = ProtectedNew<char>(tmpName.size()+1);
strcpy(names[seqIndex], tmpName.c_str());
names[seqIndex][tmpName.size()] = '\0';
nameLengths[seqIndex] = tmpName.size();
@@ -267,14 +267,14 @@ Finalize() {
int nSeq = nSeqPos - 1;
assert(names==NULL);
- names = new char*[nSeq];
+ names = ProtectedNew<char*>(nSeq);
deleteNames = true;
unsigned int i;
if (nameLengths) {delete [] nameLengths; nameLengths = NULL;}
- nameLengths = new int[nSeq];
+ nameLengths = ProtectedNew<int>(nSeq);
deleteNameLengths = true;
for (i = 0; i < nSeq; i++) {
- names[i] = new char[growableName[i].size() + 1];
+ names[i] = ProtectedNew<char>(growableName[i].size() + 1);
memcpy((char*) names[i], (char*) growableName[i].c_str(),
growableName[i].size());
diff --git a/pbdata/metagenome/TitleTable.cpp b/pbdata/metagenome/TitleTable.cpp
index 078da23..7f207b2 100644
--- a/pbdata/metagenome/TitleTable.cpp
+++ b/pbdata/metagenome/TitleTable.cpp
@@ -11,12 +11,12 @@ TitleTable::~TitleTable() {
void TitleTable::Copy(char **src, int nSrc) {
Free(); //Free before copy
- table = new char*[nSrc];
+ table = ProtectedNew<char*>(nSrc);
tableLength = nSrc;
int i;
for (i = 0; i < nSrc; i++ ){
int lenStrI = strlen(src[i]);
- table[i] = new char[lenStrI+1];
+ table[i] = ProtectedNew<char>(lenStrI+1);
memcpy(table[i], src[i], lenStrI);
table[i][lenStrI] = '\0';
}
@@ -44,10 +44,10 @@ void TitleTable::Read(std::string &inFileName) {
void TitleTable::CopyFromVector(std::vector<std::string> &titles) {
Free(); //Free before copy.
tableLength = titles.size();
- table = new char*[tableLength];
+ table = ProtectedNew<char*>(tableLength);
int i;
for (i = 0; i < tableLength; i++) {
- table[i] = new char[titles[i].size() + 1];
+ table[i] = ProtectedNew<char>(titles[i].size() + 1);
memcpy(table[i], titles[i].c_str(), titles[i].size());
table[i][titles[i].size()] = '\0';
}
@@ -101,7 +101,7 @@ void TitleTable::ResetTableToIntegers(char **table,
namestrm << i;
std::string name;
name = namestrm.str();
- table[i] = new char[name.size()+1];
+ table[i] = ProtectedNew<char>(name.size()+1);
memcpy( table[i], name.c_str(), name.size());
table[i][name.size()] = '\0';
tableLengths[i] = (int) name.size() + 1;
diff --git a/pbdata/reads/BaseFile.cpp b/pbdata/reads/BaseFile.cpp
index bf4c56a..56bac24 100644
--- a/pbdata/reads/BaseFile.cpp
+++ b/pbdata/reads/BaseFile.cpp
@@ -27,11 +27,11 @@ bool BaseFile::LookupReadIndexByXY(uint16_t x, uint16_t y, int &index) {
void BaseFile::CopyReadAt(uint32_t readIndex, SMRTSequence &read) {
assert(holeNumbers.size() > readIndex);
- read.zmwData.holeNumber = holeNumbers[readIndex];
+ read.HoleNumber(holeNumbers[readIndex]);
if (holeXY.size() > 0) {
assert(holeXY.size() > readIndex);
- read.zmwData.x = holeXY[readIndex].xy[0];
- read.zmwData.y = holeXY[readIndex].xy[1];
+ read.HoleXY(holeXY[readIndex].xy[0],
+ holeXY[readIndex].xy[1]);
}
int startPos = readStartPositions[readIndex];
diff --git a/pbdata/reads/PulseBaseCommon.cpp b/pbdata/reads/PulseBaseCommon.cpp
index 437a0fb..16bc95c 100644
--- a/pbdata/reads/PulseBaseCommon.cpp
+++ b/pbdata/reads/PulseBaseCommon.cpp
@@ -1,3 +1,40 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#include <vector>
#include <algorithm>
#include "PulseBaseCommon.hpp"
@@ -14,7 +51,7 @@ std::string PulseBaseCommon::GetMovieName() {
return scanData.movieName;
}
-std::map<char, int> PulseBaseCommon::GetBaseMap() {
+std::map<char, size_t> PulseBaseCommon::GetBaseMap() {
return scanData.baseMap;
}
diff --git a/pbdata/reads/PulseBaseCommon.hpp b/pbdata/reads/PulseBaseCommon.hpp
index e8b86fc..5143682 100644
--- a/pbdata/reads/PulseBaseCommon.hpp
+++ b/pbdata/reads/PulseBaseCommon.hpp
@@ -1,3 +1,40 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#ifndef DATASTRUCTURES_READS_PULSE_BASE_COMMON_H_
#define DATASTRUCTURES_READS_PULSE_BASE_COMMON_H_
@@ -18,7 +55,7 @@ public:
std::string GetMovieName();
- std::map<char, int> GetBaseMap();
+ std::map<char, size_t> GetBaseMap();
bool LookupReadIndexByHoleNumber(uint32_t holeNumber, int &readIndex);
};
diff --git a/pbdata/reads/PulseFile.cpp b/pbdata/reads/PulseFile.cpp
index 7d2db0b..3cdb4e3 100644
--- a/pbdata/reads/PulseFile.cpp
+++ b/pbdata/reads/PulseFile.cpp
@@ -1,5 +1,41 @@
-#include "PulseFile.hpp"
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+#include "PulseFile.hpp"
void PulseFile::CopySignal(HalfWord *signalData, // either a vector or matrix
int signalNDims,
@@ -12,7 +48,7 @@ void PulseFile::CopySignal(HalfWord *signalData, // either a vector or matrix
// pulseStartPos must be 0;
// otherwise, pulseStartPos is pulseStartPositions[holeIndex]
- std::map<char, int> baseMap = GetBaseMap();
+ std::map<char, size_t> baseMap = GetBaseMap();
int i;
if (signalNDims == 1) {
for (i = 0; i < readLength; i++) {
diff --git a/pbdata/reads/PulseFileImpl.hpp b/pbdata/reads/PulseFileImpl.hpp
index 7fc1f8d..394ad35 100644
--- a/pbdata/reads/PulseFileImpl.hpp
+++ b/pbdata/reads/PulseFileImpl.hpp
@@ -1,6 +1,8 @@
#ifndef _BLASR_PULSE_FILE_IMPL_HPP_
#define _BLASR_PULSE_FILE_IMPL_HPP_
+#include "utils.hpp"
+
template<typename T_FieldType>
void PulseFile::StoreField(std::vector<T_FieldType> &source, int *basToPlsIndex, T_FieldType *dest, int destLength) {
int i;
@@ -14,7 +16,7 @@ template <typename T>
if (ptr != NULL) {
delete[] ptr;
}
- ptr = new T[length];
+ ptr = ProtectedNew<T>(length);
return ptr != NULL;
}
diff --git a/pbdata/reads/ReadInterval.hpp b/pbdata/reads/ReadInterval.hpp
index c21838f..8db6a22 100644
--- a/pbdata/reads/ReadInterval.hpp
+++ b/pbdata/reads/ReadInterval.hpp
@@ -1,18 +1,35 @@
#ifndef _BLASR_READ_INTERVAL_HPP_
#define _BLASR_READ_INTERVAL_HPP_
+#include "RegionAnnotation.hpp"
+
+class RegionAnnotation;
+
class ReadInterval {
public:
int start;
int end;
int score;
- ReadInterval(int s, int e, int sc=0) : start(s), end(e), score(sc) {};
+
+ ReadInterval(int s=0, int e=0, int sc=0) : start(s), end(e), score(sc) {};
+
+ ReadInterval(const RegionAnnotation & ra)
+ : start(ra.GetStart())
+ , end(ra.GetEnd())
+ , score(ra.GetScore()) {}
+
ReadInterval& operator=(const ReadInterval &rhs) {
start = rhs.start;
end = rhs.end;
score = rhs.score;
return *this;
}
+
+ bool operator==(const ReadInterval &rhs) const {
+ return (start == rhs.start and
+ end == rhs.end and
+ score == rhs.score);
+ }
};
#endif
diff --git a/pbdata/reads/RegionAnnotation.cpp b/pbdata/reads/RegionAnnotation.cpp
new file mode 100644
index 0000000..bee4a9b
--- /dev/null
+++ b/pbdata/reads/RegionAnnotation.cpp
@@ -0,0 +1,49 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
+
+#include <ostream>
+#include "RegionAnnotation.hpp"
+
+std::ostream & operator << (std::ostream & os, const RegionAnnotation& ra) {
+ os << "ZMW " << ra.GetHoleNumber()
+ << ", region type index " << ra.GetTypeIndex()
+ << " [" << ra.GetStart()
+ << ", " << ra.GetEnd()
+ << "), " << ra.GetScore();
+ return os;
+}
diff --git a/pbdata/reads/RegionAnnotation.hpp b/pbdata/reads/RegionAnnotation.hpp
new file mode 100644
index 0000000..38732b4
--- /dev/null
+++ b/pbdata/reads/RegionAnnotation.hpp
@@ -0,0 +1,241 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. //
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
+#ifndef _BLASR_REGION_ANNOTATION_HPP_
+#define _BLASR_REGION_ANNOTATION_HPP_
+
+#include <cassert>
+#include <cstring>
+#include <string>
+#include <iostream>
+#include <vector>
+#include <map>
+#include <ostream>
+#include "Types.h"
+#include "Enumerations.h"
+#include "PacBioDefs.h"
+#include "RegionTypeMap.hpp"
+
+
+class HDFRegionTableReader;
+class HDFRegionTableWriter;
+class HDFRegionsWriter;
+
+
+class RegionAnnotation {
+friend class HDFRegionTableReader;
+friend class HDFRegionTableWriter;
+friend class HDFRegionsWriter;
+
+public:
+ static const int HOLENUMBERCOL = 0;
+ static const int REGIONTYPEINDEXCOL = 1;
+ static const int REGIONSTARTCOL = 2;
+ static const int REGIONENDCOL = 3;
+ static const int REGIONSCORECOL = 4;
+ static const int NCOLS=5;
+
+ int row[NCOLS];
+
+public:
+ // FIXME: use regionType as a member variable instead of regionTypeIndex
+ inline RegionAnnotation(UInt holeNumber = 0,
+ int typeIndex = 0,
+ int start = 0, int end = 0,
+ int score = -1);
+
+ inline bool operator<(const RegionAnnotation &rhs) const;
+
+ inline bool operator<(int holeNumber) const;
+
+ inline RegionAnnotation& operator=(const RegionAnnotation &rhs);
+
+ inline bool operator==(const RegionAnnotation &rhs) const;
+
+ inline int GetHoleNumber(void) const;
+
+ inline RegionAnnotation & SetHoleNumber(int holeNumber);
+
+ inline int GetTypeIndex(void) const;
+
+ inline std::string GetTypeString(const std::vector<RegionType> & types) const;
+
+ inline RegionAnnotation & SetTypeIndex(int typeIndex);
+
+ inline int GetStart(void) const;
+
+ inline RegionAnnotation & SetStart(int start);
+
+ inline int GetEnd(void) const;
+
+ inline RegionAnnotation & SetEnd(int end);
+
+ inline int GetScore(void) const;
+
+ inline RegionAnnotation & SetScore(int score);
+
+public:
+ friend std::ostream & operator << (std::ostream & os, const RegionAnnotation& ra);
+};
+
+inline
+bool compare_region_annotation_by_type(const RegionAnnotation & lhs,
+ const RegionAnnotation & rhs);
+
+inline
+RegionAnnotation::RegionAnnotation(UInt holeNumber,
+ int typeIndex, int start, int end, int score) {
+ SetHoleNumber(static_cast<int>(holeNumber));
+ SetTypeIndex(typeIndex);
+ SetStart(start);
+ SetEnd(end);
+ SetScore(score);
+}
+
+inline
+bool RegionAnnotation::operator<(const RegionAnnotation &rhs) const
+{
+ if (GetHoleNumber() == rhs.GetHoleNumber())
+ if (GetStart() == rhs.GetStart()) {
+ if (GetEnd() == rhs.GetEnd())
+ return GetScore() < rhs.GetScore();
+ else
+ return GetEnd() > rhs.GetEnd();
+ } else {
+ return GetStart() < rhs.GetStart();
+ }
+ else
+ return GetHoleNumber() < rhs.GetHoleNumber();
+}
+
+inline
+bool RegionAnnotation::operator<(int holeNumber) const
+{ return GetHoleNumber() < holeNumber; }
+
+
+inline
+RegionAnnotation& RegionAnnotation::operator=(const RegionAnnotation &rhs) {
+ memcpy(row, rhs.row, sizeof(int)*NCOLS);
+ return *this;
+}
+
+inline
+bool RegionAnnotation::operator==(const RegionAnnotation &rhs) const {
+ return (GetHoleNumber() == rhs.GetHoleNumber() and
+ GetTypeIndex() == rhs.GetTypeIndex() and
+ GetStart() == rhs.GetStart() and
+ GetEnd() == rhs.GetEnd() and
+ GetScore() == rhs.GetScore());
+}
+
+inline
+int RegionAnnotation::GetHoleNumber(void) const {
+ return row[HOLENUMBERCOL];
+}
+
+inline
+RegionAnnotation & RegionAnnotation::SetHoleNumber(int holeNumber) {
+ row[HOLENUMBERCOL] = holeNumber;
+ return *this;
+}
+
+inline
+int RegionAnnotation::GetTypeIndex(void) const {
+ return row[REGIONTYPEINDEXCOL];
+}
+
+inline std::string RegionAnnotation::GetTypeString(const std::vector<RegionType> & typesTable) const {
+ assert(GetTypeIndex() >= 0 and GetTypeIndex() < static_cast<int>(typesTable.size()));
+ return RegionTypeMap::ToString(typesTable[GetTypeIndex()]);
+}
+
+inline
+RegionAnnotation & RegionAnnotation::SetTypeIndex(int regionTypeIndex) {
+ row[REGIONTYPEINDEXCOL] = regionTypeIndex;
+ return *this;
+}
+
+inline
+int RegionAnnotation::GetStart(void) const {
+ return row[REGIONSTARTCOL];
+}
+
+inline
+RegionAnnotation & RegionAnnotation::SetStart(int start) {
+ row[REGIONSTARTCOL] = start;
+ return *this;
+}
+
+inline
+int RegionAnnotation::GetEnd(void) const {
+ return row[REGIONENDCOL];
+}
+
+inline
+RegionAnnotation & RegionAnnotation::SetEnd(int end) {
+ row[REGIONENDCOL] = end;
+ return *this;
+}
+
+inline
+int RegionAnnotation::GetScore(void) const {
+ return row[REGIONSCORECOL];
+}
+
+inline
+RegionAnnotation & RegionAnnotation::SetScore(int score) {
+ row[REGIONSCORECOL] = score;
+ return *this;
+}
+
+inline
+bool compare_region_annotation_by_type(const RegionAnnotation & lhs,
+ const RegionAnnotation & rhs)
+{
+ if (lhs.GetHoleNumber() == rhs.GetHoleNumber()) {
+ if (lhs.GetTypeIndex() == rhs.GetTypeIndex()) {
+ if (lhs.GetStart() == rhs.GetStart()) {
+ if (lhs.GetEnd() == rhs.GetEnd())
+ return lhs.GetScore() < rhs.GetScore();
+ else return lhs.GetEnd() > rhs.GetEnd();
+ } else return lhs.GetStart() < rhs.GetStart();
+ } else return lhs.GetTypeIndex() < rhs.GetTypeIndex();
+ } else {
+ return lhs.GetHoleNumber() < rhs.GetHoleNumber();
+ }
+}
+
+#endif // _BLASR_REGION_ANNOTATION_HPP_
diff --git a/pbdata/reads/RegionAnnotations.cpp b/pbdata/reads/RegionAnnotations.cpp
new file mode 100644
index 0000000..22a5e3f
--- /dev/null
+++ b/pbdata/reads/RegionAnnotations.cpp
@@ -0,0 +1,179 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+
+#include "RegionAnnotations.hpp"
+#include <algorithm>
+#include <cassert>
+
+
+RegionAnnotations::RegionAnnotations(const UInt holeNumber,
+ const std::vector<RegionAnnotation> & annotations,
+ const std::vector<RegionType> & types)
+ : holeNumber_(holeNumber)
+ , table_(annotations)
+ , types_(types)
+{
+ for (auto annotation: annotations) {
+ // Only allow RegionAnnotations of a single ZMW
+ if (holeNumber_ != annotation.GetHoleNumber()) {
+ assert(false && "RegionAnnotations must contain regions from a single ZMW");
+ }
+ }
+
+ std::sort(table_.begin(), table_.end(), compare_region_annotation_by_type);
+}
+
+RegionAnnotations::RegionAnnotations(const RegionAnnotations & rhs)
+ : holeNumber_(rhs.holeNumber_)
+ , table_(rhs.table_)
+ , types_(rhs.types_)
+{ }
+
+UInt RegionAnnotations::HoleNumber(void) const {
+ return holeNumber_;
+}
+
+std::vector<RegionAnnotation>
+RegionAnnotations::RegionAnnotationsOfType(RegionType type) const {
+
+ std::vector<RegionAnnotation> ret;
+ int typeIndex = RegionTypeMap::ToIndex(type, types_);
+ if (typeIndex >= 0) {
+ for (auto ra: table_)
+ if (ra.GetTypeIndex() == typeIndex) ret.push_back(ra);
+ sort(ret.begin(), ret.end());
+ }
+ return ret;
+}
+
+std::vector<RegionAnnotation> RegionAnnotations::Adapters() const {
+ return RegionAnnotationsOfType(Adapter);
+}
+
+bool RegionAnnotations::HasHQRegion() const {
+ return (HQRegions().size() >= 1 and
+ HQEnd() - HQStart() > 0);
+}
+
+std::vector<RegionAnnotation>
+RegionAnnotations::HQRegions() const {
+ return RegionAnnotationsOfType(HQRegion);
+}
+
+RegionAnnotation
+RegionAnnotations::TheHQRegion() const {
+    // Returns this ZMW's HQRegion; more than one is rejected in debug builds.
+    const std::vector<RegionAnnotation> hqs_ = HQRegions();
+    assert(hqs_.size() <= 1 && "Zmw has more than one HQRegion.");
+    if (hqs_.empty())  // no HQRegion: return an empty [0, 0) region with score 0
+        return RegionAnnotation(holeNumber_, RegionTypeMap::ToIndex(HQRegion, types_), 0, 0, 0);
+    return hqs_[0];    // also reached when NDEBUG disables the assert above
+}
+
+DNALength RegionAnnotations::HQStart() const {
+ return TheHQRegion().GetStart();
+}
+
+DNALength RegionAnnotations::HQEnd() const {
+ return TheHQRegion().GetEnd();
+}
+
+int RegionAnnotations::HQScore() const {
+ return TheHQRegion().GetScore();
+}
+
+std::vector<RegionAnnotation>
+RegionAnnotations::Inserts() const {
+ return RegionAnnotationsOfType(Insert);
+}
+
+std::vector<ReadInterval>
+RegionAnnotations::AdapterIntervals() const {
+ std::vector<ReadInterval> ret;
+ for (auto adapter: Adapters()) {
+ ret.push_back(ReadInterval(adapter));
+ }
+ return ret;
+}
+
+std::vector<ReadInterval>
+RegionAnnotations::SubreadIntervals(const DNALength wholeLength,
+ const bool byAdapter,
+ const bool byHQRegion) const {
+ std::vector<RegionAnnotation> inserts;
+ if (not byAdapter) {
+ inserts = Inserts();
+ } else {
+ if (Adapters().size() != 0) {
+ // Must have at least one adapter in order to find inserts by adapter.
+ std::vector<DNALength> starts, ends;
+ starts.push_back(0);
+ for(auto adapter: Adapters()) {
+ assert(wholeLength >= adapter.GetStart() and
+ wholeLength >= adapter.GetEnd()); // bug if fail assert
+ starts.push_back(adapter.GetEnd());
+ ends.push_back(adapter.GetStart());
+ }
+ ends.push_back(wholeLength);
+
+ for (size_t i = 0; i < starts.size(); i++) {
+ // Use adapter to infer subreads, read score considered unknown.
+ if (ends[i] > starts[i]) {
+ inserts.push_back(RegionAnnotation(holeNumber_, Insert, starts[i], ends[i], 0));
+ }
+ }
+ } // else no inserts can be found
+ }
+
+ std::vector<ReadInterval> ret;
+ for (auto insert: inserts) {
+ if (byHQRegion) {
+ if (HasHQRegion()) {
+ DNALength s = std::max(static_cast<UInt>(insert.GetStart()), HQStart());
+ DNALength e = std::min(static_cast<UInt>(insert.GetEnd()), HQEnd());
+ if (s < e) {
+ // subreads' read score = HQRegion score.
+ ret.push_back(ReadInterval(s, e, HQScore()));
+ }
+ } // else ret = {}
+ } else {
+ ret.push_back(ReadInterval(insert));
+ }
+ }
+ return ret;
+}
diff --git a/pbdata/reads/RegionAnnotations.hpp b/pbdata/reads/RegionAnnotations.hpp
new file mode 100644
index 0000000..8a16490
--- /dev/null
+++ b/pbdata/reads/RegionAnnotations.hpp
@@ -0,0 +1,122 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#ifndef _PBDATA_READS_REGION_ANNOTATIONS_HPP_
+#define _PBDATA_READS_REGION_ANNOTATIONS_HPP_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "ReadInterval.hpp"
+#include "RegionAnnotation.hpp"
+
+/// Region annotations of a single zmw, plus helpers that derive adapter
+/// and subread intervals from them.
+class RegionAnnotations {
+
+ /// \name Region Annotations of a single ZMW
+ /// \{
+private:
+ /// Region annotations (region table rows) of this zmw.
+ std::vector<RegionAnnotation> table_;
+
+ /// Hole number of this zmw.
+ UInt holeNumber_;
+
+ /// Region types in order.
+ std::vector<RegionType> types_;
+
+public:
+ /// \param[in] holeNumber  hole number of the zmw.
+ /// \param[in] annotations region annotations of the zmw.
+ /// \param[in] types       region types in order.
+ RegionAnnotations(const UInt holeNumber,
+ const std::vector<RegionAnnotation> & annotations,
+ const std::vector<RegionType> & types);
+
+ /// Copy constructor.
+ RegionAnnotations(const RegionAnnotations & rhs);
+
+ ~RegionAnnotations() {}
+
+ /// \returns zmw holeNumber.
+ UInt HoleNumber(void) const;
+
+ /// \returns sorted adapters of this zmw
+ std::vector<RegionAnnotation> Adapters() const;
+
+ /// \returns whether or not has HQ region specified in table.
+ bool HasHQRegion() const;
+
+ /// \returns exactly one HQ region of this zmw.
+ /// \note If no HQ region exists, return a RegionAnnotation of length 0.
+ /// If more than one HQ region is found for this zmw, raise an assertion error.
+ RegionAnnotation TheHQRegion() const;
+
+ /// \returns HQ start position of this zmw.
+ DNALength HQStart() const;
+
+ /// \returns HQ end position of this zmw, i.e. the end of TheHQRegion().
+ DNALength HQEnd() const;
+
+ /// \returns HQ score of this zmw, i.e. the score of TheHQRegion().
+ int HQScore() const;
+
+ /// \returns sorted insert regions of this zmw.
+ std::vector<RegionAnnotation> Inserts() const;
+
+ /// \returns a vector of all adapters, one ReadInterval per entry of
+ /// Adapters().
+ std::vector<ReadInterval> AdapterIntervals() const;
+
+ /// \returns a vector of all subreads
+ /// \param[in] wholeLength Length of unrolled sequence of this zmw. Note that
+ /// this piece of info does not exist in region table.
+ /// \param[in] byAdapter false: return inserts in region table directly.
+ /// true : infer inserts according to adapters;
+ /// no insert is inferred if there is no adapter.
+ /// \param[in] byHQRegion false: inserts may contain both HQ and LQ regions
+ /// true : inserts clipped to the HQ region only, scored
+ /// with the HQ score; empty if no HQ region exists.
+ std::vector<ReadInterval>
+ SubreadIntervals(const DNALength wholeLength,
+ const bool byAdapter = true,
+ const bool byHQRegion = true) const;
+
+private:
+ /// \returns sorted vector of region annotations of a RegionType.
+ std::vector<RegionAnnotation>
+ RegionAnnotationsOfType(RegionType type) const;
+
+ /// \returns HQ regions of this zmw.
+ std::vector<RegionAnnotation> HQRegions() const;
+
+ /// \}
+};
+
+#endif
diff --git a/pbdata/reads/RegionTable.cpp b/pbdata/reads/RegionTable.cpp
index 54582a7..e50c699 100644
--- a/pbdata/reads/RegionTable.cpp
+++ b/pbdata/reads/RegionTable.cpp
@@ -1,159 +1,136 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+// Modified by: Yuan Li
+
+
#include <algorithm>
+#include <iostream>
+#include <ostream>
#include "RegionTable.hpp"
using namespace std;
-RegionAnnotation& RegionAnnotation::operator=(const RegionAnnotation &rhs) {
- memcpy(row, rhs.row, sizeof(int)*NCOLS);
- return *this;
-}
-int RegionAnnotation::GetHoleNumber() {
- return row[HoleNumber];
-}
-
-void RegionAnnotation::SetHoleNumber(int holeNumber) {
- row[HoleNumber] = holeNumber;
-}
-int RegionAnnotation::GetType() const {
- return row[RegionType];
-}
-
-void RegionAnnotation::SetType(int regionType) {
- row[RegionType] = regionType;
-}
-
-int RegionAnnotation::GetStart() {
- return row[RegionStart];
-}
-
-void RegionAnnotation::SetStart(int start) {
- row[RegionStart] = start;
-}
-int RegionAnnotation::GetEnd() {
- return row[RegionEnd];
-}
-
-void RegionAnnotation::SetEnd(int end) {
- row[RegionEnd] = end;
-}
-
-int RegionAnnotation::GetScore() {
- return row[RegionScore];
+/// Empty the hole-number map and every region table attribute.
+/// \returns *this
+RegionTable & RegionTable::Reset() {
+    // The containers are independent of each other; attributes are
+    // cleared first, then the per-zmw map.
+    regionTypeEnums.clear();
+    regionSources.clear();
+    regionDescriptions.clear();
+    regionTypes.clear();
+    columnNames.clear();
+    map_.clear();
+    return *this;
}
-void RegionAnnotation::SetScore(int score) {
- row[RegionScore] = score;
-}
+/// \returns a copy of the stored RegionType enums.
+std::vector<RegionType> RegionTable::RegionTypeEnums(void) const {
+    return this->regionTypeEnums;
+}
-int RegionTable::LookupRegionsByHoleNumber(int holeNumber, int &low, int &high) const {
- std::vector<RegionAnnotation>::const_iterator lowIt, highIt;
- lowIt = std::lower_bound(table.begin(), table.end(), holeNumber);
- highIt = std::lower_bound(table.begin(), table.end(), holeNumber+1);
- low = lowIt - table.begin();
- high = highIt - table.begin();
- return high-low;
-}
+/// \returns a copy of the stored region type strings.
+std::vector<std::string> RegionTable::RegionTypes(void) const {
+    return this->regionTypes;
+}
-//
-// Define a bunch of accessor functions.
-//
+/// \returns a copy of the stored column names.
+std::vector<std::string> RegionTable::ColumnNames(void) const {
+    return this->columnNames;
+}
-//
-// Different region tables have different ways of encoding regions.
-// This maps from the way they are encoded in the rgn table to a
-// standard encoding.
-//
+/// \returns a copy of the stored region descriptions.
+std::vector<std::string> RegionTable::RegionDescriptions(void) const {
+    return this->regionDescriptions;
+}
-RegionType RegionTable::GetType(int regionIndex) const {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- return (RegionType) regionTypeEnums[table[regionIndex].GetType()];
-}
+/// \returns a copy of the stored region sources.
+std::vector<std::string> RegionTable::RegionSources(void) const {
+    return this->regionSources;
+}
-int RegionTable::GetStart(int regionIndex) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- return table[regionIndex].GetStart();
-}
+/// Build map_ (hole number --> RegionAnnotations) from a flat region table.
+/// \param[in,out] table       region annotations of all zmws; sorted in
+///                            place with compare_region_annotation_by_type.
+/// \param[in] regionTypeStrs  ordered region type strings; forwarded to
+///                            RegionTypes() to set regionTypes and
+///                            regionTypeEnums.
+/// \returns *this
+/// \note Groups are added with map_.insert(), which keeps an existing
+///       entry for a hole number that is already present in map_.
+RegionTable & RegionTable::ConstructTable(std::vector<RegionAnnotation> & table,
+                                          const std::vector<std::string> & regionTypeStrs) {
+    RegionTypes(regionTypeStrs); //< Set both regionTypes and regionTypeEnums.
-void RegionTable::SetStart(int regionIndex, int start) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- table[regionIndex].SetStart(start);
-}
+    // Must sort region annotations by HoleNumber, RegionTypeIndex, Start, End, and Score
+    // NOTE(review): the grouping below relies on the comparator ordering by
+    // hole number first -- confirm against RegionAnnotation.hpp.
+    std::sort(table.begin(), table.end(), compare_region_annotation_by_type);
-int RegionTable::GetEnd(int regionIndex) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- return table[regionIndex].GetEnd();
-}
+    // Construct map_<holeNumber, RegionAnnotations>: cut a new group each
+    // time the hole number increases.
+    if (table.size() > 0) {
+        UInt pre_hn = table[0].GetHoleNumber();
+        auto itBegin = table.begin();
+        for (auto it = table.begin(); it != table.end(); ++it) {
+            if (it->GetHoleNumber() > pre_hn) {
+                map_.insert(std::pair<UInt, RegionAnnotations>(pre_hn,
+                            RegionAnnotations(pre_hn,
+                                              std::vector<RegionAnnotation>(itBegin, it),
+                                              regionTypeEnums)));
+                pre_hn = it->GetHoleNumber();
+                itBegin = it;
+            }
+        }
-void RegionTable::SetEnd(int regionIndex, int end) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- table[regionIndex].SetEnd(end);
+        // Flush the group of the last hole number.
+        map_.insert(std::pair<UInt, RegionAnnotations>(pre_hn,
+                    RegionAnnotations(pre_hn,
+                                      std::vector<RegionAnnotation>(itBegin, table.end()),
+                                      regionTypeEnums)));
+    }
+    // The function is declared to return RegionTable &; flowing off its
+    // end without a return statement is undefined behaviour.
+    return *this;
}
-int RegionTable::GetHoleNumber(int regionIndex) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- return table[regionIndex].GetHoleNumber();
+/// \returns the RegionType of every entry of
+///          PacBio::AttributeValues::Regions::regiontypes, in that order.
+std::vector<RegionType> RegionTable::DefaultRegionTypes(void) {
+    std::vector<RegionType> defaults;
+    for (const std::string & name: PacBio::AttributeValues::Regions::regiontypes) {
+        const RegionType type = RegionTypeMap::ToRegionType(name);
+        defaults.push_back(type);
+    }
+    return defaults;
}
-void RegionTable::SetHoleNumber(int regionIndex, int holeNumber) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- table[regionIndex].SetHoleNumber(holeNumber);
+/// Set the region type strings and the matching RegionType enums.
+/// \param[in] regionTypeStrs ordered region type strings; each is converted
+///                           with RegionTypeMap::ToRegionType().
+/// \returns *this
+RegionTable & RegionTable::RegionTypes(const std::vector<std::string> & regionTypeStrs) {
+    // Build the enums in a scratch vector and swap it in, so that a
+    // repeated call replaces regionTypeEnums instead of appending to it
+    // and leaving it out of sync with regionTypes.
+    std::vector<RegionType> enums;
+    enums.reserve(regionTypeStrs.size());
+    for (const std::string & regionTypeString: regionTypeStrs) {
+        enums.push_back(RegionTypeMap::ToRegionType(regionTypeString));
+    }
+    regionTypes = regionTypeStrs;
+    regionTypeEnums.swap(enums);
+    return *this;
}
-int RegionTable::GetScore(int regionIndex) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- return table[regionIndex].row[RegionAnnotation::RegionScore];
-}
+/// Replace the stored column names with a copy of \p in.
+/// \returns *this
+RegionTable & RegionTable::ColumnNames(const std::vector<std::string> & in) {
+    this->columnNames = in;
+    return *this;
+}
-void RegionTable::SetScore(int regionIndex, int score) {
- assert(regionIndex < table.size());
- assert(regionIndex >= 0);
- table[regionIndex].row[RegionAnnotation::RegionScore] = score;
-}
+/// Replace the stored region descriptions with a copy of \p in.
+/// \returns *this
+RegionTable & RegionTable::RegionDescriptions(const std::vector<std::string> & in) {
+    this->regionDescriptions = in;
+    return *this;
+}
-void RegionTable::SortTableByHoleNumber() {
- std::stable_sort(table.begin(), table.end());
-}
+/// Replace the stored region sources with a copy of \p in.
+/// \returns *this
+RegionTable & RegionTable::RegionSources(const std::vector<std::string> & in) {
+    this->regionSources = in;
+    return *this;
+}
-void RegionTable::Reset() {
- table.clear();
- columnNames.clear();
- regionTypes.clear();
- regionDescriptions.clear();
- regionSources.clear();
- regionTypeEnums.clear();
+/// \returns true if map_ holds region annotations for \p holeNumber.
+bool RegionTable::HasHoleNumber(const UInt holeNumber) const {
+    return map_.count(holeNumber) != 0;
}
-void RegionTable::CreateDefaultAttributes() {
- columnNames.clear();
- columnNames.push_back("HoleNumber");
- columnNames.push_back("Region type index");
- columnNames.push_back("Region start in bases");
- columnNames.push_back("Region end in bases");
- columnNames.push_back("Region score");
-
- regionTypes.push_back("Adapter");
- regionTypes.push_back("Insert");
- regionTypes.push_back("HQRegion");
-
- regionDescriptions.push_back("Adapter Hit");
- regionDescriptions.push_back("Insert Region");
- regionDescriptions.push_back("High Quality bases region. Score is 1000 * "
- "predicted accuracy, where predicted accuary is 0 to 1.0");
-
- regionSources.push_back("AdapterFinding");
- regionSources.push_back("AdapterFinding");
- regionSources.push_back("PulseToBase Region classifer");
-
- regionTypeEnums.push_back(Adapter);
- regionTypeEnums.push_back(Insert);
- regionTypeEnums.push_back(HQRegion);
+/// \returns a copy of the RegionAnnotations stored for \p holeNumber.
+/// The hole number must be present in the table; this is only checked by
+/// an assertion.
+RegionAnnotations RegionTable::operator [] (const UInt holeNumber) const {
+    // Must check whether a zmw exists or not first.
+    assert (HasHoleNumber(holeNumber)
+            && "Could not find zmw in region table.");
+    std::map<UInt, RegionAnnotations>::const_iterator hit = map_.find(holeNumber);
+    return hit->second;
}
diff --git a/pbdata/reads/RegionTable.hpp b/pbdata/reads/RegionTable.hpp
index 497e625..f5b750a 100644
--- a/pbdata/reads/RegionTable.hpp
+++ b/pbdata/reads/RegionTable.hpp
@@ -1,95 +1,145 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#ifndef _BLASR_REGION_TABLE_HPP_
#define _BLASR_REGION_TABLE_HPP_
#include <cassert>
#include <cstring>
#include <string>
+#include <iostream>
#include <vector>
+#include <map>
+#include <ostream>
+#include "Types.h"
#include "Enumerations.h"
+#include "PacBioDefs.h"
+#include "RegionAnnotation.hpp"
+#include "RegionAnnotations.hpp"
-class RegionAnnotation {
-public:
- typedef enum T_AnnotationRow {HoleNumber, RegionType, RegionStart,
- RegionEnd, RegionScore} AnnotationRow;
- static const int NCOLS=5;
- int row[NCOLS];
-
- inline
- bool operator<(const RegionAnnotation &rhs) const
- { return row[HoleNumber] < rhs.row[HoleNumber]; }
-
- inline
- bool operator<(int holeNumber) const
- { return row[HoleNumber] < holeNumber; }
-
- RegionAnnotation& operator=(const RegionAnnotation &rhs);
-
- int GetHoleNumber();
-
- void SetHoleNumber(int holeNumber);
-
- int GetType() const;
-
- void SetType(int regionType);
-
- int GetStart();
-
- void SetStart(int start);
-
- int GetEnd();
-
- void SetEnd(int end);
-
- int GetScore();
-
- void SetScore(int score);
-};
class RegionTable {
-public:
- std::vector<RegionAnnotation> table;
+private:
+ /// Region table, as read from the 'Regions' dataset of an h5 file.
+ /// \name Member variables
+ /// \{
+ /// Maps a zmw hole number to the RegionAnnotations of that zmw.
+ std::map<UInt, RegionAnnotations> map_;
+ /// \}
+
+ /// \name Region table attributes.
+ /// \{
std::vector<std::string> columnNames;
std::vector<std::string> regionTypes;
std::vector<std::string> regionDescriptions;
std::vector<std::string> regionSources;
- std::vector<RegionType> regionTypeEnums;
+ std::vector<RegionType> regionTypeEnums;
+ /// \}
+
+public:
+ /// \name Constructor & destructor & reset
+ /// \{
+ RegionTable() {}
- int LookupRegionsByHoleNumber(int holeNumber, int &low, int &high) const;
+ ~RegionTable() {}
- //
- // Define a bunch of accessor functions.
- //
+ /// Clears the hole-number map and all region table attributes.
+ /// \returns *this
+ RegionTable& Reset();
+ /// \}
- //
// Different region tables have different ways of encoding regions.
// This maps from the way they are encoded in the rgn table to a
// standard encoding.
//
+ /// \name Accessor functions to region table attributes.
+ /// \{
- RegionType GetType(int regionIndex) const;
+ /// \returns *default PacBio* region types (order matters), converted
+ /// from PacBio::AttributeValues::Regions::regiontypes.
+ static std::vector<RegionType> DefaultRegionTypes(void);
- int GetStart(int regionIndex);
+ /// \returns a copy of the RegionType enums (order matters).
+ std::vector<RegionType> RegionTypeEnums(void) const;
- void SetStart(int regionIndex, int start);
+ /// \returns a copy of the RegionType strings, in order.
+ std::vector<std::string> RegionTypes(void) const;
- int GetEnd(int regionIndex);
+ /// \returns a copy of the column names.
+ std::vector<std::string> ColumnNames(void) const;
- void SetEnd(int regionIndex, int end);
+ /// \returns a copy of the region descriptions.
+ std::vector<std::string> RegionDescriptions(void) const;
- int GetHoleNumber(int regionIndex);
+ /// \returns a copy of the region sources.
+ std::vector<std::string> RegionSources(void) const;
- void SetHoleNumber(int regionIndex, int holeNumber);
+ /// Construct map_ (holeNumber --> RegionAnnotations) from table.
+ /// \param[in,out] table region table containing region annotations of
+ /// all zmws; it is sorted in place.
+ /// \param[in] regionTypeStrs ordered region type strings, which map
+ /// region types to region type indices.
+ /// \returns *this
+ RegionTable & ConstructTable(std::vector<RegionAnnotation> & table,
+ const std::vector<std::string> & regionTypeStrs);
- int GetScore(int regionIndex);
+ /// Set region type strings and the matching RegionType enums.
+ /// Note that the ORDER of region types does matter.
+ /// \returns *this
+ RegionTable & RegionTypes(const std::vector<std::string> & in);
- void SetScore(int regionIndex, int score);
+ /// Set column names, e.g.,
+ /// {"HoleNumber", "TypeIndex", "Start", "End", "Score"}
+ /// \returns *this
+ RegionTable & ColumnNames(const std::vector<std::string> & in);
- void SortTableByHoleNumber();
+ /// Set region descriptions.
+ /// NOTE(review): the previous default attributes held one description
+ /// per region type, e.g. {"Adapter Hit", "Insert Region", ...} --
+ /// confirm that callers still pass one entry per region type.
+ /// \returns *this
+ RegionTable & RegionDescriptions(const std::vector<std::string> & in);
- void Reset();
+ /// Set region sources.
+ /// NOTE(review): the previous default attributes held one source per
+ /// region type, e.g. {"AdapterFinding", "AdapterFinding", ...} --
+ /// confirm that callers still pass one entry per region type.
+ /// \returns *this
+ RegionTable & RegionSources(const std::vector<std::string> & in);
+ /// \}
- void CreateDefaultAttributes();
-};
+ /// \name Accessor functions to zmw region annotations.
+ /// \{
+ /// \returns Whether or not this region table has regions of a zmw.
+ bool HasHoleNumber(const UInt holeNumber) const;
+ /// Get zmw region annotations given its hole number.
+ /// Note that HasHoleNumber must be called first: a missing hole number
+ /// is only caught by an assertion.
+ /// \returns a copy of the RegionAnnotations of a zmw.
+ RegionAnnotations operator [] (const UInt holeNumber) const;
+ /// \}
+};
#endif // _BLASR_REGION_TABLE_HPP_
diff --git a/pbdata/reads/RegionTypeMap.cpp b/pbdata/reads/RegionTypeMap.cpp
new file mode 100644
index 0000000..e6e4f4a
--- /dev/null
+++ b/pbdata/reads/RegionTypeMap.cpp
@@ -0,0 +1,89 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "RegionTypeMap.hpp"
+#include <cstdlib>
+
+/// \returns the string registered for \p rt in RegionTypeToString.
+/// An unregistered region type is fatal: an error is printed, then the
+/// assertion fires, and the program exits even when asserts are compiled
+/// out (the lookup used to dereference end() in that case).
+std::string RegionTypeMap::ToString(RegionType rt) {
+    const auto it = RegionTypeToString.find(rt);
+    if (it == RegionTypeToString.end()) {
+        std::cout << "Unsupported RegionType " << static_cast<int>(rt) << std::endl;
+        assert(false);
+        std::exit(EXIT_FAILURE);
+    }
+    return it->second;
+}
+
+/// \returns the RegionType registered for \p str in StringToRegionType.
+/// An unknown string is fatal: an error is printed, then the assertion
+/// fires, and the program exits even when asserts are compiled out (the
+/// lookup used to dereference end() in that case).
+RegionType RegionTypeMap::ToRegionType(const std::string & str) {
+    const auto it = StringToRegionType.find(str);
+    if (it == StringToRegionType.end()) {
+        std::cout << "Unsupported RegionType " << str << std::endl;
+        assert(false);
+        std::exit(EXIT_FAILURE);
+    }
+    return it->second;
+}
+
+/// \param[in] typeStr  query region type as a string
+/// \param[in] typeStrs region type strings in order
+/// \returns position of the first occurrence of \p typeStr in \p typeStrs.
+/// A missing type is fatal: an error is printed, then the assertion fires,
+/// and the program exits even when asserts are compiled out (the function
+/// used to flow off its end without a return value in that case).
+int RegionTypeMap::ToIndex(const std::string & typeStr, const std::vector<std::string> & typeStrs) {
+    const auto it = std::find(typeStrs.begin(), typeStrs.end(), typeStr);
+    if (it == typeStrs.end()) {
+        std::cout << "Could not find RegionType " << typeStr << std::endl;
+        assert(false);
+        std::exit(EXIT_FAILURE);
+    }
+    return static_cast<int>(std::distance(typeStrs.begin(), it));
+}
+
+/// Convert \p rt to its string with ToString(), then look that string up
+/// in \p typeStrs with the string overload of ToIndex().
+/// \returns index of the region type in \p typeStrs
+int RegionTypeMap::ToIndex(RegionType rt, const std::vector<std::string> & typeStrs) {
+    const std::string typeStr = RegionTypeMap::ToString(rt);
+    return RegionTypeMap::ToIndex(typeStr, typeStrs);
+}
+
+/// \param[in] rt          query region type
+/// \param[in] regionTypes region type enums in order
+/// \returns position of the first occurrence of \p rt in \p regionTypes.
+/// A missing type is fatal: an error is printed, then the assertion fires,
+/// and the program exits even when asserts are compiled out (the function
+/// used to flow off its end without a return value in that case).
+int RegionTypeMap::ToIndex(RegionType rt, const std::vector<RegionType> & regionTypes) {
+    const auto it = std::find(regionTypes.begin(), regionTypes.end(), rt);
+    if (it == regionTypes.end()) {
+        std::cout << "Could not find RegionType " << RegionTypeMap::ToString(rt) << std::endl;
+        assert(false);
+        std::exit(EXIT_FAILURE);
+    }
+    return static_cast<int>(std::distance(regionTypes.begin(), it));
+}
+
+// Lookup table used by ToString(): region type enum --> region type string.
+// Note that the enum is spelled BarCode while its string is "Barcode".
+// Keep the entries in step with StringToRegionType.
+const std::map<RegionType, std::string> RegionTypeMap::RegionTypeToString = {
+ {Adapter, "Adapter"},
+ {Insert, "Insert"},
+ {HQRegion, "HQRegion"},
+ {BarCode, "Barcode"}
+};
+
+// Lookup table used by ToRegionType(): region type string --> region type
+// enum. The keys are matched exactly, e.g. "Barcode" maps to BarCode.
+// Keep the entries in step with RegionTypeToString.
+const std::map<std::string, RegionType> RegionTypeMap::StringToRegionType = {
+ {"Adapter", Adapter},
+ {"Insert", Insert},
+ {"HQRegion", HQRegion},
+ {"Barcode", BarCode},
+};
diff --git a/pbdata/reads/RegionTypeMap.hpp b/pbdata/reads/RegionTypeMap.hpp
new file mode 100644
index 0000000..0b2903c
--- /dev/null
+++ b/pbdata/reads/RegionTypeMap.hpp
@@ -0,0 +1,85 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#ifndef _BLASR_REGION_TYPE_MAP_HPP_
+#define _BLASR_REGION_TYPE_MAP_HPP_
+
+#include <cassert>
+#include <string>
+#include <iostream>
+#include <vector>
+#include <map>
+#include <algorithm>
+#include "Types.h"
+#include "Enumerations.h"
+
+
+/// Static helpers that convert a RegionType to and from its string name,
+/// and that find the index of a region type in an ordered list.
+class RegionTypeMap {
+public:
+ /// \name Map region type to/from string and index
+ /// \{
+ /// \returns the string registered for region type \p rt.
+ static std::string ToString(RegionType rt);
+
+ /// \returns the region type registered for string \p str.
+ static RegionType ToRegionType(const std::string & str);
+
+ /// \param[in] typeStr - query region type as a string
+ /// \param[in] typeStrs - a vector of region type strings in order
+ /// \returns index of a region type as string in a vector of region type strings
+ static int ToIndex(const std::string & typeStr,
+ const std::vector<std::string> & typeStrs);
+
+ /// \param[in] rt - query region type
+ /// \param[in] typeStrs - a vector of region type strings in order
+ /// \returns index of the query region type in a vector of region type strings
+ static int ToIndex(RegionType rt,
+ const std::vector<std::string> & typeStrs);
+
+ /// \param[in] rt - query region type
+ /// \param[in] regionTypes - a vector of region type enums in order
+ /// \returns index of the query region type in a vector of region type enums
+ static int ToIndex(RegionType rt,
+ const std::vector<RegionType> & regionTypes);
+private:
+ // Map region type to string
+ static const std::map<RegionType, std::string> RegionTypeToString;
+
+ // Map string to region type
+ static const std::map<std::string, RegionType> StringToRegionType;
+ /// \}
+};
+
+#endif // _BLASR_REGION_TYPE_MAP_HPP_
diff --git a/pbdata/reads/ScanData.cpp b/pbdata/reads/ScanData.cpp
index 3114376..37699f8 100644
--- a/pbdata/reads/ScanData.cpp
+++ b/pbdata/reads/ScanData.cpp
@@ -1,8 +1,87 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#include "ScanData.hpp"
+#include <iostream>
+
+/// Convert a base map (base character --> dye channel index) into a string
+/// whose character at position i is the base mapped to channel i.
+/// \returns "" for an empty map; otherwise a 4-character string in which
+///          channels that no base maps to are left as ' '.
+/// Prints an error and exits with status 1 if a channel index is not one
+/// of 0..3.
+std::string ScanData::BaseMapToStr(const std::map<char, size_t> & baseMap) {
+    const size_t numChannels = 4; // 4 dye channels.
+    std::string baseMapStr = "";
+    if (not baseMap.empty()) {
+        // Size the buffer from the channel count instead of relying on the
+        // number of blanks inside a string literal.
+        baseMapStr = std::string(numChannels, ' ');
+        for (auto it = baseMap.begin(); it != baseMap.end(); ++it) {
+            // Valid indices are 0..3. The former test (> 4) let index 4
+            // through, which writes past the last character; the test
+            // against 0 was always false for an unsigned index.
+            if (it->second >= numChannels) {
+                std::cout << "ERROR, there are more than four dye channels."
+                          << std::endl;
+                exit(1);
+            }
+            baseMapStr[it->second] = it->first;
+        }
+    }
+    return baseMapStr;
+}
+
+/// Convert a base map string into a map from each character to its
+/// position in the string.
+/// \returns the map; if a character occurs more than once, its last
+///          position wins because later assignments overwrite earlier ones.
+std::map<char, size_t> ScanData::StrToBaseMap(const std::string & baseMapStr) {
+    std::map<char, size_t> ret;
+    // Index with size_t: the former int index was compared against the
+    // unsigned size() and then converted into the size_t mapped value.
+    for (size_t i = 0; i < baseMapStr.size(); i++) {
+        ret[baseMapStr[i]] = i;
+    }
+    return ret;
+}
+
+/// Check that a base map assigns 'A', 'T', 'G' and 'C' to the four channel
+/// indices 0..3, each index being used exactly once.
+/// \returns true if the map is valid, false otherwise.
+bool ScanData::IsValidBaseMap(const std::map<char, size_t> & baseMap) {
+    const char X = 'x';
+    std::string v(4, X); // v[i] stays X until some base claims channel i.
+
+    for (const char base : {'A', 'T', 'G', 'C'}) {
+        const auto it = baseMap.find(base);
+        // Test for a missing base BEFORE reading the index: the former
+        // code dereferenced the result of find() first, which is undefined
+        // when the base is absent.
+        if (it == baseMap.end() or it->second > 3) {
+            return false;
+        }
+        v[it->second] = 'o';
+    }
+    // A channel that is still X means two bases shared another channel.
+    return v.find(X) == std::string::npos;
+}
+/// Default constructor: no platform, zero frame rate and frame count, empty
+/// movie name, run code and start time, and an empty base map.
ScanData::ScanData() {
platformId = NoPlatform;
- frameRate = numFrames = 0;
+ // NOTE(review): set separately, presumably because the two members have
+ // different types (the accessors use float and unsigned int) -- confirm.
+ frameRate = 0.0;
+ numFrames = 0;
movieName = runCode = whenStarted = "";
baseMap.clear();
}
@@ -10,3 +89,77 @@ ScanData::ScanData() {
std::string ScanData::GetMovieName() {
return movieName;
}
+
+ScanData & ScanData::PlatformID(const PlatformId & id) { // Setter: stores id in platformId; returns *this so calls can be chained.
+ platformId = id;
+ return *this;
+}
+ScanData & ScanData::FrameRate(const float & rate) { // Setter: stores rate in frameRate; returns *this for chaining.
+ frameRate = rate;
+ return *this;
+}
+ScanData & ScanData::NumFrames(const unsigned int & num) { // Setter: stores num in numFrames; returns *this for chaining.
+ numFrames = num;
+ return *this;
+}
+ScanData & ScanData::MovieName(const std::string & name) { // Setter: copies name into movieName; returns *this for chaining.
+ movieName = name;
+ return *this;
+}
+ScanData & ScanData::RunCode(const std::string & code) { // Setter: copies code into runCode; returns *this for chaining.
+ runCode = code;
+ return *this;
+}
+ScanData & ScanData::WhenStarted(const std::string & when) { // Setter: copies when into whenStarted; returns *this for chaining.
+ whenStarted = when;
+ return *this;
+}
+ScanData & ScanData::BaseMap(const std::map<char, size_t> & bmp) {
+    // Replaces baseMap with a copy of bmp and returns *this for chaining.
+    // Assignment is alias-safe; clear() followed by insert() would empty bmp first when bmp is baseMap itself.
+    baseMap = bmp;
+    return *this;
+}
+ScanData & ScanData::BaseMap(const std::string & baseMapStr) { // Setter: converts baseMapStr with StrToBaseMap and forwards to the map overload.
+ return this->BaseMap(ScanData::StrToBaseMap(baseMapStr));
+}
+ScanData & ScanData::SequencingKit(const std::string sequencingKit) { // Setter: stores the (by-value) argument in sequencingKit_; returns *this for chaining.
+ sequencingKit_ = sequencingKit;
+ return *this;
+}
+ScanData & ScanData::BindingKit(const std::string bindingKit) { // Setter: stores the (by-value) argument in bindingKit_; returns *this for chaining.
+ bindingKit_ = bindingKit;
+ return *this;
+}
+
+PlatformId ScanData::PlatformID(void) const { // Getter: returns platformId.
+ return platformId;
+}
+float ScanData::FrameRate(void) const { // Getter: returns frameRate.
+ return frameRate;
+}
+unsigned int ScanData::NumFrames(void) const { // Getter: returns numFrames.
+ return numFrames;
+}
+std::string ScanData::MovieName(void) const { // Getter: returns a copy of movieName.
+ return movieName;
+}
+std::string ScanData::RunCode(void) const { // Getter: returns a copy of runCode.
+ return runCode;
+}
+std::string ScanData::WhenStarted(void) const { // Getter: returns a copy of whenStarted.
+ return whenStarted;
+}
+std::map<char, size_t> ScanData::BaseMap(void) const { // Getter: returns a copy of baseMap (by value, not a reference).
+ return baseMap;
+}
+
+std::string ScanData::BaseMapStr(void) const { // Getter: returns baseMap rendered as a string by BaseMapToStr.
+ return ScanData::BaseMapToStr(baseMap);
+}
+
+std::string ScanData::SequencingKit(void) const { // Getter: returns a copy of sequencingKit_.
+ return sequencingKit_;
+}
+std::string ScanData::BindingKit(void) const { // Getter: returns a copy of bindingKit_.
+ return bindingKit_;
+}
diff --git a/pbdata/reads/ScanData.hpp b/pbdata/reads/ScanData.hpp
index 8cf07ea..a68bb37 100644
--- a/pbdata/reads/ScanData.hpp
+++ b/pbdata/reads/ScanData.hpp
@@ -1,20 +1,104 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Mark Chaisson
+
#ifndef DATASTRUCTURES_READS_SCAN_DATA_H_
#define DATASTRUCTURES_READS_SCAN_DATA_H_
#include <string>
#include <map>
#include "Enumerations.h"
+#include "PacBioDefs.h"
+
+class HDFScanDataReader;
+class HDFScanDataWriter;
class ScanData {
+friend class HDFScanDataReader;
+friend class HDFScanDataWriter;
+public:
+ // Convert base map from a map<char, size_t> to a string.
+ // e.g., {{'A', 2}, {'C', 1}, {'T', 0}, {'G', 3}} --> TCAG
+ static std::string BaseMapToStr(const std::map<char, size_t> & baseMap);
+
+ // Convert base map from a string to a map<char, size_t>.
+ // e.g., TCAG --> {{'A', 2}, {'C', 1}, {'T', 0}, {'G', 3}}
+ static std::map<char, size_t> StrToBaseMap(const std::string & baseMapStr);
+
+ // A valid baseMap maps each of the bases A, C, G, T to its own index in 0..3.
+ static bool IsValidBaseMap(const std::map<char, size_t> & baseMap);
+
public:
PlatformId platformId;
float frameRate;
unsigned int numFrames;
std::string movieName, runCode;
std::string whenStarted;
- std::map<char, int> baseMap;
+ std::map<char, size_t> baseMap;
+
ScanData();
std::string GetMovieName();
+
+ ScanData & PlatformID(const PlatformId & id);
+ ScanData & FrameRate(const float & rate);
+ ScanData & NumFrames(const unsigned int & num);
+ ScanData & MovieName(const std::string & name);
+ ScanData & RunCode(const std::string & code);
+ ScanData & WhenStarted(const std::string & when);
+ ScanData & BaseMap(const std::map<char, size_t> & bmp);
+ ScanData & BaseMap(const std::string & baseMapStr);
+ ScanData & SequencingKit(const std::string sequencingKit);
+ ScanData & BindingKit(const std::string bindingKit);
+
+ PlatformId PlatformID(void) const;
+ float FrameRate(void) const;
+ unsigned int NumFrames(void) const;
+ std::string MovieName(void) const;
+ std::string RunCode(void) const;
+ std::string WhenStarted(void) const;
+ std::map<char, size_t> BaseMap(void) const;
+ std::string BaseMapStr(void) const;
+
+ std::string SequencingKit(void) const;
+ std::string BindingKit(void) const;
+
+
+private:
+ std::string sequencingKit_;
+ std::string bindingKit_;
};
#endif
diff --git a/pbdata/sam/SAMReaderImpl.hpp b/pbdata/sam/SAMReaderImpl.hpp
index fd83c76..b92ffc1 100644
--- a/pbdata/sam/SAMReaderImpl.hpp
+++ b/pbdata/sam/SAMReaderImpl.hpp
@@ -73,7 +73,7 @@ void SAMReader<T_ReferenceSequence, T_ReadGroup, T_SAMAlignment>::StoreKVPairs(s
// Split on tab delineated line.
//
std::vector<std::string> kvPairStrings;
- Tokenize(line, "\t", kvPairStrings);
+ Splice(line, "\t", kvPairStrings);
KeywordValueStringsToPairs(kvPairStrings, kvPairs);
}
diff --git a/pbdata/utils.hpp b/pbdata/utils.hpp
index 9686cb5..ed80f30 100644
--- a/pbdata/utils.hpp
+++ b/pbdata/utils.hpp
@@ -13,7 +13,10 @@ template<typename T_Int>
T_Int CeilOfFraction(T_Int num, T_Int denom);
template<typename T>
-T* ProtectedNew(unsigned long size);
+inline T* ProtectedNew(unsigned long size);
+
+template<typename T>
+inline T* ProtectedNew(void);
#include "utilsImpl.hpp"
diff --git a/pbdata/utils/SMRTReadUtils.cpp b/pbdata/utils/SMRTReadUtils.cpp
index 4bc2f6c..b2430b0 100644
--- a/pbdata/utils/SMRTReadUtils.cpp
+++ b/pbdata/utils/SMRTReadUtils.cpp
@@ -4,7 +4,7 @@
void GetSMRTReadCoordinates(FASTQSequence &seq, int &x, int &y) {
std::string str(seq.title, seq.titleLength);
std::vector<std::string> titleTokens;
- Tokenize(str, "_", titleTokens);
+ Splice(str, "_", titleTokens);
int i;
x = y = -1;
int cmp;
@@ -22,7 +22,7 @@ void GetSMRTReadCoordinates(FASTQSequence &seq, int &x, int &y) {
void GetSpringfieldHoleNumberFromTitle(FASTQSequence &seq, unsigned int &holeNumber) {
std::vector<std::string> titleTokens;
- Tokenize(seq.title, "/", titleTokens);
+ Splice(seq.title, "/", titleTokens);
if (titleTokens.size() < 2) {
return;
}
diff --git a/pbdata/utils/SMRTTitle.hpp b/pbdata/utils/SMRTTitle.hpp
index 33b108b..6b0da3e 100644
--- a/pbdata/utils/SMRTTitle.hpp
+++ b/pbdata/utils/SMRTTitle.hpp
@@ -20,5 +20,19 @@ public:
/// \returns smrt title movie/zmw/s_e, if input read is a smrt title;
/// otherwise, return an empty string.
std::string ToString();
+
+public:
+ inline std::string MovieName(void) const;
+ inline UInt HoleNumber(void) const;
+ inline DNALength Start(void) const;
+ inline DNALength End(void) const;
+ inline operator bool(void) const;
};
+
+inline std::string SMRTTitle::MovieName(void) const {return movieName;} // Returns a copy of movieName.
+inline UInt SMRTTitle::HoleNumber(void) const {return holeNumber;} // Returns holeNumber.
+inline DNALength SMRTTitle::Start(void) const {return start;} // Returns start.
+inline DNALength SMRTTitle::End(void) const {return end;} // Returns end.
+inline SMRTTitle::operator bool(void) const {return isSMRTTitle;} // Converts to the isSMRTTitle flag; non-explicit, so it also applies in implicit conversions.
+
#endif
diff --git a/pbdata/utilsImpl.hpp b/pbdata/utilsImpl.hpp
index ee899cd..f58aa10 100644
--- a/pbdata/utilsImpl.hpp
+++ b/pbdata/utilsImpl.hpp
@@ -1,5 +1,10 @@
#ifndef _BLASR_UTIL_IMPL_HPP_
#define _BLASR_UTIL_IMPL_HPP_
+#include <stdlib.h>
+#include <cstdlib> // abort()
+#include <new> // bad_alloc
+#include <iostream> // cout/cerr
+
template<typename t_file>
void CrucialOpen(std::string &fileName, t_file &file, std::ios_base::openmode mode) {
@@ -19,12 +24,27 @@ T_Int CeilOfFraction(T_Int num, T_Int denom) {
}
template<typename T>
-T* ProtectedNew(unsigned long size) {
- T* ptr;
- ptr = new T[size];
- if (ptr == NULL) {
- std::cout << "ERROR, allocating " << size * sizeof(T) << " bytes.";
- exit(1);
+inline T* ProtectedNew(unsigned long size) {
+ T * ptr = nullptr;
+ try {
+ ptr = new T[size];
+ } catch (std::bad_alloc & ba) {
+ std::cout << "ERROR, allocating " << size * sizeof(T) << " bytes."
+ << ba.what() << std::endl;
+ abort();
+ }
+ return ptr;
+}
+
+template<typename T>
+inline T* ProtectedNew(void) {
+ T * ptr = nullptr;
+ try {
+ ptr = new T;
+ } catch (std::bad_alloc & ba) {
+ std::cout << "ERROR, allocating " << sizeof(T) << " bytes."
+ << ba.what() << std::endl;
+ abort();
}
return ptr;
}
diff --git a/rules.mk b/rules.mk
new file mode 100644
index 0000000..6b1739a
--- /dev/null
+++ b/rules.mk
@@ -0,0 +1,28 @@
+ARFLAGS := rc
+CXX_SHAREDFLAGS := -fPIC
+#LD_SHAREDFLAGS := -dynamiclib -fPIC
+CPPFLAGS += $(patsubst %,-I%,${INCLUDES})
+CFLAGS += -fno-common
+LDFLAGS += ${EXTRA_LDFLAGS}
+
+
+%.a:
+ ${AR} ${ARFLAGS} $@ $^
+
+%.so:
+ ${CXX} -shared ${LDFLAGS} -o $@ -Wl,-soname,$@ $^ ${LDLIBS}
+
+%.dylib:
+ ${CXX} -dynamiclib ${LDFLAGS} -o $@ -Wl,-install_name,$@ $^ ${LDLIBS}
+
+%.o: %.cpp
+ ${CXX} ${CXXOPTS} ${CXXFLAGS} ${CPPFLAGS} -c $< -o $@
+
+%.shared.o: %.cpp
+ ${CXX} ${CXXOPTS} ${CXXFLAGS} ${CPPFLAGS} ${CXX_SHAREDFLAGS} -c $< -o $@
+
+%.depend: %.cpp
+ ${CXX} ${CXXOPTS} ${CXXFLAGS} ${CPPFLAGS} -MM -MP -MG -MT $(@:.depend=.o) -MF $(@:.depend=.d) $<
+
+%.shared.depend: %.cpp
+ ${CXX} ${CXXOPTS} ${CXXFLAGS} ${CPPFLAGS} -MM -MP -MG -MT $(@:.depend=.o) -MF $(@:.depend=.d) $<
diff --git a/simple.mk b/simple.mk
deleted file mode 100644
index e725a6e..0000000
--- a/simple.mk
+++ /dev/null
@@ -1,16 +0,0 @@
-SHELL = bash
-G_BUILDOS_CMD := bash -c 'set -e; set -o pipefail; id=$$(lsb_release -si | tr "[:upper:]" "[:lower:]"); rel=$$(lsb_release -sr); case $$id in ubuntu) printf "$$id-%04d\n" $${rel/./};; centos) echo "$$id-$${rel%%.*}";; *) echo "$$id-$$rel";; esac' 2>/dev/null
-OS_STRING ?= $(shell $(G_BUILDOS_CMD))
-
-# magic for non-verbose builds
-V ?= 0
-
-CXX_0 = @echo " CXX $@"; $(CXX)
-CXX_1 = $(CXX)
-CXX_pp = $(CXX_$(V))
-
-AR_0 = @echo " AR $@"; $(AR)
-AR_1 = $(AR)
-AR_pp = $(AR_$(V))
-
-ARFLAGS := rc
diff --git a/travis.sh b/travis.sh
new file mode 100755
index 0000000..a0edb8b
--- /dev/null
+++ b/travis.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+ls /usr/include/hdf* # Diagnostic listing; runs before 'set -e', so a failure here does not stop the script.
+ls /usr/lib/libhdf* # Same, for the HDF5 libraries.
+set -ex # From here on: abort on the first failing command (-e) and echo each command (-x).
+NOHDF=1 NOPBBAM=1 ./configure.py # First configuration: both NOHDF and NOPBBAM are set.
+make -j4 libpbdata
+make -j4 libblasr
+
+# Test compilation of ./hdf using our own HDF5 headers, for now.
+# (This fails on Darwin b/c our HDF5_HEADERS were configured for Linux.)
+NOPBBAM=1 ./configure.py # Second configuration: only NOPBBAM is set, then ./hdf is built.
+make -j4 -C ./hdf libpbihdf.a
+
+# make -j4 gtest
diff --git a/unittest/.gitignore b/unittest/.gitignore
new file mode 100644
index 0000000..26ca380
--- /dev/null
+++ b/unittest/.gitignore
@@ -0,0 +1 @@
+/test-runner
diff --git a/unittest/Makefile b/unittest/Makefile
deleted file mode 100644
index 6df2e5f..0000000
--- a/unittest/Makefile
+++ /dev/null
@@ -1,41 +0,0 @@
-SHELL=bash
-
-PBINCROOT := $(realpath ..)
-PREBUILT ?= $(realpath ../../../../prebuilt.out)
-THIRD_PARTY_PREFIX ?= $(realpath ../..)
-
-include ./common.mk
-
-OS := $(shell uname)
-
-ifeq ($(OS), Darwin)
- LD_WHOLE_ARCHIVE := -all_load
- LD_NO_WHOLE_ARCHIVE := -noall_load
-else
- LD_WHOLE_ARCHIVE := --whole-archive
- LD_NO_WHOLE_ARCHIVE := --no-whole-archive
-endif
-
-EXE := test-runner
-
-all: $(EXE)
-
-gtest: $(EXE)
- ./$< --gtest_output=xml:./xml/all.xml
-
-LIBS := alignment/libblasr_gtest.a \
- hdf/libpbihdf_gtest.a \
- pbdata/libpbdata_gtest.a
-
-$(EXE): $(LIBS)
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) $(GTEST_SRC) -Wl,$(LD_WHOLE_ARCHIVE) $^ -Wl,$(LD_NO_WHOLE_ARCHIVE) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
-
-$(LIBS):
- make -C $(dir $@) $(notdir $@)
-
-clean:
- @make -C alignment clean
- @make -C hdf clean
- @make -C pbdata clean
- @rm -fr $(EXE) xml
-
diff --git a/unittest/alignment/Makefile b/unittest/alignment/Makefile
index f0bc981..154ab5e 100644
--- a/unittest/alignment/Makefile
+++ b/unittest/alignment/Makefile
@@ -1,4 +1,5 @@
-include ../common.mk
+include ../../rules.mk
+include ../defines.mk
SOURCES = $(wildcard *.cpp) \
$(wildcard utils/*.cpp) \
@@ -17,13 +18,13 @@ EXE := test-runner
all debug profile: $(EXE)
libblasr_gtest.a: $(OBJECTS)
- $(AR_pp) $(ARFLAGS)c $@ $^
+ $(AR) $(ARFLAGS)c $@ $^
$(EXE): $(OBJECTS)
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) $^ $(GTEST_SRC) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
+ $(CXX) $(CXXOPTS) $(CXXFLAGS) $^ $(GTEST_SRC) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
$(OBJECTS): %.o: %.cpp
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) -c $< -o $@ $(INCDIRS)
+ $(CXX) $(CXXOPTS) $(CXXFLAGS) -c $< -o $@ $(INCDIRS)
gtest: $(EXE)
./$< --gtest_output=xml:../xml/alignment.xml
diff --git a/unittest/alignment/files/CCSIterator_gtest.cpp b/unittest/alignment/files/CCSIterator_gtest.cpp
index eba2a24..91da996 100644
--- a/unittest/alignment/files/CCSIterator_gtest.cpp
+++ b/unittest/alignment/files/CCSIterator_gtest.cpp
@@ -40,8 +40,6 @@ public:
EXPECT_TRUE(rev);
reader->ReadTable(*rgn);
reader->Close();
-
- rgn->SortTableByHoleNumber();
}
void TearDown() {
diff --git a/unittest/alignment/files/FragmentCCSIterator_gtest.cpp b/unittest/alignment/files/FragmentCCSIterator_gtest.cpp
index e1b2d0c..d97ce22 100644
--- a/unittest/alignment/files/FragmentCCSIterator_gtest.cpp
+++ b/unittest/alignment/files/FragmentCCSIterator_gtest.cpp
@@ -41,8 +41,6 @@ public:
EXPECT_TRUE(rev);
reader->ReadTable(*rgn);
reader->Close();
-
- rgn->SortTableByHoleNumber();
}
void TearDown() {
@@ -64,7 +62,8 @@ public:
TEST_F(FragmentCCSIteratorTestFixture, Initialize) {
// void Initialize(CCSSequence *_seqPtr, RegionTable *_regionTablePtr) {
- ccs->zmwData.holeNumber = 10;
+ ccs->HoleNumber(10);
+ ccs->unrolledRead.Allocate(7000);
it.Initialize(ccs, rgn);
int numPasses = it.GetNumPasses();
diff --git a/unittest/alignment/files/FragmentCCSIterator_other_gtest.cpp b/unittest/alignment/files/FragmentCCSIterator_other_gtest.cpp
new file mode 100644
index 0000000..cd13cb2
--- /dev/null
+++ b/unittest/alignment/files/FragmentCCSIterator_other_gtest.cpp
@@ -0,0 +1,100 @@
+/*
+ * =====================================================================================
+ *
+ * Filename: FragmentCCSIterator_other_gtest.cpp
+ *
+ * Description: Test alignment/files/FragmentCCSIterator.hpp
+ *
+ * Version: 1.0
+ * Created: 11/29/2012 04:51:02 PM
+ * Revision: 08/20/2014
+ * Compiler: gcc
+ *
+ * Author: Yuan Li (yli), yli at pacificbiosciences.com
+ * Company: Pacific Biosciences
+ *
+ * =====================================================================================
+ */
+
+#include "gtest/gtest.h"
+#define private public
+#include "files/CCSIterator.hpp"
+#include "files/FragmentCCSIterator.hpp"
+#include "reads/RegionTable.hpp"
+#include "HDFRegionTableReader.hpp"
+#include "pbdata/testdata.h"
+
+using namespace std;
+
+static const UInt HOLENUMBER = 76772;
+
+// Adapter - 0, Insert - 1, HQRegion - 2
+static const std::vector<RegionType> TYPES = {Adapter, Insert, HQRegion};
+static const std::vector<std::string> TYPESTRS = {"Adapter", "Insert", "HQRegion"};
+
+static const std::vector<RegionAnnotation> INSERTS = {
+ RegionAnnotation(HOLENUMBER, 1, 0, 253, -1),
+ RegionAnnotation(HOLENUMBER, 1, 301, 678, -1),
+ RegionAnnotation(HOLENUMBER, 1, 724, 1101, -1),
+ RegionAnnotation(HOLENUMBER, 1, 1150, 1534, -1),
+ RegionAnnotation(HOLENUMBER, 1, 1575, 1956, -1),
+ RegionAnnotation(HOLENUMBER, 1, 1999, 2379, -1),
+ RegionAnnotation(HOLENUMBER, 1, 2417, 2803, -1),
+ RegionAnnotation(HOLENUMBER, 1, 2852, 3245, -1),
+ RegionAnnotation(HOLENUMBER, 1, 3287, 3727, -1),
+ RegionAnnotation(HOLENUMBER, 1, 3778, 4176, -1),
+ RegionAnnotation(HOLENUMBER, 1, 4221, 4618, -1),
+ RegionAnnotation(HOLENUMBER, 1, 4661, 4862, -1)
+};
+
+static const std::vector<RegionAnnotation> ADAPTERS = {
+ RegionAnnotation(HOLENUMBER, 0, 253, 301, 854),
+ RegionAnnotation(HOLENUMBER, 0, 678, 724, 978),
+ RegionAnnotation(HOLENUMBER, 0, 1101, 1150, 897),
+ RegionAnnotation(HOLENUMBER, 0, 1534, 1575, 804),
+ RegionAnnotation(HOLENUMBER, 0, 1956, 1999, 930),
+ RegionAnnotation(HOLENUMBER, 0, 2379, 2417, 736),
+ RegionAnnotation(HOLENUMBER, 0, 2803, 2852, 918),
+ RegionAnnotation(HOLENUMBER, 0, 3245, 3287, 928),
+ RegionAnnotation(HOLENUMBER, 0, 3727, 3778, 784),
+ RegionAnnotation(HOLENUMBER, 0, 4176, 4221, 911),
+ RegionAnnotation(HOLENUMBER, 0, 4618, 4661, 767)
+};
+
+static const std::vector<RegionAnnotation> HQREGION = {
+ RegionAnnotation(HOLENUMBER, 2, 0, 4861, 865)
+};
+
+static const DNALength EXPECTED_HQSTART = 0;
+
+static const DNALength EXPECTED_HQEND = 4861;
+
+static const DNALength EXPECTED_SCORE = 865;
+
+static const DNALength WHOLE_LENGTH = 5000;
+
+static const int EXPECTED_NUM_SUBREADS = 12;
+
+
+TEST(CCSFragmentIterator, Constructor) { // Builds a RegionTable from the static fixtures and checks the subread intervals FragmentCCSIterator::Initialize produces.
+ std::vector<RegionAnnotation> regions = INSERTS; // Start from the 12 insert annotations...
+ regions.insert(regions.end(), HQREGION.begin(), HQREGION.end()); // ...append the single HQ region...
+ regions.insert(regions.end(), ADAPTERS.begin(), ADAPTERS.end()); // ...and the adapters, so the list is not grouped by type.
+
+ CCSSequence ccs;
+ ccs.HoleNumber(HOLENUMBER); // Same hole number as every annotation in the fixtures.
+ ccs.Allocate(WHOLE_LENGTH);
+ ccs.unrolledRead.Allocate(WHOLE_LENGTH);
+
+ RegionTable table;
+ table.ConstructTable(regions, TYPESTRS);
+
+ FragmentCCSIterator it;
+ it.Initialize(&ccs, &table);
+
+ EXPECT_EQ(it.subreadIntervals.size(), EXPECTED_NUM_SUBREADS); // subreadIntervals is reached via the '#define private public' at the top of this file.
+
+ EXPECT_EQ(it.subreadIntervals[0], ReadInterval(0, 253, 865)); // First insert (0, 253); 865 equals the HQ region's score in the fixture.
+
+ EXPECT_EQ(it.subreadIntervals[EXPECTED_NUM_SUBREADS-1], ReadInterval(4661, 4861, 865)); // Last insert ends at 4862 but 4861 (the HQ region end) is expected -- presumably clipped to the HQ region; confirm in FragmentCCSIterator.
+}
diff --git a/unittest/alignment/files/ReaderAgglomerate_gtest.cpp b/unittest/alignment/files/ReaderAgglomerate_gtest.cpp
index f27eb16..f39a529 100644
--- a/unittest/alignment/files/ReaderAgglomerate_gtest.cpp
+++ b/unittest/alignment/files/ReaderAgglomerate_gtest.cpp
@@ -127,3 +127,19 @@ TEST_F(ReaderAgglomerateTest, ReadFromBam) {
reader->Close();
}
+
+TEST_F(ReaderAgglomerateTest, ReadsFromBam) { // Reads bamFile1 batch by batch through GetNext(vector) and checks the total sequence count.
+    string fn (bamFile1);
+    reader->SetReadFileName(fn);
+    EXPECT_EQ(reader->Initialize(), 1);
+
+    vector<SMRTSequence> seqs;
+    int ret = 0, count = 0;
+    while ((ret = reader->GetNext(seqs)) != 0) { // parentheses matter: '=' binds looser than 'and'/'!=', so without them ret would receive a boolean
+        count += seqs.size(); // accumulate the size of each batch returned in seqs
+    }
+
+    EXPECT_EQ(count, 116);
+
+    reader->Close();
+}
diff --git a/unittest/alignment/utils/RegionUtils_gtest.cpp b/unittest/alignment/utils/RegionUtils_gtest.cpp
index 2daaa86..52b0b44 100644
--- a/unittest/alignment/utils/RegionUtils_gtest.cpp
+++ b/unittest/alignment/utils/RegionUtils_gtest.cpp
@@ -233,4 +233,3 @@ TEST_F(RegionUtilTestFixture, GetTypicalFullSubreadIndex) {
EXPECT_EQ(idx, 22);
// Typical = the second longest full pass subread (6647, 7145)
}
-
diff --git a/unittest/build.mk b/unittest/build.mk
new file mode 100644
index 0000000..a8dde8f
--- /dev/null
+++ b/unittest/build.mk
@@ -0,0 +1,27 @@
+all:
+
+include ../rules.mk
+include defines.mk
+
+EXE := test-runner
+
+all: $(EXE)
+
+gtest: $(EXE)
+ ./$< --gtest_output=xml:./xml/all.xml
+
+LIBS := alignment/libblasr_gtest.a \
+ hdf/libpbihdf_gtest.a \
+ pbdata/libpbdata_gtest.a
+
+$(EXE): $(LIBS)
+ $(CXX) $(CXXOPTS) $(CXXFLAGS) $(GTEST_SRC) -Wl,$(LD_WHOLE_ARCHIVE) $^ -Wl,$(LD_NO_WHOLE_ARCHIVE) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
+
+$(LIBS):
+ ${MAKE} -C $(dir $@) PBINCROOT=${PBINCROOT}/.. $(notdir $@)
+
+clean:
+ @${MAKE} -C alignment clean
+ @${MAKE} -C hdf clean
+ @${MAKE} -C pbdata clean
+ @${RM} -fr $(EXE) xml
diff --git a/unittest/common.mk b/unittest/common.mk
deleted file mode 100644
index 5337ea9..0000000
--- a/unittest/common.mk
+++ /dev/null
@@ -1,64 +0,0 @@
-SHELL=bash
-
-.PHONY: all debug profile clean
-
-#
-# Definitions common to all make files for unit tests.
-# All paths are relative from inside the subdirectories, not this file
-#
-
-PBINCROOT ?= $(realpath ../..)
-PREBUILT ?= $(realpath ../../../../../prebuilt.out)
-THIRD_PARTY_PREFIX ?= ../../..
-
-include $(PBINCROOT)/common.mk
-
-# All Google Test headers. Usually you shouldn't change this.
-GTEST_ROOT := $(THIRD_PARTY)/gtest/fused-src
-GTEST_SRC := $(GTEST_ROOT)/gtest/gtest-all.cc \
- $(GTEST_ROOT)/gtest/gtest_main.cc
-
-# INCLUDE DIRS
-INCDIRS = -I$(PBINCROOT)/alignment \
- -I$(PBINCROOT)/hdf \
- -I$(PBINCROOT)/pbdata \
- -I$(PBINCROOT)/unittest \
- -I$(GTEST_ROOT) \
- -I$(HDF5_INC)
-
-# LIB DIRS
-LIBDIRS = -L$(PBINCROOT)/alignment \
- -L$(PBINCROOT)/hdf \
- -L$(PBINCROOT)/pbdata \
- -L$(HDF5_LIB)
-
-LDFLAGS1 := -lblasr -lpbihdf -lpbdata
-# The order of -l{lib} matters
-
-ifeq ($(origin nopbbam), undefined)
- INCDIRS += -I$(PBBAM)/include -I$(PBBAM)/third-party/htslib
- LIBDIRS += -L$(PBBAM)/lib -L$(PBBAM)/third-party/htslib
- LDFLAGS1 += -lpbbam
-# Use libhts.a built with pbbam
-ifneq ($(wildcard "$(PBBAM)/third-party/htslib/libhts.a"), "")
- LDFLAGS1 += $(PBBAM)/third-party/htslib/libhts.a
-else
- LDFLAGS1 += lhts
-endif
-endif
-
-ifneq ($(ZLIB_ROOT), notfound)
- INCDIRS += -I$(ZLIB_ROOT)/include
- LIBDIRS += -L$(ZLIB_ROOT)/lib
-endif
-
-ifneq ($(wildcard "$(HDF5_LIB)/libhdf5_cpp.a"),"")
- LDFLAGS := $(LDFLAGS1) $(HDF5_LIB)/libhdf5_cpp.a $(HDF5_LIB)/libhdf5.a -lpthread -lz -ldl
-else
- LDFLAGS := $(LDFLAGS1) -lhdf5_cpp -lhdf5 -lpthread -lz -ldl
-endif
-
-
-CXX := g++
-CXXOPTS := -std=c++11 -Wno-div-by-zero
-CXXFLAGS := -O3
diff --git a/unittest/hdf/HDFPlsReader_gtest.cpp b/unittest/hdf/HDFPlsReader_gtest.cpp
index a4f63a2..8bcf35d 100644
--- a/unittest/hdf/HDFPlsReader_gtest.cpp
+++ b/unittest/hdf/HDFPlsReader_gtest.cpp
@@ -42,7 +42,8 @@ TEST_F(HDFPlsReaderTEST, ReadToPulseFile) {
reader.IncludeField("StartFrame");
reader.ReadPulseFileInit(pulseFile);
reader.ReadPulseFile(pulseFile);
- ASSERT_EQ(pulseFile.platformId, 0);
+ //Astro = 1, Springfield = 2
+ ASSERT_EQ(pulseFile.platformId, 2);
ASSERT_EQ(pulseFile.startFrame.size(), 197626964);
}
diff --git a/unittest/hdf/HDFScanDataWriter_gtest.cpp b/unittest/hdf/HDFScanDataWriter_gtest.cpp
index 15831ca..40e07a4 100644
--- a/unittest/hdf/HDFScanDataWriter_gtest.cpp
+++ b/unittest/hdf/HDFScanDataWriter_gtest.cpp
@@ -17,6 +17,7 @@
*/
#include "gtest/gtest.h"
+#define private public
#include "HDFScanDataWriter.hpp"
#include "HDFFile.hpp"
#include "reads/ScanData.hpp"
@@ -24,6 +25,7 @@
TEST(HDFScanDataWriter, Write) {
ScanData sd;
sd.frameRate = 100;
+ sd.BaseMap("ATGC");
HDFFile outFile;
outFile.Open("scandata.h5", H5F_ACC_TRUNC);
diff --git a/unittest/hdf/Makefile b/unittest/hdf/Makefile
index 77fe15e..d942d8f 100644
--- a/unittest/hdf/Makefile
+++ b/unittest/hdf/Makefile
@@ -1,4 +1,5 @@
-include ../common.mk
+include ../../rules.mk
+include ../defines.mk
SOURCES = $(wildcard *.cpp)
OBJECTS = $(SOURCES:.cpp=.o)
@@ -8,13 +9,13 @@ EXE := test-runner
all debug profile: $(EXE)
libpbihdf_gtest.a: $(OBJECTS)
- $(AR_pp) $(ARFLAGS)c $@ $^
+ $(AR) $(ARFLAGS)c $@ $^
$(EXE): $(OBJECTS)
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) $^ $(GTEST_SRC) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
+ $(CXX) $(CXXOPTS) $(CXXFLAGS) $^ $(GTEST_SRC) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
$(OBJECTS): %.o: %.cpp
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) -c $< -o $@ $(INCDIRS)
+ $(CXX) $(CXXOPTS) $(CXXFLAGS) -c $< -o $@ $(INCDIRS)
gtest: $(EXE)
./$< --gtest_output=xml:../xml/hdf.xml
diff --git a/unittest/makefile b/unittest/makefile
new file mode 100644
index 0000000..e2a56c7
--- /dev/null
+++ b/unittest/makefile
@@ -0,0 +1,120 @@
+SHELL=bash
+THISDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+SRCDIR:=${THISDIR}
+-include ${CURDIR}/defines.mk
+include ${THISDIR}/../rules.mk
+
+MKDIR := mkdir
+
+null :=
+space := $(null) $(null)
+
+broken_test_sources := \
+ ${SRCDIR}/alignment/files/ReaderAgglomerate_gtest.cpp \
+ ${SRCDIR}/alignment/format/SAMHeaderPrinter_gtest.cpp \
+ ${SRCDIR}/alignment/format/SAMPrinter_gtest.cpp \
+ ${SRCDIR}/alignment/utils/FileUtils_gtest.cpp \
+ $(null)
+
+
+gtest_sources := $(GTEST_SRCDIR)/gtest/gtest-all.cc \
+ $(GTEST_SRCDIR)/gtest/gtest_main.cc \
+ $(null)
+
+test_sources := $(wildcard ${SRCDIR}/pbdata/*.cpp) \
+ $(wildcard ${SRCDIR}/pbdata/utils/*.cpp) \
+ $(wildcard ${SRCDIR}/pbdata/metagenome/*.cpp) \
+ $(wildcard ${SRCDIR}/pbdata/saf/*.cpp) \
+ $(wildcard ${SRCDIR}/pbdata/reads/*.cpp) \
+ $(wildcard ${SRCDIR}/pbdata/qvs/*.cpp) \
+ \
+ $(wildcard ${SRCDIR}/hdf/*.cpp) \
+ \
+ $(wildcard ${SRCDIR}/alignment/*.cpp) \
+ $(wildcard ${SRCDIR}/alignment/utils/*.cpp) \
+ $(wildcard ${SRCDIR}/alignment/datastructures/alignment/*.cpp) \
+ $(wildcard ${SRCDIR}/alignment/files/*.cpp) \
+ $(wildcard ${SRCDIR}/alignment/format/*.cpp) \
+ $(null)
+
+# Remove broken tests from the test_sources list
+test_sources := $(filter-out $(broken_test_sources),$(test_sources))
+
+paths := alignment alignment/files alignment/datastructures/alignment alignment/utils alignment/format \
+ pbdata pbdata/utils pbdata/metagenome pbdata/saf pbdata/reads pbdata/qvs \
+ hdf
+paths := $(patsubst %,${SRCDIR}%,${paths}) ${GTEST_SRCDIR}/gtest
+sources := $(gtest_sources) $(test_sources)
+sources := $(notdir ${sources})
+objects := $(patsubst %.cc,%.o,$(filter %.cc,$(sources))) \
+ $(patsubst %.cpp,%.o,$(filter %.cpp,$(sources))) \
+ $(null)
+dependencies:=$(objects:%.o=%.d)
+
+
+INCLUDES+= \
+ ${SRCDIR} \
+ $(GTEST_INC) \
+ $(LIBBLASR_INC) \
+ $(LIBPBIHDF_INC) \
+ $(LIBPBDATA_INC) \
+ $(PBBAM_INC) \
+ $(HTSLIB_INC) \
+ $(HDF5_INC) \
+ $(BOOST_INC) \
+ $(null)
+
+LIBS+= \
+ $(LIBBLASR_LIB) \
+ $(LIBPBIHDF_LIB) \
+ $(LIBPBDATA_LIB) \
+ $(PBBAM_LIB) \
+ $(HTSLIB_LIB) \
+ $(HDF5_LIB) \
+ $(HDF5_CPP_LIB) \
+ $(ZLIB_LIB) \
+ $(GCC_LIB) \
+ $(null)
+
+ldlibs := -lblasr -lpbihdf -lpbdata -lpbbam -lhts -lhdf5_cpp -lhdf5 -lz
+sys_ldlibs := -lpthread -ldl -lrt
+
+cxxopts := -std=c++11 -Wno-div-by-zero
+cxxflags := -O3
+cppflags := $(patsubst %,-I%,${INCLUDES}) # one -I per entry of INCLUDES; make variable names are case-sensitive, and no lowercase 'includes' is defined in this file
+ldflags := $(patsubst %,-L%,${LIBS}) $(sys_ldflags)
+
+
+
+override CPPFLAGS := $(cppflags) $(CPPFLAGS)
+override CXXFLAGS := $(cxxflags) $(cxxopts) $(CXXFLAGS)
+override LDLIBS := $(ldlibs) $(sys_ldlibs) $(LDLIBS)
+override LDFLAGS := $(ldflags) $(LDFLAGS)
+
+COMPILE.cpp = $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c
+COMPILE.cc = $(COMPILE.cpp)
+LINK.o = $(CXX) $(LDFLAGS) $(TARGET_ARCH)
+
+vpath %.cpp ${paths}
+vpath %.cc ${paths}
+
+all: test-runner
+
+test-runner: $(objects)
+ $(LINK.o) $^ $(LDLIBS) -o $@
+
+gtest: test-runner
+ LD_LIBRARY_PATH=$(subst $(space),:,$(strip $(LIBS))) ./$< --gtest_output=xml:./xml/all.xml
+
+# Build objects
+%.o: %.cpp
+ $(COMPILE.cpp) -o $@ $<
+
+%.o: %.cc
+ $(COMPILE.cc) -o $@ $<
+
+clean:
+ $(RM) -r $(OUTDIR) *.o test-runner
+
+-include ${dependencies}
+depend: $(dependencies:.d=.depend)
diff --git a/unittest/pbdata/CCSSequence_gtest.cpp b/unittest/pbdata/CCSSequence_gtest.cpp
index 02e72cd..f5e13d1 100644
--- a/unittest/pbdata/CCSSequence_gtest.cpp
+++ b/unittest/pbdata/CCSSequence_gtest.cpp
@@ -44,9 +44,9 @@ public:
smrt.length = size;
smrt.deleteOnExit = false;
- smrt.zmwData.holeNumber = holeNumber;
- smrt.subreadStart = start;
- smrt.subreadEnd = end;
+ smrt.HoleNumber (holeNumber);
+ smrt.SubreadStart(start);
+ smrt.SubreadEnd (end);
stringstream ss;
}
@@ -74,7 +74,7 @@ public:
ccs.passDirection.resize(numSubreads);
s = 0;
for(int i=0; i < ccs.numPasses; i++) {
- ccs.passStartBase[i] = subreads[i].subreadStart;
+ ccs.passStartBase[i] = subreads[i].SubreadStart();
ccs.passDirection[i] = (i%2==0)?(0):(1);
ccs.passNumBases[i] = subreads[i].length;
}
diff --git a/unittest/pbdata/DNASequence_gtest.cpp b/unittest/pbdata/DNASequence_gtest.cpp
index 6d16f01..96e516d 100644
--- a/unittest/pbdata/DNASequence_gtest.cpp
+++ b/unittest/pbdata/DNASequence_gtest.cpp
@@ -257,7 +257,7 @@ TEST_F(DNASequenceTest, ReferenceSubstring) {
EXPECT_FALSE(dnaTwo.deleteOnExit);
// EXPECT_DEATH_IF_SUPPORTED(dnaTwo.ReferenceSubstring(dnaOne, 100), "");
- delete dnaOne.seq;
+ delete [] dnaOne.seq;
}
/*
TEST_F(DNASequenceTest, CopyFromString) {
diff --git a/unittest/pbdata/Makefile b/unittest/pbdata/Makefile
index 61df3a9..a6474af 100644
--- a/unittest/pbdata/Makefile
+++ b/unittest/pbdata/Makefile
@@ -1,4 +1,5 @@
-include ../common.mk
+include ../../rules.mk
+include ../defines.mk
SOURCES = $(wildcard *.cpp) \
$(wildcard utils/*.cpp) \
@@ -13,13 +14,13 @@ EXE := test-runner
all debug profile: $(EXE)
libpbdata_gtest.a: $(OBJECTS)
- $(AR_pp) $(ARFLAGS)c $@ $^
+ $(AR) $(ARFLAGS)c $@ $^
$(EXE): $(OBJECTS)
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) $^ $(GTEST_SRC) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
+ $(CXX) $(CXXOPTS) $(CXXFLAGS) $^ $(GTEST_SRC) -o $@ -I$(GTEST_ROOT) $(LIBDIRS) $(LDFLAGS)
$(OBJECTS): %.o: %.cpp
- $(CXX_pp) $(CXXOPTS) $(CXXFLAGS) -c $< -o $@ $(INCDIRS)
+ $(CXX) $(CXXOPTS) $(CXXFLAGS) -c $< -o $@ $(INCDIRS)
gtest: $(EXE)
./$< --gtest_output=xml:../xml/pbdata.xml
diff --git a/unittest/pbdata/SMRTSequence_gtest.cpp b/unittest/pbdata/SMRTSequence_gtest.cpp
index 43f40d9..a3cfaea 100644
--- a/unittest/pbdata/SMRTSequence_gtest.cpp
+++ b/unittest/pbdata/SMRTSequence_gtest.cpp
@@ -30,9 +30,9 @@ public:
smrt.seq = seqnt;
int len = sizeof(seqnt) / sizeof(Nucleotide) - 1;
smrt.length = len;
- smrt.zmwData.holeNumber = 1;
- smrt.subreadStart = 0;
- smrt.subreadEnd = 19;
+ smrt.HoleNumber(1);
+ smrt.SubreadStart(0);
+ smrt.SubreadEnd (19);
smrt.AllocateDeletionQVSpace(len);
for(int i=0; i < 19; i ++) {
smrt.deletionQV[i] = i;
diff --git a/unittest/pbdata/StringUtils_gtest.cpp b/unittest/pbdata/StringUtils_gtest.cpp
index 3b2b4fb..abd6622 100644
--- a/unittest/pbdata/StringUtils_gtest.cpp
+++ b/unittest/pbdata/StringUtils_gtest.cpp
@@ -32,7 +32,49 @@ TEST(StringUtilTest, MakeReadGroupId) {
readType = ReadType::CCS;
expectedReadGroupId = "f5b4ffb6";
EXPECT_EQ(MakeReadGroupId(movieName, readType), expectedReadGroupId);
-
}
+TEST(StringUtilTest, Splice) { // Exercises Splice(input, delimiter, tokens) on ordinary, multi-character-delimiter and edge-case inputs.
+ vector<string> tokens; // Reused by every call below without clearing; the expectations only hold if Splice replaces its contents.
+
+ Splice("movie/zmw/0_1", "/", tokens); // Single-character delimiter, string-literal input.
+ vector<string> exp = {"movie", "zmw", "0_1"};
+ EXPECT_EQ(tokens, exp);
+
+ string test = "abc,ef,12,4";
+ Splice(test, ",", tokens); // Same, with a std::string input.
+ exp = vector<string>{"abc", "ef", "12", "4"};
+ EXPECT_EQ(tokens, exp);
+
+ Splice(test, "ef,", tokens); // Multi-character delimiter is matched as a whole, not per character.
+ exp = vector<string>{"abc,", "12,4"};
+ EXPECT_EQ(tokens, exp);
+
+ Splice("", ",", tokens); // Empty input yields one empty token.
+ exp = vector<string>{""};
+ EXPECT_EQ(tokens, exp);
+
+ Splice(",", ",", tokens); // A lone delimiter yields an empty token on each side.
+ exp = vector<string>{"", ""};
+ EXPECT_EQ(tokens, exp);
+
+ Splice(",abc,", ",", tokens); // Leading and trailing delimiters each produce an empty token.
+ exp = vector<string>{"", "abc", ""};
+ EXPECT_EQ(tokens, exp);
+ Splice("abc,", ",", tokens); // Trailing delimiter only.
+ exp = vector<string>{"abc", ""};
+ EXPECT_EQ(tokens, exp);
+
+ Splice(",abc", ",", tokens); // Leading delimiter only.
+ exp = vector<string>{"", "abc"};
+ EXPECT_EQ(tokens, exp);
+
+ Splice("abc", "abc", tokens); // Input equal to the delimiter behaves like the lone-delimiter case.
+ exp = vector<string>{"", ""};
+ EXPECT_EQ(tokens, exp);
+
+ Splice("a\tb\tc", "\t", tokens); // Tab delimiter.
+ exp = vector<string>{"a", "b", "c"};
+ EXPECT_EQ(tokens, exp);
+}
diff --git a/unittest/pbdata/reads/RegionAnnotations_gtest.cpp b/unittest/pbdata/reads/RegionAnnotations_gtest.cpp
new file mode 100644
index 0000000..03014e7
--- /dev/null
+++ b/unittest/pbdata/reads/RegionAnnotations_gtest.cpp
@@ -0,0 +1,203 @@
+/*
+ * ==================================================================
+ *
+ * Filename: RegionAnnotations_gtest.cpp
+ *
+ * Description: Test pbdata/reads/RegionAnnotations.hpp
+ *
+ * Version: 1.0
+ * Created: 09/27/2015 03:54:55 PM
+ * Compiler: gcc
+ *
+ * Author: Yuan Li (yli), yli at pacificbiosciences.com
+ * Company: Pacific Biosciences
+ *
+ * ==================================================================
+ */
+#include "gtest/gtest.h"
+#define private public
+#include "reads/ReadInterval.hpp"
+#include "reads/RegionAnnotations.hpp"
+#include <algorithm>
+#include <iostream>
+
+using namespace std;
+
+static const UInt HOLENUMBER = 1720;
+
+// Adapter - 0, Insert - 1, HQRegion - 2
+static const std::vector<RegionType> TYPES = {Adapter, Insert, HQRegion};
+static const std::vector<std::string> TYPESTRS = {"Adapter", "Insert", "HQRegion"};
+
+static const std::vector<RegionAnnotation> REGIONS = {
+ RegionAnnotation(HOLENUMBER, 2, 50, 900, 900),// hqregion
+ RegionAnnotation(HOLENUMBER, 1, 700, 999, -1), // insert
+ RegionAnnotation(HOLENUMBER, 0, 649, 700, 700),// adapter
+ RegionAnnotation(HOLENUMBER, 1, 300, 650, -1), // insert
+ RegionAnnotation(HOLENUMBER, 0, 249, 329, 800),// adapter
+ RegionAnnotation(HOLENUMBER, 1, 0, 250, -1) // insert
+};
+
+static const std::vector<RegionAnnotation> EXPECTED_HQREGIONS = {
+ RegionAnnotation(HOLENUMBER, 2, 50, 900, 900) // hqregion
+};
+
+static const DNALength EXPECTED_HQSTART = 50;
+
+static const DNALength EXPECTED_HQEND = 900;
+
+static const DNALength WHOLE_LENGTH = 1200;
+
+static const std::vector<RegionAnnotation> EXPECTED_ADAPTERS = {
+ RegionAnnotation(HOLENUMBER, 0, 249, 329, 800),// adapter
+ RegionAnnotation(HOLENUMBER, 0, 649, 700, 700) // adapter
+};
+
+static const std::vector<RegionAnnotation> EXPECTED_INSERTS = {
+ RegionAnnotation(HOLENUMBER, 1, 0, 250, -1),// insert
+ RegionAnnotation(HOLENUMBER, 1, 300, 650, -1),// insert
+ RegionAnnotation(HOLENUMBER, 1, 700, 999, -1) // insert
+};
+
+static const std::vector<ReadInterval> EXPECTED_SUBREAD_INTERVALS_BYADAPTER_NOHQ = {
+ ReadInterval(0, 249, 0), // by Adapter, subread score unknown.
+ ReadInterval(329, 649, 0),
+ ReadInterval(700, 1200, 0)
+};
+
+static const std::vector<ReadInterval> EXPECTED_SUBREAD_INTERVALS_BYADAPTER_HQ = {
+ ReadInterval(50, 249, 900), // by HQ, subread score = HQRegion score
+ ReadInterval(329, 649, 900),
+ ReadInterval(700, 900, 900)
+};
+
+static const std::vector<ReadInterval> EXPECTED_SUBREAD_INTERVALS_NOHQ = {
+ ReadInterval(0, 250, -1), // not by adapter, not by HQ, use the original score.
+ ReadInterval(300, 650, -1),
+ ReadInterval(700, 999, -1)
+};
+
+static const std::vector<ReadInterval> EXPECTED_SUBREAD_INTERVALS_HQ = {
+ ReadInterval(50, 250, 900), // by HQ, subread score = HQRegion score
+ ReadInterval(300, 650, 900),
+ ReadInterval(700, 900, 900)
+};
+
+static const std::vector<ReadInterval> EXPECTED_ADAPTER_INTERVALS = {
+ ReadInterval(249, 329, 800),
+ ReadInterval(649, 700, 700)
+};
+
+static const std::vector<RegionAnnotation> REGIONS_SORTED_BY_POS = {
+ RegionAnnotation(HOLENUMBER, 1, 0, 250, -1), // insert
+ RegionAnnotation(HOLENUMBER, 2, 50, 900, 900),// hqregion
+ RegionAnnotation(HOLENUMBER, 0, 249, 329, 800),// adapter
+ RegionAnnotation(HOLENUMBER, 1, 300, 650, -1), // insert
+ RegionAnnotation(HOLENUMBER, 0, 649, 700, 700),// adapter
+ RegionAnnotation(HOLENUMBER, 1, 700, 999, -1) // insert
+};
+
+static const std::vector<RegionAnnotation> REGIONS_SORTED_BY_TYPE = {
+ RegionAnnotation(HOLENUMBER, 0, 249, 329, 800),// adapter
+ RegionAnnotation(HOLENUMBER, 0, 649, 700, 700),// adapter
+ RegionAnnotation(HOLENUMBER, 1, 0, 250, -1), // insert
+ RegionAnnotation(HOLENUMBER, 1, 300, 650, -1), // insert
+ RegionAnnotation(HOLENUMBER, 1, 700, 999, -1), // insert
+ RegionAnnotation(HOLENUMBER, 2, 50, 900, 900) // hqregion
+};
+
+TEST(RegionAnnotationTest, Sort_By_Pos) {
+ std::vector<RegionAnnotation> ras = REGIONS;
+ std::sort(ras.begin(), ras.end());
+ EXPECT_EQ(ras, REGIONS_SORTED_BY_POS);
+}
+
+TEST(RegionAnnotationTest, Sort_By_Type) {
+ std::vector<RegionAnnotation> ras = REGIONS;
+ std::sort(ras.begin(), ras.end(), compare_region_annotation_by_type);
+ EXPECT_EQ(ras, REGIONS_SORTED_BY_TYPE);
+}
+
+TEST(RegionAnnotationsTest, Constructor) {
+ RegionAnnotations ras(HOLENUMBER, REGIONS, TYPES);
+ EXPECT_EQ(ras.table_, REGIONS_SORTED_BY_TYPE);
+ EXPECT_EQ(ras.HoleNumber(), HOLENUMBER);
+}
+
+TEST(RegionAnnotationsTest, RegionAnnotationsOfType) {
+ RegionAnnotations ras(HOLENUMBER, REGIONS, TYPES);
+ EXPECT_EQ(ras.Adapters(), EXPECTED_ADAPTERS);
+ EXPECT_EQ(ras.HQRegions(), EXPECTED_HQREGIONS);
+ EXPECT_EQ(ras.Inserts(), EXPECTED_INSERTS);
+ EXPECT_EQ(ras.HQStart(), EXPECTED_HQSTART);
+ EXPECT_EQ(ras.HQEnd(), EXPECTED_HQEND);
+}
+
+TEST(RegionAnnotationsTest, SubreadIntervals) {
+ RegionAnnotations ras(HOLENUMBER, REGIONS, TYPES);
+ vector<ReadInterval> ris = ras.SubreadIntervals(WHOLE_LENGTH, true, false);
+ EXPECT_EQ(ris, EXPECTED_SUBREAD_INTERVALS_BYADAPTER_NOHQ);
+
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, true, true);
+ EXPECT_EQ(ris, EXPECTED_SUBREAD_INTERVALS_BYADAPTER_HQ);
+
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, false, false);
+ EXPECT_EQ(ris, EXPECTED_SUBREAD_INTERVALS_NOHQ);
+
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, false, true);
+ EXPECT_EQ(ris, EXPECTED_SUBREAD_INTERVALS_HQ);
+}
+
+TEST(RegionAnnotationsTest, AdapterIntervals) {
+ RegionAnnotations ras(HOLENUMBER, REGIONS, TYPES);
+ EXPECT_EQ(ras.AdapterIntervals(), EXPECTED_ADAPTER_INTERVALS);
+}
+
+TEST(RegionAnnotationsTest, SubreadIntervals_2) {
+ std::vector<RegionAnnotation> regions({
+ RegionAnnotation(HOLENUMBER, 0, 0, 112, -1)// adapter, no insert, no hq
+ });
+ RegionAnnotations ras(HOLENUMBER, regions, TYPES);
+
+ vector<ReadInterval> ris = ras.SubreadIntervals(WHOLE_LENGTH, true, false);
+ EXPECT_EQ(ris.size(), 1); // (112, WHOLE_LENGTH, -1)
+ EXPECT_EQ(ris[0].start, 112);
+ EXPECT_EQ(ris[0].end, WHOLE_LENGTH);
+
+ // no insert, no hq && require adapter, require hq
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, true, true);
+ EXPECT_EQ(ris.size(), 0);
+
+ // no require adapter, no require hq
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, false, false);// no insert
+ EXPECT_EQ(ris.size(), 0);
+
+ // no require adapter, require hq
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, false, true); // no hq
+ EXPECT_EQ(ris.size(), 0);
+}
+
+TEST(RegionAnnotationsTest, SubreadIntervals_3) {
+ std::vector<RegionAnnotation> regions({
+ RegionAnnotation(HOLENUMBER, 1, 0, 170, -1),// insert
+ RegionAnnotation(HOLENUMBER, 2, 0, 0, 0) // hq length = 0
+ });
+ RegionAnnotations ras(HOLENUMBER, regions, TYPES);
+
+ // require adapter, no require hq
+ vector<ReadInterval> ris = ras.SubreadIntervals(WHOLE_LENGTH, true, false);
+ EXPECT_EQ(ris.size(), 0);
+
+ // require adapter, require hq
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, true, true);
+ EXPECT_EQ(ris.size(), 0);
+
+ // no require adapter, no require hq
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, false, false);
+ EXPECT_EQ(ris.size(), 1);
+ EXPECT_EQ(ris[0], ReadInterval(0, 170, -1));
+
+ // no require adapter, require hq
+ ris = ras.SubreadIntervals(WHOLE_LENGTH, false, true);
+ EXPECT_EQ(ris.size(), 0);
+}
diff --git a/unittest/pbdata/reads/RegionTypeMap_gtest.cpp b/unittest/pbdata/reads/RegionTypeMap_gtest.cpp
new file mode 100644
index 0000000..12e108d
--- /dev/null
+++ b/unittest/pbdata/reads/RegionTypeMap_gtest.cpp
@@ -0,0 +1,61 @@
+/*
+ * ==================================================================
+ *
+ * Filename: RegionTypeMap_gtest.cpp
+ *
+ * Description: Test pbdata/reads/RegionTypeMap.hpp
+ *
+ * Version: 1.0
+ * Created: 09/27/2015 03:54:55 PM
+ * Compiler: gcc
+ *
+ * Author: Yuan Li (yli), yli at pacificbiosciences.com
+ * Company: Pacific Biosciences
+ *
+ * ==================================================================
+ */
+#include "gtest/gtest.h"
+#define private public
+#include "reads/RegionTypeMap.hpp"
+
+using namespace std;
+
+
+// Adapter - 0, Insert - 1, HQRegion - 2
+const vector<RegionType> TYPES = {Adapter, Insert, HQRegion};
+
+TEST(RegionTypeMapTest, ToString) {
+ EXPECT_EQ(RegionTypeMap::ToString(Adapter), "Adapter");
+ EXPECT_EQ(RegionTypeMap::ToString(HQRegion), "HQRegion");
+ EXPECT_EQ(RegionTypeMap::ToString(Insert), "Insert");
+}
+
+TEST(RegionTypeMapTest, ToRegionType) {
+ EXPECT_EQ(RegionTypeMap::ToRegionType("Adapter"), Adapter);
+ EXPECT_EQ(RegionTypeMap::ToRegionType("HQRegion"), HQRegion);
+ EXPECT_EQ(RegionTypeMap::ToRegionType("Insert"), Insert);
+}
+
+TEST(RegionTypeMapTest, ToIndex) {
+ // In most bas.h5 files, order of region types:
+ std::vector<std::string> typeStrs = {"Insert", "Adapter", "HQRegion"};
+
+ EXPECT_EQ(RegionTypeMap::ToIndex(Insert, typeStrs), 0);
+ EXPECT_EQ(RegionTypeMap::ToIndex(Adapter, typeStrs), 1);
+ EXPECT_EQ(RegionTypeMap::ToIndex(HQRegion, typeStrs), 2);
+
+ EXPECT_EQ(RegionTypeMap::ToIndex("Insert", typeStrs), 0);
+ EXPECT_EQ(RegionTypeMap::ToIndex("Adapter", typeStrs), 1);
+ EXPECT_EQ(RegionTypeMap::ToIndex("HQRegion", typeStrs), 2);
+
+ // Test given a different region type order.
+ typeStrs = {"Insert", "HQRegion", "Adapter", "BarCode"};
+
+ EXPECT_EQ(RegionTypeMap::ToIndex(Insert, typeStrs), 0);
+ EXPECT_EQ(RegionTypeMap::ToIndex(HQRegion, typeStrs), 1);
+ EXPECT_EQ(RegionTypeMap::ToIndex(Adapter, typeStrs), 2);
+
+ EXPECT_EQ(RegionTypeMap::ToIndex("Insert", typeStrs), 0);
+ EXPECT_EQ(RegionTypeMap::ToIndex("HQRegion", typeStrs), 1);
+ EXPECT_EQ(RegionTypeMap::ToIndex("Adapter", typeStrs), 2);
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbseqlib.git
More information about the debian-med-commit
mailing list