[med-svn] [pbdagcon] 01/08: Imported Upstream version 0~20160325+ds
Afif Elghraoui
afif at moszumanska.debian.org
Mon Oct 24 04:08:29 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository pbdagcon.
commit 7b1ae4f97525fb08834fbc3c4b28fa46d0620545
Author: Afif Elghraoui <afif at debian.org>
Date: Sun Oct 23 15:59:09 2016 -0700
Imported Upstream version 0~20160325+ds
---
.gitignore | 1 +
.gitmodules | 8 +-
.travis.yml | 4 +-
src/cpp/pbdagcon_wf.sh => LICENSE | 40 +-
Makefile | 48 -
README.md | 27 +-
configure.py | 320 ++++
makefile | 45 +
src/cpp/Alignment.cpp | 123 +-
src/cpp/Alignment.hpp | 55 +-
src/cpp/AlnGraphBoost.cpp | 108 +-
src/cpp/AlnGraphBoost.hpp | 67 +-
src/cpp/AlnProvider.hpp | 47 +-
src/cpp/BlasrM5AlnProvider.cpp | 40 +-
src/cpp/BlasrM5AlnProvider.hpp | 55 +-
src/cpp/BoundedBuffer.hpp | 53 +-
src/cpp/DB.c | 1106 -----------
src/cpp/DB.h | 357 ----
src/cpp/DazAlnProvider.cpp | 79 +-
src/cpp/DazAlnProvider.hpp | 62 +-
src/cpp/Makefile | 38 -
src/cpp/ProgramOpts.hpp | 5 +-
src/cpp/SimpleAligner.cpp | 40 +-
src/cpp/SimpleAligner.hpp | 42 +-
src/cpp/align.c | 3805 -------------------------------------
src/cpp/align.h | 331 ----
src/cpp/boost.mk | 18 -
src/cpp/dazcon.cpp | 78 +-
src/cpp/main.cpp | 88 +-
src/cpp/makefile | 76 +
src/cpp/pbdagcon_wf.sh | 39 -
src/cpp/pbi.mk | 22 -
src/q-sense.py | 39 -
test/cpp/.gitignore | 2 +
test/cpp/AlignmentTest.cpp | 89 +-
test/cpp/AlnGraphBoostTest.cpp | 9 +-
test/cpp/Makefile | 77 -
test/cpp/TargetHitTest.cpp | 14 +-
test/cpp/TargetTest.cpp | 4 +-
test/cpp/gtest.mk | 10 -
test/cpp/makefile | 117 ++
travis.sh | 7 +
42 files changed, 913 insertions(+), 6682 deletions(-)
diff --git a/.gitignore b/.gitignore
index b69e8a4..f1288e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ dazcon
test_*
boost_*
gtest-*
+defines.mk
diff --git a/.gitmodules b/.gitmodules
index ce6830a..35ea031 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,9 @@
[submodule "blasr_libcpp"]
path = blasr_libcpp
- url = https://github.com/PacificBiosciences/blasr_libcpp.git
+ url = git://github.com/PacificBiosciences/blasr_libcpp.git
+[submodule "DALIGNER"]
+ path = DALIGNER
+ url = git://github.com/PacificBiosciences/DALIGNER.git
+[submodule "DAZZ_DB"]
+ path = DAZZ_DB
+ url = git://github.com/PacificBiosciences/DAZZ_DB.git
diff --git a/.travis.yml b/.travis.yml
index 8ce6492..1dc27d1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,8 +1,6 @@
language: cpp
script:
- - make init-submodule
- - make
- - make check
+ - ./travis.sh
compiler:
- gcc
- clang
diff --git a/src/cpp/pbdagcon_wf.sh b/LICENSE
old mode 100755
new mode 100644
similarity index 60%
copy from src/cpp/pbdagcon_wf.sh
copy to LICENSE
index 76912da..fe329bf
--- a/src/cpp/pbdagcon_wf.sh
+++ b/LICENSE
@@ -1,7 +1,5 @@
-#!/bin/bash
-
#################################################################################$$
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
@@ -36,39 +34,3 @@
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#################################################################################$$
-
-
-# Simple pbdagcon workflow script. Written for the benefit of running via
-# smrtpipe so I can communicate pipe errors to the task. We're overcoming
-# the limitation of smrtpipe forcing tasks to run serially, enabling a new
-# level of pipelining that's extremely efficient in an imperfect world ...
-# However, direct file I/O is faster by default.
-
-tmp=${tmp-"/tmp"}
-
-trap "rm -f $tmp/aln.$$.pre" EXIT SIGINT
-
-echo "Generating pre-alignments"
-echo "m4topre.py $mym4 $allm4 $subreads ${bestn-24} > $tmp/aln.$$.pre"
-
-# generate pre-alignments to a tmp directory
-m4topre.py $mym4 $allm4 $subreads ${bestn-24} > $tmp/aln.$$.pre || exit $?
-
-echo "Correcting reads"
-# pipe it to consensus and generate fasta
-pbdagcon -c ${cov-8} -a -j ${nproc-15} $tmp/aln.$$.pre | tee ${fasta-"corrected.fa"} | \
-# generate a fastq
-awk '{if($0~/>/){sub(/>/,"@",$0);print;}else{l=length($0);q="";while(l--){q=q "9"}printf("%s\n+\n%s\n",$0,q)}}' > ${fastq-"corrected.fq"}
-
-
-# check the status of each piped command and exit non-zero if found
-for exitval in ${PIPESTATUS[*]}
-do
- if [ $exitval -gt 0 ]
- then
- exit $exitval
- fi
-done
-
-
-exit 0;
diff --git a/Makefile b/Makefile
deleted file mode 100755
index 6d11406..0000000
--- a/Makefile
+++ /dev/null
@@ -1,48 +0,0 @@
-.PHONY: all clean test init-submodule cpp-github cpp cpp-check cpp-clean
-SHELL = /bin/bash -e
-
-all: cpp-github
-
-clean: cpp-clean
-
-check: cpp-github-check
-
-project: init-submodule cpp-github
-
-init-submodule:
- $(MAKE) update-submodule
- $(MAKE) build-submodule
-
-update-submodule:
- git submodule update --init
-
-build-submodule:
- $(MAKE) -C blasr_libcpp/pbdata nopbbam=1 mklibconfig
- #$(MAKE) -C blasr_libcpp/pbdata mklibconfig
- $(MAKE) -C blasr_libcpp/alignment -f simple.mk nohdf=1
- $(MAKE) -C blasr_libcpp/pbdata nopbbam=1
-
-submodule-clean:
- $(RM) -r blasr_libcpp
-
-# C++ project build directives
-cpp-github:
- $(MAKE) -C src/cpp BLASR=$(PWD)/blasr_libcpp/alignment PBDATA=$(PWD)/blasr_libcpp/pbdata
-
-cpp-github-check:
- $(MAKE) -C test/cpp BLASR=$(PWD)/blasr_libcpp/alignment PBDATA=$(PWD)/blasr_libcpp/pbdata
-
-cpp:
- $(MAKE) -C src/cpp
-
-cpp-check: cpp
- $(MAKE) -C test/cpp
-
-cpp-clean:
- $(MAKE) -C src/cpp clean
- $(MAKE) -C test/cpp clean
-
-clean-all: cpp-clean submodule-clean
- $(RM)r src/cpp/third-party/boost_1_58_0-headersonly
- $(RM)r test/cpp/gtest-1.7.0
-
diff --git a/README.md b/README.md
index 5b858e6..c5b1fc6 100644
--- a/README.md
+++ b/README.md
@@ -45,18 +45,19 @@ internet.
### Compile/Check (pbdagcon)
```sh
- # First fetch and build the relevant portions of the blasr_libcpp
+ # First, configure your build. (You can look at `defines.mk` and
+ # `blasr_libcpp/defines.mk` to diagnose any problems.)
+ ./configure.py --boost --gtest --sub --no-pbbam
+
+ # Then, fetch and build the relevant portions of the blasr_libcpp
# submodule
make init-submodule
# build pbdagcon executable (Makefile fetches boost headers)
make
- # or, if you already have boost headers
- make boost=<path to headers>
# build and run unit tests
- # THIS IS CURRENTLY BROKEN. -cdunn
- ###make check
+ make check
# usage
cd src/cpp
@@ -78,15 +79,19 @@ For *-m 4* format, the alignments must be run through a format adapter,
The following example shows the simplest way to generate a consensus for one
target using BLASR *-m 5* alignments as input.
-
+```sh
blasr queries.fasta target.fasta -bestn 1 -m 5 -out mapped.m5
pbdagcon mapped.m5 > consensus.fasta
+```
-### Use Case: Generating consensus from daligner alignments
-Can parse LAS/DB files generated from the following commits:
-* [DALIGNER](https://github.com/thegenemyers/DALIGNER/commit/8edd180ba7b5302c6f1fc859eef5c646db99fd87)
-* [DAZ_DB](https://github.com/thegenemyers/DAZZ_DB/commit/84fa98fde94ba0ab56dd715aa7f8fe7e150290f8)
-
+### Use Case: Generating corrected reads from daligner alignments
+Support for generating consensus from daligner output has been added in the
+form of a new executable *dazcon*. Note that it is sensitive to the version
+of daligner used and may crash if using inputs generated by versions other
+than what is referenced in the submodules.
+```sh
+ dazcon -ox -j 4 -s subreads.db -a subreads.las > corrected.fasta
+```
### Use Case: HGAP correction of PacBio reads
Walks through how one could use pbdagcon to correct PacBio reads. This
diff --git a/configure.py b/configure.py
new file mode 100755
index 0000000..be0dd11
--- /dev/null
+++ b/configure.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python
+"""Configure the build.
+
+- Fetch boost/gtest.
+- Create defines.mk
+"""
+import argparse
+import commands
+import contextlib
+import os
+import sys
+
+ROOT = os.path.abspath(os.path.dirname(__file__))
+
+def log(msg):
+ sys.stderr.write(msg)
+ sys.stderr.write('\n')
+
+def shell(cmd):
+ log(cmd)
+ status, output = commands.getstatusoutput(cmd)
+ if status:
+ raise Exception('%d <-| %r' %(status, cmd))
+ return output
+
+def system(cmd):
+ log(cmd)
+ status = os.system(cmd)
+ if status:
+ raise Exception('%d <- %r' %(status, cmd))
+ return
+
+def mkdirs(path):
+ if not os.path.isdir(path):
+ os.makedirs(path)
+
+@contextlib.contextmanager
+def cd(nwd):
+ cwd = os.getcwd()
+ log('cd %r -> %r' %(cwd, nwd))
+ os.chdir(nwd)
+ yield
+ os.chdir(cwd)
+ log('cd %r <- %r' %(cwd, nwd))
+
+def fetch_gtest(build_dir):
+ gtest_version = 'gtest-1.7.0'
+ gtest_uri = 'https://googletest.googlecode.com/files/%s.zip' %gtest_version
+ gdir = os.path.join(build_dir, 'test', 'cpp', gtest_version)
+ if not os.path.isdir(gdir):
+ #mkdirs(gdir)
+ zipfile = gdir + '.zip'
+ if not os.path.isfile(zipfile):
+ get_gtest_cmd = 'curl -L %s --output %s' %(gtest_uri, zipfile)
+ system(get_gtest_cmd)
+ install_gtest_cmd = 'unzip -q %s -d %s' %(zipfile, os.path.join(build_dir, 'test', 'cpp'))
+ system(install_gtest_cmd)
+ assert os.path.isdir(gdir)
+ return gdir
+
+def fetch_boost_headers(build_dir):
+ """Fetch into {build_dir}/src/cpp/third-party/
+ Return actual directory path, relative to subdirs.
+ """
+ uri = 'https://www.dropbox.com/s/g22iayi83p5gbbq/boost_1_58_0-headersonly.tbz2?dl=0'
+ hdir = os.path.join(build_dir, 'src', 'cpp', 'third-party', 'boost_1_58_0-headersonly')
+ if not os.path.isdir(hdir):
+ mkdirs(os.path.dirname(hdir))
+ #get_boost_cmd = 'curl -L %s | tar xjf -C src/cpp/third-party -' %uri
+ tbz = os.path.join(build_dir, 'src', 'cpp', 'third-party', 'boost_1_58_0-headersonly.tbz2')
+ if not os.path.isfile(tbz):
+ get_boost_cmd = 'curl -L %s --output %s' %(uri, tbz)
+ system(get_boost_cmd)
+ install_boost_cmd = 'tar vxjf %s -C %s/src/cpp/third-party | head' %(tbz, build_dir)
+ system(install_boost_cmd)
+ assert os.path.isdir(hdir)
+ return hdir
+
+def update_content(fn, content):
+ current_content = open(fn).read() if os.path.exists(fn) else None
+ if content != current_content:
+ log('writing to %r' %fn)
+ log('"""\n' + content + '"""')
+ open(fn, 'w').write(content)
+
+def compose_defines_with_hdf_headers(HDF_HEADERS):
+ thisdir = os.path.dirname(os.path.abspath(__file__))
+ return """
+HDF_HEADERS:=%(HDF_HEADERS)s
+#HDF5_INCLUDE?=${HDF_HEADERS}/src
+CPPFLAGS+=-I${HDF_HEADERS}/src -I${HDF_HEADERS}/c++/src
+CPPFLAGS+=-I../pbdata -I../hdf -I../alignment
+LIBPBDATA_LIB ?=../pbdata/libpbdata.so
+LIBPBIHDF_LIB ?=../pbdata/libpbihdf.so
+LIBBLASR_LIB ?=../pbdata/libblasr.so
+"""%(dict(thisdir=thisdir, HDF_HEADERS=HDF_HEADERS))
+
+def compose_defines():
+ """
+ Note that our local 'hdf' subdir will not even build
+ in this case.
+ """
+ thisdir = os.path.dirname(os.path.abspath(__file__))
+ return """
+LIBPBDATA_INCLUDE ?=../pbdata
+LIBPBIHDF_INCLUDE ?=../hdf
+LIBBLASR_INCLUDE ?=../alignment
+LIBPBDATA_LIB ?=%(thisdir)s/pbdata/libpbdata.so
+LIBPBIHDF_LIB ?=%(thisdir)s/pbdata/libpbihdf.so
+LIBBLASR_LIB ?=%(thisdir)s/pbdata/libblasr.so
+nohdf ?=1
+"""%(dict(thisdir=thisdir))
+
+def ifenvf(env, key, func):
+ if key in env:
+ return env[key]
+ else:
+ return func()
+def setifenvf(envout, envin, key, func):
+ envout[key] = ifenvf(envin, key, func)
+def setifenv(envout, envin, key, val):
+ envout[key] = envin.get(key, val)
+def setenv(envout, key, val):
+ envout[key] = val
+def update_env_if(envout, envin, keys):
+ for key in keys:
+ if key in envin:
+ envout[key] = envin[key]
+def compose_defs_env(env):
+ # We disallow env overrides for anything with a default from GNU make.
+ nons = ['CXX', 'CC', 'AR'] # 'SHELL'?
+ ovr = ['%-20s ?= %s' %(k, v) for k,v in sorted(env.items()) if k not in nons]
+ nonovr = ['%-20s := %s' %(k, v) for k,v in sorted(env.items()) if k in nons]
+ return '\n'.join(ovr + nonovr + [''])
+def compose_defines_pacbio(envin):
+ """
+ This is used by mobs via buildcntl.sh.
+ """
+ env = dict()
+ #setifenv(env, envin, 'LIBPBDATA_INCLUDE', '../pbdata')
+ #setifenv(env, envin, 'LIBPBIHDF_INCLUDE', '../hdf')
+ #setifenv(env, envin, 'LIBBLASR_INCLUDE', '../alignment')
+ #setifenv(env, envin, 'LIBPBDATA_LIB', '../pbdata/libpbdata.so')
+ #setifenv(env, envin, 'LIBPBIHDF_LIB', '../hdf/libpbihdf.so')
+ #setifenv(env, envin, 'LIBBLASR_LIB', '../alignment/libblasr.so')
+ #setifenv(env, envin, 'nohdf', '1')
+ possibs = set([
+ 'CC', 'CXX', 'AR',
+ 'GTEST_INCLUDE', 'GTEST_SRC',
+ 'LIBBLASR_INCLUDE', 'LIBBLASR_LIB', 'LIBBLASR_LIBFLAGS',
+ 'LIBPBDATA_INCLUDE', 'LIBPBDATA_LIB', 'LIBPBDATA_LIBFLAGS',
+ 'LIBPBIHDF_INCLUDE', 'LIBPBIHDF_LIB', 'LIBPBIHDF_LIBFLAGS',
+ 'HDF5_INCLUDE', 'HDF5_LIB', 'HDF5_LIBFLAGS',
+ 'PBBAM_INCLUDE', 'PBBAM_LIB', 'PBBAM_LIBFLAGS',
+ 'HTSLIB_INCLUDE', 'HTSLIB_LIB', 'HTSLIB_LIBFLAGS',
+ 'BOOST_INCLUDE','PTHREAD_LIBFLAGS',
+ 'ZLIB_LIB', 'ZLIB_LIBFLAGS',
+ 'GCC_LIB',
+ 'DAZZ_DB_SRC', 'DAZZ_DB_INCLUDE',
+ 'DALIGNER_SRC', 'DALIGNER_INCLUDE',
+ ])
+ update_env_if(env, envin, possibs)
+ return compose_defs_env(env)
+
+def configure_pacbio(envin, shared, build_dir):
+ content1 = compose_defines_pacbio(envin)
+ if shared:
+ content1 += 'LDLIBS+=-lrt\n'
+ update_content(os.path.join(build_dir, 'defines.mk'), content1)
+
+def get_make_style_env(envin, args):
+ envout = dict()
+ for arg in args:
+ if '=' in arg:
+ k, v = arg.split('=')
+ envout[k] = v
+ envout.update(envin)
+ return envout
+
+class OsType:
+ Unknown, Linux, Darwin = range(3)
+
+def getOsType():
+ uname = shell('uname -s')
+ log('uname=%r' %uname)
+ if 'Darwin' in uname:
+ return OsType.Darwin
+ elif 'Linux' in uname:
+ return OsType.Linux
+ else:
+ return OsType.Unknown
+
+def update_env_for_linux(env):
+ env['SET_LIB_NAME'] = '-soname'
+ env['SH_LIB_EXT'] = '.so'
+ env['EXTRA_LDFLAGS'] = '-Wl,--no-as-needed'
+def update_env_for_darwin(env):
+ env['SET_LIB_NAME'] = '-install_name'
+ env['SH_LIB_EXT'] = '.dylib'
+ env['EXTRA_LDFLAGS'] = '-flat_namespace'
+ # -flat_namespace makes BSD ld act like Linux ld, finding
+ # shared libs recursively.
+def update_env_for_unknown(env):
+ env['SET_LIB_NAME'] = '-soname'
+ env['SH_LIB_EXT'] = '.so'
+update_env_for_os = {
+ OsType.Linux: update_env_for_linux,
+ OsType.Darwin: update_env_for_darwin,
+ OsType.Unknown: update_env_for_unknown,
+}
+
+
+def parse_args(args):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--boost-headers', action='store_true',
+ help='Download Boost headers.')
+ parser.add_argument('--gtest', action='store_true',
+ help='Download google-test.')
+ parser.add_argument('--no-pbbam', action='store_true',
+ help='Avoid compiling anything which would need pbbam.')
+ parser.add_argument('--submodules', action='store_true',
+ help='Set variables to use our git-submodules, which must be pulled and built first. (Implies --no-pbbam.)')
+ parser.add_argument('--shared', action='store_true',
+ help='Build for dynamic linking.')
+ parser.add_argument('--mode', default='opt',
+ help='debug, opt, profile [default=%(default)s] CURRENTLY IGNORED')
+ parser.add_argument('--build-dir',
+ help='Can be different from source directory, but only when *not* also building submodule.')
+ parser.add_argument('makevars', nargs='*',
+ help='Variables in the style of make: FOO=val1 BAR=val2 etc.')
+ return parser.parse_args(args)
+
+def set_defs_defaults(env, nopbbam):
+ defaults = {
+ 'LIBPBDATA_LIBFLAGS': '-lpbdata',
+ 'LIBBLASR_LIBFLAGS': '-lblasr',
+ 'SHELL': 'bash -xe',
+ 'DAZZ_DB_SRC': os.path.join(ROOT, '..', 'DAZZ_DB'),
+ 'DALIGNER_SRC': os.path.join(ROOT, '..', 'DALIGNER'),
+ 'DAZZ_DB_INCLUDE': '${DAZZ_DB_SRC}',
+ 'DALIGNER_INCLUDE': '${DALIGNER_SRC}',
+ 'PTHREAD_LIBFLAGS': '-lpthread',
+ }
+ pbbam_defaults = {
+ 'LIBPBIHDF_LIBFLAGS': '-lpbihdf',
+ 'PBBAM_LIBFLAGS': '-lpbbam',
+ 'HTSLIB_LIBFLAGS': '-lhts',
+ 'HDF5_LIBFLAGS': '-lhdf5_cpp -lhdf5',
+ 'ZLIB_LIBFLAGS': '-lz',
+ 'PTHREAD_LIBFLAGS': '-lpthread',
+ 'DL_LIBFLAGS': '-ldl', # neeeded by HDF5 always
+ }
+ if not nopbbam:
+ defaults.update(pbbam_defaults)
+ for k in defaults:
+ if k not in env:
+ env[k] = defaults[k]
+
+def set_defs_submodule_defaults(env, nopbbam):
+ libcpp = os.path.join(ROOT, 'blasr_libcpp')
+ daligner = os.path.join(ROOT, 'DALIGNER')
+ dazz_db = os.path.join(ROOT, 'DAZZ_DB')
+ defaults = {
+ 'LIBPBDATA_INCLUDE': os.path.join(libcpp, 'pbdata'),
+ 'LIBBLASR_INCLUDE': os.path.join(libcpp, 'alignment'),
+ 'LIBPBIHDF_INCLUDE': '' if nopbbam else os.path.join(libcpp, 'hdf'),
+ 'LIBPBDATA_LIB': os.path.join(libcpp, 'pbdata'),
+ 'LIBBLASR_LIB': os.path.join(libcpp, 'alignment'),
+ 'LIBPBIHDF_LIB': '' if nopbbam else os.path.join(libcpp, 'hdf'),
+ 'DALIGNER_SRC': daligner,
+ 'DAZZ_DB_SRC': dazz_db,
+ }
+ for k in defaults:
+ if k not in env:
+ env[k] = defaults[k]
+
+def write_makefile(build_dir_root, src_dir_root, makefilename, relpath):
+ src_dir = os.path.join(src_dir_root, relpath)
+ build_dir = os.path.join(build_dir_root, relpath)
+ content = """\
+vpath %%.cpp %(src_dir)s
+vpath %%.c %(src_dir)s
+include %(src_dir)s/%(makefilename)s
+""" %dict(makefilename=makefilename, src_dir=src_dir)
+ mkdirs(build_dir)
+ fn = os.path.join(build_dir, makefilename)
+ update_content(fn, content)
+
+def write_makefiles(build_dir):
+ write_makefile(build_dir, ROOT, 'makefile', '.')
+ write_makefile(build_dir, ROOT, 'makefile', 'src/cpp')
+ write_makefile(build_dir, ROOT, 'makefile', 'test/cpp')
+
+def main(prog, *args):
+ """We are still deciding what env-vars to use, if any.
+ """
+ conf = parse_args(args)
+ envin = get_make_style_env(os.environ, conf.makevars)
+ ost = getOsType()
+ update_env_for_os[ost](envin)
+ if conf.build_dir is not None:
+ write_makefiles(conf.build_dir)
+ else:
+ conf.build_dir = '.'
+ conf.build_dir = os.path.abspath(conf.build_dir)
+ if conf.boost_headers:
+ envin['BOOST_INCLUDE'] = fetch_boost_headers(conf.build_dir)
+ if conf.gtest:
+ gtest_dir = fetch_gtest(conf.build_dir)
+ envin['GTEST_INCLUDE'] = os.path.join(gtest_dir, 'include')
+ envin['GTEST_SRC'] = os.path.join(gtest_dir, 'src')
+ if conf.submodules:
+ set_defs_submodule_defaults(envin, conf.no_pbbam)
+ conf.no_pbbam = True
+ set_defs_defaults(envin, conf.no_pbbam)
+ configure_pacbio(envin, conf.shared, conf.build_dir)
+
+
+if __name__=="__main__":
+ main(*sys.argv)
diff --git a/makefile b/makefile
new file mode 100755
index 0000000..ee923da
--- /dev/null
+++ b/makefile
@@ -0,0 +1,45 @@
+.PHONY: all clean test init-submodule cpp cpp-check cpp-clean
+
+THISDIR:=$(dir $(lastword ${MAKEFILE_LIST}))
+ROOT:=${THISDIR}
+-include ${CURDIR}/defines.mk
+
+SHELL = /bin/bash -e
+
+all: cpp
+
+clean: cpp-clean
+
+check: cpp-check
+
+project: init-submodule cpp
+
+init-submodule:
+ ${MAKE} update-submodule
+ ${MAKE} build-submodule
+
+update-submodule:
+ git submodule update --init
+
+build-submodule:
+ cd blasr_libcpp; NOHDF=1 NOPBBAM=1 ./configure.py
+ ${MAKE} -C blasr_libcpp/pbdata libconfig.h
+ ${MAKE} -C blasr_libcpp/pbdata libpbdata.a
+ ${MAKE} -C blasr_libcpp/alignment libblasr.a
+
+submodule-clean:
+ ${RM} -r blasr_libcpp
+
+cpp:
+ ${MAKE} -C src/cpp
+
+cpp-check: cpp
+ ${MAKE} -C test/cpp
+
+cpp-clean:
+ ${MAKE} -C src/cpp clean
+ ${MAKE} -C test/cpp clean
+
+clean-all: cpp-clean submodule-clean
+ ${RM}r src/cpp/third-party/boost_1_58_0-headersonly
+ ${RM}r test/cpp/gtest-1.7.0
diff --git a/src/cpp/Alignment.cpp b/src/cpp/Alignment.cpp
index c039254..e2a340a 100644
--- a/src/cpp/Alignment.cpp
+++ b/src/cpp/Alignment.cpp
@@ -1,39 +1,3 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
#include <iostream>
#include <fstream>
#include <sstream>
@@ -53,27 +17,27 @@ std::string revComp(std::string& seq) {
std::string::iterator curr = seq.begin();
for (; curr != seq.end(); ++curr) {
char& c = *curr;
- c = c == 'T' ? bases[0] :
+ c = c == 'T' ? bases[0] :
c == 'G' ? bases[1] :
- c == 'A' ? bases[2] :
+ c == 'A' ? bases[2] :
c == 'C' ? bases[3] : c;
}
return std::string(seq.rbegin(), seq.rend());
}
// Set this to false if the alignments are grouped by query. The parse
-// routine will be adjusted to build the alignment graph based on the
-// queries.
+// routine will be adjusted to build the alignment graph based on the
+// queries.
bool Alignment::groupByTarget = true;
-Alignment::Alignment() :
- tlen(0),
- start(0),
- end(0),
- id(""),
- sid(""),
- strand('+'),
- qstr(""),
+Alignment::Alignment() :
+ tlen(0),
+ start(0),
+ end(0),
+ id(""),
+ sid(""),
+ strand('+'),
+ qstr(""),
tstr("") { }
// Parses blasr m5 output grouped either by target or query.
@@ -91,11 +55,11 @@ void parseM5(std::istream& stream, Alignment* aln) {
// avoids *some* empty lines
if (fields.size() == 0) return;
- // base query id (without the last '/<coordinates>'), allows us to
+ // base query id (without the last '/<coordinates>'), allows us to
// group properly by query when asked.
std::string baseQid = fields[0].substr(0,fields[0].find_last_of("/"));
aln->sid = fields[0];
- aln->id = Alignment::groupByTarget ? fields[5] : baseQid;
+ aln->id = Alignment::groupByTarget ? fields[5] : baseQid;
std::istringstream ssLen(Alignment::groupByTarget ? fields[6] : fields[1]);
ssLen >> aln->tlen;
@@ -166,15 +130,23 @@ std::ostream& operator<<(std::ostream& ostrm, Alignment& data) {
Alignment normalizeGaps(Alignment& aln, bool push) {
// XXX: optimize this
- size_t qlen = aln.qstr.length(), tlen = aln.tstr.length();
- assert(qlen == tlen);
+ assert(aln.qstr.length() == aln.tstr.length());
+ size_t len = aln.qstr.length();
std::string qNorm, tNorm;
- qNorm.reserve(qlen+100);
- tNorm.reserve(tlen+100);
+ qNorm.reserve(len+100);
+ tNorm.reserve(len+100);
+ std::string qstr = aln.qstr;
+ std::string tstr = aln.tstr;
+
+ // convert dots to dashes
+ for (size_t i=0; i < len; i++) {
+ if ('.' == qstr[i]) qstr[i] = '-';
+ if ('.' == tstr[i]) tstr[i] = '-';
+ }
// convert mismatches to indels
- for (size_t i=0; i < qlen; i++) {
- char qb = aln.qstr[i], tb = aln.tstr[i];
+ for (size_t i=0; i < len; i++) {
+ char qb = qstr[i], tb = tstr[i];
if (qb != tb && qb != '-' && tb != '-') {
qNorm += '-';
qNorm += qb;
@@ -186,19 +158,19 @@ Alignment normalizeGaps(Alignment& aln, bool push) {
}
}
- // update lengths
- qlen = qNorm.length();
- tlen = tNorm.length();
+ // update length
+ assert(qNorm.length() == tNorm.length());
+ len = qNorm.length();
if (push) {
// push gaps to the right, but not past the end
- for (size_t i=0; i < qlen-1; i++) {
+ for (size_t i=0; i < len-1; i++) {
// pushing target gaps
if (tNorm[i] == '-') {
size_t j = i;
- while (true) {
- char c = tNorm[++j];
- if (c != '-' || j > qlen - 1) {
+ while (++j < len) {
+ char c = tNorm[j];
+ if (c != '-') {
if (c == qNorm[i]) {
tNorm[i] = c;
tNorm[j] = '-';
@@ -211,9 +183,9 @@ Alignment normalizeGaps(Alignment& aln, bool push) {
// pushing query gaps
if (qNorm[i] == '-') {
size_t j = i;
- while (true) {
- char c = qNorm[++j];
- if (c != '-' || j > tlen - 1) {
+ while (++j < len) {
+ char c = qNorm[j];
+ if (c != '-') {
if (c == tNorm[i]) {
qNorm[i] = c;
qNorm[j] = '-';
@@ -224,6 +196,8 @@ Alignment normalizeGaps(Alignment& aln, bool push) {
}
}
}
+ assert(qNorm.length() == tNorm.length());
+ assert(len == tNorm.length());
// generate the final, normalized alignment strings
Alignment finalNorm;
@@ -232,7 +206,7 @@ Alignment normalizeGaps(Alignment& aln, bool push) {
finalNorm.start = aln.start;
finalNorm.tlen = aln.tlen;
finalNorm.strand = aln.strand;
- for (size_t i=0; i < qlen; i++) {
+ for (size_t i=0; i < len; i++) {
if (qNorm[i] != '-' || tNorm[i] != '-') {
finalNorm.qstr += qNorm[i];
finalNorm.tstr += tNorm[i];
@@ -243,16 +217,21 @@ Alignment normalizeGaps(Alignment& aln, bool push) {
}
void trimAln(Alignment& aln, int trimLen) {
- int lbases = 0, loffs = 0;
- while(lbases < trimLen) {
+ int lbases, rbases;
+ size_t loffs, roffs;
+ auto const len = aln.tstr.length();
+
+ lbases = 0; loffs = 0U;
+ while(lbases < trimLen && loffs < len) {
if (aln.tstr[loffs++] != '-') {
lbases++;
}
}
- int rbases = 0, roffs = aln.tstr.length();
- while (rbases < trimLen) {
- if (aln.tstr[roffs--] != '-') {
+ rbases = 0;
+ roffs = len;
+ while (rbases < trimLen && roffs > loffs) {
+ if (aln.tstr[--roffs] != '-') {
rbases++;
}
}
diff --git a/src/cpp/Alignment.hpp b/src/cpp/Alignment.hpp
index ff1ae71..ddaa21b 100644
--- a/src/cpp/Alignment.hpp
+++ b/src/cpp/Alignment.hpp
@@ -1,45 +1,8 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
-#ifndef __GCON_ALIGNMENT_HPP__
-#define __GCON_ALIGNMENT_HPP__
+#pragma once
#include <stdint.h>
-///
+///
/// Super-simple alignment representation. Represents an alignment between two
/// PacBio reads, one of which we're trying to correct. The read to correct
/// may be either the target or the query, depending on how the alignment was
@@ -55,7 +18,7 @@ public:
// length of the sequence we are trying to correct
uint32_t tlen;
-
+
// conforming offsets are 1-based
uint32_t start;
@@ -65,7 +28,7 @@ public:
std::string id;
// ID of the supporting read (query)
- std::string sid;
+ std::string sid;
char strand;
@@ -87,7 +50,7 @@ void parseM5(std::istream& stream, dagcon::Alignment* aln);
void parsePre(std::istream& stream, dagcon::Alignment* aln);
/// Simplifies the alignment by normalizing gaps. Converts mismatches into
-/// indels ...
+/// indels ...
/// query: CAC query: C-AC
/// | | ---> | |
/// target: CGC target: CG-C
@@ -100,13 +63,11 @@ void parsePre(std::istream& stream, dagcon::Alignment* aln);
/// Shifts equivalent gaps to the right in the read ...
/// query: -C--CGT query: CCG--T
/// | | | ---> ||| |
-/// target: CCGAC-T target: CCGACT
-/// Allow optional gap pushing, some aligners may not need it and I'd like
+/// target: CCGAC-T target: CCGACT
+/// Allow optional gap pushing, some aligners may not need it and I'd like
/// to get rid of it anyway.
dagcon::Alignment normalizeGaps(dagcon::Alignment& aln, bool push=true);
void trimAln(dagcon::Alignment& aln, int trimLen=50);
-std::string revComp(std::string& seq);
-
-#endif // __GCON_ALIGNMENT_HPP__
+std::string revComp(std::string& seq);
diff --git a/src/cpp/AlnGraphBoost.cpp b/src/cpp/AlnGraphBoost.cpp
index 79ded57..c850039 100644
--- a/src/cpp/AlnGraphBoost.cpp
+++ b/src/cpp/AlnGraphBoost.cpp
@@ -1,39 +1,3 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
#include <cstdint>
#include <cfloat>
#include <cassert>
@@ -53,9 +17,9 @@ AlnGraphBoost::AlnGraphBoost(const std::string& backbone) {
// initialize the graph structure with the backbone length + enter/exit
// vertex
size_t blen = backbone.length();
- _g = G(blen+1);
- for (size_t i = 0; i < blen+1; i++)
- boost::add_edge(i, i+1, _g);
+ _g = G(blen+2);
+ for (size_t i = 0; i < blen+1; i++)
+ boost::add_edge(i, i+1, _g);
VtxIter curr, last;
boost::tie(curr, last) = boost::vertices(_g);
@@ -75,16 +39,16 @@ AlnGraphBoost::AlnGraphBoost(const std::string& backbone) {
}
AlnGraphBoost::AlnGraphBoost(const size_t blen) {
- _g = G(blen+1);
- for (size_t i = 0; i < blen+1; i++)
- boost::add_edge(i, i+1, _g);
+ _g = G(blen+2);
+ for (size_t i = 0; i < blen+1; i++)
+ boost::add_edge(i, i+1, _g);
VtxIter curr, last;
boost::tie(curr, last) = boost::vertices(_g);
_enterVtx = *curr++;
_g[_enterVtx].base = '^';
_g[_enterVtx].backbone = true;
- for (size_t i = 0; i < blen; i++, ++curr) {
+ for (size_t i = 0; i < blen; ++i, ++curr) {
VtxDesc v = *curr;
_g[v].backbone = true;
_g[v].weight = 1;
@@ -104,6 +68,8 @@ void AlnGraphBoost::addAln(dagcon::Alignment& aln) {
VtxDesc prevVtx = _enterVtx;
for (size_t i = 0; i < aln.qstr.length(); i++) {
char queryBase = aln.qstr[i], targetBase = aln.tstr[i];
+ assert(queryBase != '.');
+ assert(targetBase != '.');
VtxDesc currVtx = index[bbPos];
// match
if (queryBase == targetBase) {
@@ -162,7 +128,7 @@ void AlnGraphBoost::addEdge(VtxDesc u, VtxDesc v) {
void AlnGraphBoost::mergeNodes() {
std::queue<VtxDesc> seedNodes;
- seedNodes.push(_enterVtx);
+ seedNodes.push(_enterVtx);
while(true) {
if (seedNodes.size() == 0)
@@ -184,7 +150,7 @@ void AlnGraphBoost::mergeNodes() {
if (_g[*ii].visited == false)
notVisited++;
}
-
+
// move onto the boost::target node after we visit all incoming edges for
// the boost::target node
if (notVisited == 0)
@@ -194,7 +160,7 @@ void AlnGraphBoost::mergeNodes() {
}
void AlnGraphBoost::mergeInNodes(VtxDesc n) {
- std::map<char, std::vector<VtxDesc>> nodeGroups;
+ std::map<char, std::vector<VtxDesc>> nodeGroups;
InEdgeIter ii, ie;
// Group neighboring nodes by base
for(boost::tie(ii, ie) = boost::in_edges(n, _g); ii != ie; ++ii) {
@@ -203,23 +169,23 @@ void AlnGraphBoost::mergeInNodes(VtxDesc n) {
nodeGroups[_g[inNode].base].push_back(inNode);
}
}
-
+
// iterate over node groups, merge an accumulate information
for(auto kvp = nodeGroups.cbegin(); kvp != nodeGroups.end(); ++kvp) {
std::vector<VtxDesc> nodes = (*kvp).second;
- if (nodes.size() <= 1)
+ if (nodes.size() <= 1)
continue;
std::vector<VtxDesc>::const_iterator ni = nodes.cbegin();
- VtxDesc an = *ni++;
+ VtxDesc an = *ni++;
OutEdgeIter anoi, anoe;
boost::tie(anoi, anoe) = boost::out_edges(an, _g);
-
+
// Accumulate out edge information
for (; ni != nodes.cend(); ++ni) {
OutEdgeIter oi, oe;
boost::tie(oi, oe) = boost::out_edges(*ni, _g);
- _g[*anoi].count += _g[*oi].count;
+ _g[*anoi].count += _g[*oi].count;
_g[an].weight += _g[*ni].weight;
}
@@ -249,7 +215,7 @@ void AlnGraphBoost::mergeInNodes(VtxDesc n) {
}
void AlnGraphBoost::mergeOutNodes(VtxDesc n) {
- std::map<char, std::vector<VtxDesc>> nodeGroups;
+ std::map<char, std::vector<VtxDesc>> nodeGroups;
OutEdgeIter oi, oe;
for(boost::tie(oi, oe) = boost::out_edges(n, _g); oi != oe; ++oi) {
VtxDesc outNode = boost::target(*oi, _g);
@@ -257,22 +223,22 @@ void AlnGraphBoost::mergeOutNodes(VtxDesc n) {
nodeGroups[_g[outNode].base].push_back(outNode);
}
}
-
+
for(auto kvp = nodeGroups.cbegin(); kvp != nodeGroups.end(); ++kvp) {
std::vector<VtxDesc> nodes = (*kvp).second;
- if (nodes.size() <= 1)
+ if (nodes.size() <= 1)
continue;
std::vector<VtxDesc>::const_iterator ni = nodes.cbegin();
VtxDesc an = *ni++;
InEdgeIter anii, anie;
boost::tie(anii, anie) = boost::in_edges(an, _g);
-
+
// Accumulate inner edge information
for (; ni != nodes.cend(); ++ni) {
InEdgeIter ii, ie;
boost::tie(ii, ie) = boost::in_edges(*ni, _g);
- _g[*anii].count += _g[*ii].count;
+ _g[*anii].count += _g[*ii].count;
_g[an].weight += _g[*ni].weight;
}
@@ -329,9 +295,9 @@ const std::string AlnGraphBoost::consensus(int minWeight) {
std::vector<AlnNode>::iterator curr = path.begin();
for (; curr != path.end(); ++curr) {
AlnNode n = *curr;
- if (n.base == _g[_enterVtx].base || n.base == _g[_exitVtx].base)
+ if (n.base == _g[_enterVtx].base || n.base == _g[_exitVtx].base)
continue;
-
+
cns += n.base;
// initial beginning of minimum weight section
@@ -340,7 +306,7 @@ const std::string AlnGraphBoost::consensus(int minWeight) {
metWeight = true;
} else if (metWeight && n.weight < minWeight) {
// concluded minimum weight section, update if longest seen so far
- if ((idx - offs) > length) {
+ if ((idx - offs) > length) {
bestOffs = offs;
length = idx - offs;
}
@@ -354,7 +320,7 @@ const std::string AlnGraphBoost::consensus(int minWeight) {
bestOffs = offs;
length = idx - offs;
}
-
+
return cns.substr(bestOffs, length);
}
@@ -373,9 +339,9 @@ void AlnGraphBoost::consensus(std::vector<CnsResult>& seqs, int minWeight, size_
std::vector<AlnNode>::iterator curr = path.begin();
for (; curr != path.end(); ++curr) {
AlnNode n = *curr;
- if (n.base == _g[_enterVtx].base || n.base == _g[_exitVtx].base)
+ if (n.base == _g[_enterVtx].base || n.base == _g[_exitVtx].base)
continue;
-
+
cns += n.base;
// initial beginning of minimum weight section
@@ -425,10 +391,10 @@ const std::vector<AlnNode> AlnGraphBoost::bestPath() {
VtxDesc n = seedNodes.front();
seedNodes.pop();
-
+
bool bestEdgeFound = false;
float bestScore = -FLT_MAX;
- EdgeDesc bestEdgeD = boost::initialized_value;
+ EdgeDesc bestEdgeD = boost::initialized_value;
OutEdgeIter oi, oe;
for(boost::tie(oi, oe) = boost::out_edges(n, _g); oi != oe; ++oi) {
EdgeDesc outEdgeD = *oi;
@@ -445,10 +411,10 @@ const std::vector<AlnNode> AlnGraphBoost::bestPath() {
if (newScore > bestScore) {
bestScore = newScore;
bestEdgeD = outEdgeD;
- bestEdgeFound = true;
+ bestEdgeFound = true;
}
}
-
+
if (bestEdgeFound) {
nodeScore[n]= bestScore;
bestNodeScoreEdge[n] = bestEdgeD;
@@ -465,7 +431,7 @@ const std::vector<AlnNode> AlnGraphBoost::bestPath() {
if (_g[*oi].visited == false)
notVisited++;
}
-
+
// move onto the target node after we visit all incoming edges for
// the target node
if (notVisited == 0)
@@ -494,7 +460,7 @@ const std::vector<AlnNode> AlnGraphBoost::bestPath() {
void AlnGraphBoost::printGraph() {
reapNodes();
- boost::write_graphviz(std::cout, _g,
+ boost::write_graphviz(std::cout, _g,
make_label_writer(get(&AlnNode::base, _g)),
make_label_writer(get(&AlnEdge::count, _g)));
}
@@ -506,11 +472,11 @@ bool AlnGraphBoost::danglingNodes() {
for (;curr != last; ++curr) {
if (_g[*curr].deleted)
continue;
- if (_g[*curr].base == _g[_enterVtx].base || _g[*curr].base == _g[_exitVtx].base)
+ if (_g[*curr].base == _g[_enterVtx].base || _g[*curr].base == _g[_exitVtx].base)
continue;
- int indeg = out_degree(*curr, _g);
- int outdeg = in_degree(*curr, _g);
+ int indeg = out_degree(*curr, _g);
+ int outdeg = in_degree(*curr, _g);
if (outdeg > 0 && indeg > 0) continue;
found = true;
diff --git a/src/cpp/AlnGraphBoost.hpp b/src/cpp/AlnGraphBoost.hpp
index faf759c..6ec2075 100644
--- a/src/cpp/AlnGraphBoost.hpp
+++ b/src/cpp/AlnGraphBoost.hpp
@@ -1,62 +1,25 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
-#ifndef __GCON_ALNGRAPHBOOST_HPP__
-#define __GCON_ALNGRAPHBOOST_HPP__
+#pragma once
/// Alignment graph representation and consensus caller. Based on the original
-/// Python implementation, pbdagcon. This class is modelled after its
+/// Python implementation, pbdagcon. This class is modelled after its
/// aligngraph.py component, which accumulates alignment information into a
/// partial-order graph and then calls consensus. Used to error-correct pacbio
-/// on pacbio reads.
+/// on pacbio reads.
///
/// Implemented using the boost graph library.
-// forward declaration
+// forward declaration
//struct Alignment;
// this allows me to forward-declare properties with graph descriptors as
-// members types
+// members types
typedef boost::adjacency_list<boost::vecS, boost::vecS, boost::bidirectionalS> graphTraits;
-/// Graph vertex property. An alignment node, which represents one base position
+/// Graph vertex property. An alignment node, which represents one base position
/// in the alignment graph.
struct AlnNode {
char base; ///< DNA base: [ACTG]
- int coverage; ///< Number of reads align to this position, but not
+ int coverage; ///< Number of reads align to this position, but not
///< necessarily match
int weight; ///< Number of reads that align to this node *with the same base*, but not
///< necessarily represented in the target.
@@ -69,7 +32,7 @@ struct AlnNode {
coverage = 0;
weight = 0;
backbone = false;
- deleted = false;
+ deleted = false;
}
};
@@ -94,7 +57,7 @@ typedef boost::graph_traits<G>::in_edge_iterator InEdgeIter;
typedef boost::graph_traits<G>::out_edge_iterator OutEdgeIter;
typedef boost::property_map<G, boost::vertex_index_t>::type IndexMap;
-///
+///
/// Simple consensus interface datastructure
///
struct CnsResult {
@@ -102,7 +65,7 @@ struct CnsResult {
std::string seq; ///< Consensus fragment
};
-///
+///
/// Core alignments into consensus algorithm, implemented using the boost graph
/// library. Takes a set of alignments to a reference and builds a higher
/// accuracy (~ 99.9) consensus sequence from it. Designed for use in the HGAP
@@ -129,7 +92,7 @@ public:
/// \param v the 'to' vertex descriptor
void addEdge(VtxDesc u, VtxDesc v);
- /// Collapses degenerate nodes (vertices). Must be called before
+ /// Collapses degenerate nodes (vertices). Must be called before
/// consensus(). Calls mergeInNodes() followed by mergeOutNodes().
void mergeNodes();
@@ -145,14 +108,14 @@ public:
/// \param n the node to remove.
void markForReaper(VtxDesc n);
- /// Removes the set of nodes that have been marked. Modifies graph.
+ /// Removes the set of nodes that have been marked. Modifies graph.
/// Prohibitively expensive when using vecS as the vertex container.
void reapNodes();
/// Generates the consensus from the graph. Must be called after
/// mergeNodes(). Returns the longest contiguous consensus sequence where
/// each base meets the minimum weight requirement.
- /// \param minWeight sets the minimum weight for each base in the consensus.
+ /// \param minWeight sets the minimum weight for each base in the consensus.
/// default = 0
const std::string consensus(int minWeight=0);
@@ -173,10 +136,8 @@ public:
virtual ~AlnGraphBoost();
private:
G _g;
- VtxDesc _enterVtx;
+ VtxDesc _enterVtx;
VtxDesc _exitVtx;
std::map<VtxDesc, VtxDesc> _bbMap;
std::vector<VtxDesc> _reaperBag;
};
-
-#endif // __GCON_ALNGRAPHBOOST_HPP__
diff --git a/src/cpp/AlnProvider.hpp b/src/cpp/AlnProvider.hpp
index 98373d4..0b20d45 100644
--- a/src/cpp/AlnProvider.hpp
+++ b/src/cpp/AlnProvider.hpp
@@ -1,41 +1,4 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
-#ifndef __GCON_ALN_PROVIDER__
-#define __GCON_ALN_PROVIDER__
+#pragma once
#include <exception>
@@ -51,7 +14,7 @@ namespace DagCon {
return desc_;
}
private:
- const char* desc_;
+ const char* desc_;
};
class MemoryException : public std::exception {
@@ -75,11 +38,9 @@ public:
/// \return True if there are more targets, otherwise false.
virtual bool nextTarget(std::vector<dagcon::Alignment>& dest) = 0;
- /// Same as nextTarget(dest), except it also returns the target sequence we are
+ /// Same as nextTarget(dest), except it also returns the target sequence we are
/// going to correct.
- virtual bool nextTarget(std::string& targSeq, std::vector<dagcon::Alignment>& dest) = 0;
+ virtual bool nextTarget(std::string& targSeq, std::vector<dagcon::Alignment>& dest) = 0;
virtual ~AlnProvider() {};
};
-
-#endif //__GCON_ALN_PROVIDER__
diff --git a/src/cpp/BlasrM5AlnProvider.cpp b/src/cpp/BlasrM5AlnProvider.cpp
index 6b71b7c..600560a 100644
--- a/src/cpp/BlasrM5AlnProvider.cpp
+++ b/src/cpp/BlasrM5AlnProvider.cpp
@@ -1,39 +1,3 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
#include <vector>
#include <iostream>
#include <istream>
@@ -74,7 +38,7 @@ bool BlasrM5AlnProvider::nextTarget(std::vector<dagcon::Alignment>& dest) {
// process up to EOF or next target
// need to maintain state in between calls
if (! firstAln_)
- dest.push_back(prevAln_);
+ dest.push_back(prevAln_);
dagcon::Alignment aln;
while (*is_ >> aln) {
@@ -122,7 +86,7 @@ void BlasrM5AlnProvider::checkFormat() {
dagcon::Alignment aln;
std::vector<std::string> raw, sorted;
int max = 50, count = 0;
- while(ifs >> aln && count++ < max)
+ while(ifs >> aln && count++ < max)
raw.push_back(aln.id);
sorted = raw;
diff --git a/src/cpp/BlasrM5AlnProvider.hpp b/src/cpp/BlasrM5AlnProvider.hpp
index 087555a..c9e9fe1 100644
--- a/src/cpp/BlasrM5AlnProvider.hpp
+++ b/src/cpp/BlasrM5AlnProvider.hpp
@@ -1,41 +1,4 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
-#ifndef __GCON_BLASRM5_ALN_PROVIDER__
-#define __GCON_BLASRM5_ALN_PROVIDER__
+#pragma once
#include "AlnProvider.hpp"
@@ -54,10 +17,10 @@ namespace M5Exception {
}
///
-/// Provides sets of alignments for a given target sequence from a blasr M5
+/// Provides sets of alignments for a given target sequence from a blasr M5
/// file. File may be grouped by target or query. The grouping determines
/// which set gets corrected. Earlier, pre-assembly reads were corrected as
-/// targets. However, we can avoid the sort step if we can correct the reads
+/// targets. However, we can avoid the sort step if we can correct the reads
/// as queries, since blasr groups alignments by query.
///
class BlasrM5AlnProvider : public AlnProvider {
@@ -71,7 +34,7 @@ public:
/// are actually made on the validity of the format, caveat emptor. This
/// can be used to take a piped stream of alignments straight from blasr.
BlasrM5AlnProvider(std::istream* stream);
-
+
/// Cleans up some stuff.
~BlasrM5AlnProvider();
@@ -81,11 +44,11 @@ public:
/// \param dest reference to a vector to hold the alignments.
/// \return True if there are more targets, otherwise false.
bool nextTarget(std::vector<dagcon::Alignment>& dest);
-
+
/// Same as \fn bool nextTarget(std::vector<dagcon::Alignment>& dest) except it
/// also returns the target sequence we are going to correct.
bool nextTarget(std::string& targetSeq, std::vector<dagcon::Alignment>& dest);
-
+
/// Called during constructor, checks that the file is formatted correctly.
/// Also determines if the input is grouped by query or target.
void checkFormat();
@@ -94,14 +57,12 @@ private:
/// Path to the input file
const std::string fpath_;
- /// State variables
+ /// State variables
std::string currId_;
dagcon::Alignment prevAln_;
bool firstAln_;
-
+
/// Represents an input stream to the alignments.
std::ifstream fs_;
std::istream* is_;
};
-
-#endif //__GCON_BLASRM5_ALN_PROVIDER__
diff --git a/src/cpp/BoundedBuffer.hpp b/src/cpp/BoundedBuffer.hpp
index 394b50a..74a096d 100644
--- a/src/cpp/BoundedBuffer.hpp
+++ b/src/cpp/BoundedBuffer.hpp
@@ -1,40 +1,4 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-#ifndef __GCON_BOUNDEDBUFFER__
-#define __GCON_BOUNDEDBUFFER__
+#pragma once
#include <deque>
#include <condition_variable>
@@ -45,8 +9,8 @@
#include "Alignment.hpp"
///
-/// Templated, thread-safe buffer container, uses uses boost::circular buffer
-/// bounded by a given capacity specified by the caller. When the buffer is
+/// Templated, thread-safe buffer container, uses boost::circular buffer
+/// bounded by a given capacity specified by the caller. When the buffer is
/// full, the push waits for an open spot. When the buffer is empty, the pop
/// waits for an item to be present. Condition variables are used to signal
/// the state of the buffer.
@@ -56,18 +20,18 @@ class BoundedBuffer {
public:
typedef std::deque<T> buffer_type;
- BoundedBuffer(int max) : max_(max) { }
+ BoundedBuffer(size_t max) : max_(max) { }
void push(T item) {
std::unique_lock<std::mutex> lock(mutex_);
- not_full_.wait(lock, [this](){return buffer_.size() != max_;});
+ not_full_.wait(lock, [this](){return buffer_.size() != max_;});
buffer_.push_front(item);
not_empty_.notify_one();
}
void pop(T* pItem) {
std::unique_lock<std::mutex> lock(mutex_);
- not_empty_.wait(lock, [this](){return buffer_.size() != 0;});
+ not_empty_.wait(lock, [this](){return buffer_.size() != 0U;});
*pItem = buffer_.back();
buffer_.pop_back();
not_full_.notify_one();
@@ -78,12 +42,9 @@ public:
}
private:
- int max_;
+ size_t const max_;
buffer_type buffer_;
std::mutex mutex_;
std::condition_variable not_empty_;
std::condition_variable not_full_;
};
-
-#endif // __GCON_BOUNDEDBUFFER__
-
diff --git a/src/cpp/DB.c b/src/cpp/DB.c
deleted file mode 100644
index 95f073d..0000000
--- a/src/cpp/DB.c
+++ /dev/null
@@ -1,1106 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/*******************************************************************************************
- *
- * Compressed data base module. Auxiliary routines to open and manipulate a data base for
- * which the sequence and read information are separated into two separate files, and the
- * sequence is compressed into 2-bits for each base. Support for tracks of additional
- * information, and trimming according to the current partition. Eventually will also
- * support compressed quality information.
- *
- * Author : Gene Myers
- * Date : July 2013
- * Revised: April 2014
- *
- ********************************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <unistd.h>
-#include <dirent.h>
-
-#include "DB.h"
-
-#ifdef HIDE_FILES
-#define PATHSEP "/."
-#else
-#define PATHSEP "/"
-#endif
-
-
-/*******************************************************************************************
- *
- * GENERAL UTILITIES
- *
- ********************************************************************************************/
-
-char *Prog_Name;
-
-void *Malloc(int64 size, char *mesg)
-{ void *p;
-
- if ((p = malloc(size)) == NULL)
- { if (mesg == NULL)
- fprintf(stderr,"%s: Out of memory\n",Prog_Name);
- else
- fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg);
- }
- return (p);
-}
-
-void *Realloc(void *p, int64 size, char *mesg)
-{ if ((p = realloc(p,size)) == NULL)
- { if (mesg == NULL)
- fprintf(stderr,"%s: Out of memory\n",Prog_Name);
- else
- fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg);
- }
- return (p);
-}
-
-char *Strdup(char *name, char *mesg)
-{ char *s;
-
- if (name == NULL)
- return (NULL);
- if ((s = strdup(name)) == NULL)
- { if (mesg == NULL)
- fprintf(stderr,"%s: Out of memory\n",Prog_Name);
- else
- fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg);
- }
- return (s);
-}
-
-FILE *Fopen(char *name, char *mode)
-{ FILE *f;
-
- if (name == NULL || mode == NULL)
- return (NULL);
- if ((f = fopen(name,mode)) == NULL)
- fprintf(stderr,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode);
- return (f);
-}
-
-char *PathTo(char *name)
-{ char *path, *find;
-
- if (name == NULL)
- return (NULL);
- if ((find = rindex(name,'/')) != NULL)
- { *find = '\0';
- path = Strdup(name,"Extracting path from");
- *find = '/';
- }
- else
- path = Strdup(".","Allocating default path");
- return (path);
-}
-
-char *Root(char *name, char *suffix)
-{ char *path, *find, *dot;
- int epos;
-
- if (name == NULL)
- return (NULL);
- find = rindex(name,'/');
- if (find == NULL)
- find = name;
- else
- find += 1;
- if (suffix == NULL)
- { dot = strchr(find,'.');
- if (dot != NULL)
- *dot = '\0';
- path = Strdup(find,"Extracting root from");
- if (dot != NULL)
- *dot = '.';
- }
- else
- { epos = strlen(find);
- epos -= strlen(suffix);
- if (epos > 0 && strcasecmp(find+epos,suffix) == 0)
- { find[epos] = '\0';
- path = Strdup(find,"Extracting root from");
- find[epos] = suffix[0];
- }
- else
- path = Strdup(find,"Allocating root");
- }
- return (path);
-}
-
-char *Catenate(char *path, char *sep, char *root, char *suffix)
-{ static char *cat = NULL;
- static int max = -1;
- int len;
-
- if (path == NULL || root == NULL || sep == NULL || suffix == NULL)
- return (NULL);
- len = strlen(path);
- len += strlen(sep);
- len += strlen(root);
- len += strlen(suffix);
- if (len > max)
- { max = ((int) (1.2*len)) + 100;
- if ((cat = (char *) realloc(cat,max+1)) == NULL)
- { fprintf(stderr,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root);
- return (NULL);
- }
- }
- sprintf(cat,"%s%s%s%s",path,sep,root,suffix);
- return (cat);
-}
-
-char *Numbered_Suffix(char *left, int num, char *right)
-{ static char *suffix = NULL;
- static int max = -1;
- int len;
-
- if (left == NULL || right == NULL)
- return (NULL);
- len = strlen(left);
- len += strlen(right) + 40;
- if (len > max)
- { max = ((int) (1.2*len)) + 100;
- if ((suffix = (char *) realloc(suffix,max+1)) == NULL)
- { fprintf(stderr,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num);
- return (NULL);
- }
- }
- sprintf(suffix,"%s%d%s",left,num,right);
- return (suffix);
-}
-
-
-#define COMMA ','
-
-// Print big integers with commas/periods for better readability
-
-void Print_Number(int64 num, int width, FILE *out)
-{ if (width == 0)
- { if (num < 1000ll)
- fprintf(out,"%lld",num);
- else if (num < 1000000ll)
- fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll);
- else if (num < 1000000000ll)
- fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,
- COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll);
- else
- fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,
- COMMA,(num%1000000000ll)/1000000ll,
- COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll);
- }
- else
- { if (num < 1000ll)
- fprintf(out,"%*lld",width,num);
- else if (num < 1000000ll)
- { if (width <= 4)
- fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll);
- else
- fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll);
- }
- else if (num < 1000000000ll)
- { if (width <= 8)
- fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll,
- COMMA,num%1000ll);
- else
- fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll,
- COMMA,num%1000ll);
- }
- else
- { if (width <= 12)
- fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA,
- (num%1000000000ll)/1000000ll,COMMA,
- (num%1000000ll)/1000ll,COMMA,num%1000ll);
- else
- fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA,
- (num%1000000000ll)/1000000ll,COMMA,
- (num%1000000ll)/1000ll,COMMA,num%1000ll);
- }
- }
-}
-
-// Return the number of digits, base 10, of num
-
-int Number_Digits(int64 num)
-{ int digit;
-
- digit = 0;
- while (num >= 1)
- { num /= 10;
- digit += 1;
- }
- return (digit);
-}
-
-
-/*******************************************************************************************
- *
- * READ COMPRESSION/DECOMPRESSION UTILITIES
- *
- ********************************************************************************************/
-
-// Compress read into 2-bits per base (from [0-3] per byte representation)
-
-void Compress_Read(int len, char *s)
-{ int i;
- char c, d;
- char *s0, *s1, *s2, *s3;
-
- s0 = s;
- s1 = s0+1;
- s2 = s1+1;
- s3 = s2+1;
-
- c = s1[len];
- d = s2[len];
- s0[len] = s1[len] = s2[len] = 0;
-
- for (i = 0; i < len; i += 4)
- *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]);
-
- s1[len] = c;
- s2[len] = d;
-}
-
-// Uncompress read from 2-bits per base into [0-3] per byte representation
-
-void Uncompress_Read(int len, char *s)
-{ int i, tlen, byte;
- char *s0, *s1, *s2, *s3;
- char *t;
-
- s0 = s;
- s1 = s0+1;
- s2 = s1+1;
- s3 = s2+1;
-
- tlen = (len-1)/4;
-
- t = s+tlen;
- for (i = tlen*4; i >= 0; i -= 4)
- { byte = *t--;
- s0[i] = (char) ((byte >> 6) & 0x3);
- s1[i] = (char) ((byte >> 4) & 0x3);
- s2[i] = (char) ((byte >> 2) & 0x3);
- s3[i] = (char) (byte & 0x3);
- }
- s[len] = 4;
-}
-
-// Convert read in [0-3] representation to ascii representation (end with '\0')
-
-void Lower_Read(char *s)
-{ static char letter[4] = { 'a', 'c', 'g', 't' };
-
- for ( ; *s != 4; s++)
- *s = letter[(int) *s];
- *s = '\0';
-}
-
-void Upper_Read(char *s)
-{ static char letter[4] = { 'A', 'C', 'G', 'T' };
-
- for ( ; *s != 4; s++)
- *s = letter[(int) *s];
- *s = '\0';
-}
-
-// Convert read in ascii representation to [0-3] representation (end with 4)
-
-void Number_Read(char *s)
-{ static char number[128] =
- { 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 0, 0, 0, 2,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 3, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 0, 0, 0, 2,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 3, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- };
-
- for ( ; *s != '\0'; s++)
- *s = number[(int) *s];
- *s = 4;
-}
-
-
-/*******************************************************************************************
- *
- * DB OPEN, TRIM & CLOSE ROUTINES
- *
- ********************************************************************************************/
-
-
-// Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
-// a part # in it then just the part is opened. The index array is allocated (for all or
-// just the part) and read in.
-// Return status of routine:
-// -1: The DB could not be opened for a reason reported by the routine to stderr
-// 0: Open of DB proceeded without mishap
-// 1: Open of DAM proceeded without mishap
-
-int Open_DB(char* path, HITS_DB *db)
-{ char *root, *pwd, *bptr, *fptr, *cat;
- int nreads;
- FILE *index, *dbvis;
- int status, plen, isdam;
- int part, cutoff, all;
- int ofirst, bfirst, olast;
-
-
- plen = strlen(path);
- if (strcmp(path+(plen-4),".dam") == 0)
- root = Root(path,".dam");
- else
- root = Root(path,".db");
- pwd = PathTo(path);
-
- bptr = rindex(root,'.');
- if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-')
- { part = strtol(bptr+1,&fptr,10);
- if (*fptr != '\0' || part == 0)
- part = 0;
- else
- *bptr = '\0';
- }
- else
- part = 0;
-
- isdam = 0;
- cat = Catenate(pwd,"/",root,".db");
- if (cat == NULL)
- exit (1);
- if ((dbvis = fopen(cat,"r")) == NULL)
- { cat = Catenate(pwd,"/",root,".dam");
- if (cat == NULL)
- exit (1);
- if ((dbvis = fopen(cat,"r")) == NULL)
- { status = -1;
- fprintf(stderr,"%s: Could not open database %s\n",Prog_Name,path);
- goto exit;
- }
- isdam = 1;
- }
-
- if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"rm")) == NULL)
- { status = -1;
- goto exit1;
- }
- if (fread(db,sizeof(HITS_DB),1,index) != 1)
- SYSTEM_ERROR
- nreads = db->oreads;
-
- { int p, nblocks, nfiles, blast;
- int64 size;
- char fname[MAX_NAME], prolog[MAX_NAME];
-
- nblocks = 0;
- if (fscanf(dbvis,DB_NFILE,&nfiles) != 1)
- SYSTEM_ERROR
- for (p = 0; p < nfiles; p++)
- if (fscanf(dbvis,DB_FDATA,&blast,fname,prolog) != 3)
- SYSTEM_ERROR
- if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1)
- if (part == 0)
- { cutoff = 0;
- all = 1;
- }
- else
- { fprintf(stderr,"%s: DB %s has not yet been partitioned, cannot request a block !\n",
- Prog_Name,root);
- status = -1;
- goto exit2;
- }
- else
- { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3)
- SYSTEM_ERROR
- if (part > nblocks)
- { fprintf(stderr,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks);
- status = -1;
- goto exit2;
- }
- }
-
- if (part > 0)
- { for (p = 1; p <= part; p++)
- if (fscanf(dbvis,DB_BDATA,&ofirst,&bfirst) != 2)
- SYSTEM_ERROR
- if (fscanf(dbvis,DB_BDATA,&olast,&blast) != 2)
- SYSTEM_ERROR
- }
- else
- { ofirst = bfirst = 0;
- olast = nreads;
- }
- }
-
- db->trimmed = 0;
- db->tracks = NULL;
- db->part = part;
- db->cutoff = cutoff;
- db->all = all;
- db->ofirst = ofirst;
- db->bfirst = bfirst;
-
- if (part <= 0)
- { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+1),"Allocating Open_DB index");
- if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
- SYSTEM_ERROR
- }
- else
- { HITS_READ *reads;
- int i, r, maxlen;
- int64 totlen;
-
- nreads = olast-ofirst;
- reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+1),"Allocating Open_DB index");
-
- fseeko(index,sizeof(HITS_READ)*ofirst,SEEK_CUR);
- if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
- SYSTEM_ERROR
-
- totlen = 0;
- maxlen = 0;
- for (i = 0; i < nreads; i++)
- { r = reads[i].rlen;
- totlen += r;
- if (r > maxlen)
- maxlen = r;
- }
-
- db->maxlen = maxlen;
- db->totlen = totlen;
- db->reads = reads;
- }
-
- db->nreads = nreads;
- db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path");
- db->bases = NULL;
- db->loaded = 0;
-
- status = isdam;
-
-exit2:
- fclose(index);
-exit1:
- fclose(dbvis);
-exit:
- if (bptr != NULL)
- *bptr = '.';
-
- free(pwd);
- free(root);
-
- return (status);
-}
-
-
-// Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
-// of the current DB partition. Reallocate smaller memory blocks for the information kept
-// for the retained reads.
-
-void Trim_DB(HITS_DB *db)
-{ int i, j, r;
- int allflag, cutoff;
- int64 totlen;
- int maxlen, nreads;
- HITS_TRACK *record;
- HITS_READ *reads;
-
- if (db->trimmed) return;
-
- if (db->cutoff <= 0 && db->all) return;
-
- cutoff = db->cutoff;
- if (db->all)
- allflag = 0;
- else
- allflag = DB_BEST;
-
- reads = db->reads;
- nreads = db->nreads;
-
- for (record = db->tracks; record != NULL; record = record->next)
- { int *anno4, size;
- int64 *anno8;
- char *anno, *data;
-
- size = record->size;
- data = (char *) record->data;
- if (data == NULL)
- { anno = (char *) record->anno;
- j = 0;
- for (i = r = 0; i < db->nreads; i++, r += size)
- if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
- { memmove(anno+j,anno+r,size);
- j += size;
- }
- memmove(anno+j,anno+r,size);
- }
- else if (size == 4)
- { int ai;
-
- anno4 = (int *) (record->anno);
- j = anno4[0] = 0;
- for (i = 0; i < db->nreads; i++)
- if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
- { ai = anno4[i];
- anno4[j+1] = anno4[j] + (anno4[i+1]-ai);
- memmove(data+anno4[j],data+ai,anno4[i+1]-ai);
- j += 1;
- }
- record->data = Realloc(record->data,anno4[j],NULL);
- }
- else // size == 8
- { int64 ai;
-
- anno8 = (int64 *) (record->anno);
- j = anno8[0] = 0;
- for (i = 0; i < db->nreads; i++)
- if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
- { ai = anno8[i];
- anno8[j+1] = anno8[j] + (anno8[i+1]-ai);
- memmove(data+anno8[j],data+ai,anno8[i+1]-ai);
- j += 1;
- }
- record->data = Realloc(record->data,anno8[j],NULL);
- }
- record->anno = Realloc(record->anno,record->size*(j+1),NULL);
- }
-
- totlen = maxlen = 0;
- for (j = i = 0; i < nreads; i++)
- { r = reads[i].rlen;
- if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff)
- { totlen += r;
- if (r > maxlen)
- maxlen = r;
- reads[j++] = reads[i];
- }
- }
-
- db->totlen = totlen;
- db->maxlen = maxlen;
- db->nreads = j;
- db->trimmed = 1;
-
- if (j < nreads)
- db->reads = Realloc(reads,sizeof(HITS_READ)*(j+1),NULL);
-}
-
-// Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
-// and any open file pointers. The record pointed at by db however remains (the user
-// supplied it and so should free it).
-
-void Close_DB(HITS_DB *db)
-{ HITS_TRACK *t, *p;
-
- if (db->loaded)
- free(((char *) (db->bases)) - 1);
- else if (db->bases != NULL)
- fclose((FILE *) db->bases);
- free(db->reads);
- free(db->path);
-
- for (t = db->tracks; t != NULL; t = p)
- { p = t->next;
- free(t->anno);
- free(t->data);
- free(t);
- }
-}
-
-/*******************************************************************************************
- *
- * TRACK LOAD & CLOSE ROUTINES
- *
- ********************************************************************************************/
-
-// Return status of track:
-// 1: Track is for trimmed DB
-// 0: Track is for untrimmed DB
-// -1: Track is not the right size of DB either trimmed or untrimmed
-// -2: Could not find the track
-
-int Check_Track(HITS_DB *db, char *track)
-{ FILE *afile;
- int tracklen;
-
- afile = fopen(Catenate(db->path,".",track,".anno"),"r");
- if (afile == NULL)
- return (-2);
-
- if (fread(&tracklen,sizeof(int),1,afile) != 1)
- SYSTEM_ERROR
-
- fclose(afile);
-
- if (tracklen == db->breads)
- return (1);
- else if (tracklen == db->oreads)
- return (0);
- else
- return (-1);
-}
-
-// If track is not already in the db's track list, then allocate all the storage for it,
-// read it in from the appropriate file, add it to the track list, and return a pointer
-// to the newly created HITS_TRACK record. If the track does not exist or cannot be
-// opened for some reason, then NULL is returned.
-
-HITS_TRACK *Load_Track(HITS_DB *db, char *track)
-{ FILE *afile, *dfile;
- int tracklen, size;
- int nreads;
- void *anno;
- void *data;
- HITS_TRACK *record;
-
- if (track[0] == '.')
- { fprintf(stderr,"Track names cannot begin with a .\n");
- exit (1);
- }
-
- for (record = db->tracks; record != NULL; record = record->next)
- if (strcmp(record->name,track) == 0)
- return (record);
-
- afile = fopen(Catenate(db->path,".",track,".anno"),"r");
- if (afile == NULL)
- return (NULL);
- dfile = fopen(Catenate(db->path,".",track,".data"),"r");
-
- if (fread(&tracklen,sizeof(int),1,afile) != 1)
- SYSTEM_ERROR
- if (fread(&size,sizeof(int),1,afile) != 1)
- SYSTEM_ERROR
-
- if (db->trimmed)
- { if (tracklen != db->breads)
- { fprintf(stderr,"%s: Track %s not same size as database !\n",Prog_Name,track);
- exit (1);
- }
- if (db->part > 0)
- fseeko(afile,size*db->bfirst,SEEK_CUR);
- }
- else
- { if (tracklen != db->oreads)
- { fprintf(stderr,"%s: Track %s not same size as database !\n",Prog_Name,track);
- exit (1);
- }
- if (db->part > 0)
- fseeko(afile,size*db->ofirst,SEEK_CUR);
- }
- nreads = db->nreads;
-
- anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
-
- if (size > 0)
- { if (dfile == NULL)
- { if (fread(anno,size,nreads,afile) != (size_t) nreads)
- SYSTEM_ERROR
- }
- else
- { if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1))
- SYSTEM_ERROR
- }
- }
- else
- SYSTEM_ERROR
-
- if (dfile != NULL)
- { int64 *anno8, off8, dlen;
- int *anno4, off4;
- int i;
-
- if (size == 4)
- { anno4 = (int *) anno;
- off4 = anno4[0];
- if (off4 != 0)
- { for (i = 0; i <= nreads; i++)
- anno4[i] -= off4;
- fseeko(dfile,off4,SEEK_SET);
- }
- dlen = anno4[nreads];
- data = (void *) Malloc(dlen,"Allocating Track Data Vector");
- }
- else
- { anno8 = (int64 *) anno;
- off8 = anno8[0];
- if (off8 != 0)
- { for (i = 0; i <= nreads; i++)
- anno8[i] -= off8;
- fseeko(dfile,off8,SEEK_SET);
- }
- dlen = anno8[nreads];
- data = (void *) Malloc(dlen,"Allocating Track Data Vector");
- }
- if (dlen > 0)
- { if (fread(data,dlen,1,dfile) != 1)
- SYSTEM_ERROR
- }
- fclose(dfile);
- }
- else
- data = NULL;
-
- fclose(afile);
-
- record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record");
- record->name = Strdup(track,"Allocating Track Name");
- record->data = data;
- record->anno = anno;
- record->size = size;
-
- if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0)
- { record->next = db->tracks->next;
- db->tracks->next = record;
- }
- else
- { record->next = db->tracks;
- db->tracks = record;
- }
-
- return (record);
-}
-
-void Close_Track(HITS_DB *db, char *track)
-{ HITS_TRACK *record, *prev;
-
- prev = NULL;
- for (record = db->tracks; record != NULL; record = record->next)
- { if (strcmp(record->name,track) == 0)
- { free(record->anno);
- free(record->data);
- free(record->name);
- if (prev == NULL)
- db->tracks = record->next;
- else
- prev->next = record->next;
- free(record);
- return;
- }
- prev = record;
- }
- return;
-}
-
-
-/*******************************************************************************************
- *
- * READ BUFFER ALLOCATION AND READ ACCESS
- *
- ********************************************************************************************/
-
-// Allocate and return a buffer big enough for the largest read in 'db', leaving room
-// for an initial delimiter character
-
-char *New_Read_Buffer(HITS_DB *db)
-{ char *read;
-
- read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer");
- if (read == NULL)
- exit (1);
- return (read+1);
-}
-
-// Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a
-// lower-case ASCII string if ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and
-// 3(T) otherwise.
-//
-// **NB**, the byte before read will be set to a delimiter character!
-
-void Load_Read(HITS_DB *db, int i, char *read, int ascii)
-{ FILE *bases = (FILE *) db->bases;
- int64 off;
- int len, clen;
- HITS_READ *r = db->reads;
-
- if (bases == NULL)
- { db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r"));
- if (bases == NULL)
- exit (1);
- }
- if (i >= db->nreads)
- { fprintf(stderr,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
- exit (1);
- }
-
- off = r[i].boff;
- len = r[i].rlen;
-
- if (ftello(bases) != off)
- fseeko(bases,off,SEEK_SET);
- clen = COMPRESSED_LEN(len);
- if (clen > 0)
- { if (fread(read,clen,1,bases) != 1)
- SYSTEM_ERROR
- }
- Uncompress_Read(len,read);
- if (ascii == 1)
- { Lower_Read(read);
- read[-1] = '\0';
- }
- else if (ascii == 2)
- { Upper_Read(read);
- read[-1] = '\0';
- }
- else
- read[-1] = 4;
-}
-
-char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
-{ FILE *bases = (FILE *) db->bases;
- int64 off;
- int len, clen;
- int bbeg, bend;
- HITS_READ *r = db->reads;
-
- if (bases == NULL)
- { db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"rm"));
- if (bases == NULL)
- exit (1);
- }
- if (i >= db->nreads)
- { fprintf(stderr,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
- exit (1);
- }
-
- bbeg = beg/4;
- bend = (end-1)/4+1;
-
- off = r[i].boff + bbeg;
- len = end - beg;
-
- if (ftello(bases) != off)
- fseeko(bases,off,SEEK_SET);
- clen = bend-bbeg;
- if (clen > 0)
- { if (fread(read,clen,1,bases) != 1)
- SYSTEM_ERROR
- }
- Uncompress_Read(4*clen,read);
- read += beg%4;
- read[len] = 4;
- if (ascii == 1)
- { Lower_Read(read);
- read[-1] = '\0';
- }
- else if (ascii == 2)
- { Upper_Read(read);
- read[-1] = '\0';
- }
- else
- read[-1] = 4;
-
- return (read);
-}
-
-
-/*******************************************************************************************
- *
- * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER)
- *
- ********************************************************************************************/
-
-// Allocate a block big enough for all the uncompressed sequences, read them into it,
-// reset the 'off' in each read record to be its in-memory offset, and set the
-// bases pointer to point at the block after closing the bases file. If ascii is 1
-// the reads are converted to lowercase ascii, if any other non-zero value to uppercase
-// ascii, and otherwise they are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
-
-void Read_All_Sequences(HITS_DB *db, int ascii)
-{ FILE *bases = (FILE *) db->bases;
- int nreads = db->nreads;
- HITS_READ *reads = db->reads;
- void (*translate)(char *s);
-
- char *seq;
- int64 o, off;
- int i, len, clen;
-
- if (bases == NULL)
- db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r"));
- else
- rewind(bases);
-
- seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads");
-
- *seq++ = 4;
-
- if (ascii == 1)
- translate = Lower_Read;
- else
- translate = Upper_Read;
-
- o = 0;
- for (i = 0; i < nreads; i++)
- { len = reads[i].rlen;
- off = reads[i].boff;
- if (ftello(bases) != off)
- fseeko(bases,off,SEEK_SET);
- clen = COMPRESSED_LEN(len);
- if (clen > 0)
- { if (fread(seq+o,clen,1,bases) != 1)
- SYSTEM_ERROR
- }
- Uncompress_Read(len,seq+o);
- if (ascii)
- translate(seq+o);
- reads[i].boff = o;
- o += (len+1);
- }
- reads[nreads].boff = o;
-
- fclose(bases);
-
- db->bases = (void *) seq;
- db->loaded = 1;
-}
-
-int List_DB_Files(char *path, void foreach(char *path, char *extension))
-{ int status, rlen, dlen;
- char *root, *pwd, *name;
- int isdam;
- DIR *dirp;
- struct dirent *dp;
-
- status = 0;
- pwd = PathTo(path);
- root = Root(path,".db");
- rlen = strlen(root);
-
- if (root == NULL || pwd == NULL)
- { status = 1;
- goto exit;
- }
-
- if ((dirp = opendir(pwd)) == NULL)
- { status = 1;
- goto exit;
- }
-
- isdam = 0;
- while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary)
- { name = dp->d_name;
- if (strcmp(name,Catenate("","",root,".db")) == 0)
- break;
- if (strcmp(name,Catenate("","",root,".dam")) == 0)
- { isdam = 1;
- break;
- }
- if (strcasecmp(name,Catenate("","",root,".db")) == 0)
- { strncpy(root,name,rlen);
- break;
- }
- if (strcasecmp(name,Catenate("","",root,".dam")) == 0)
- { strncpy(root,name,rlen);
- isdam = 1;
- break;
- }
- }
- if (dp == NULL)
- { status = 1;
- closedir(dirp);
- goto exit;
- }
-
- if (isdam)
- foreach(Catenate(pwd,"/",root,".dam"),"dam");
- else
- foreach(Catenate(pwd,"/",root,".db"),"db");
-
- rewinddir(dirp); // Report each auxiliary file
- while ((dp = readdir(dirp)) != NULL)
- { name = dp->d_name;
- dlen = strlen(name);
-#ifdef HIDE_FILES
- if (name[0] != '.')
- continue;
- dlen -= 1;
- name += 1;
-#endif
- if (dlen < rlen+1)
- continue;
- if (name[rlen] != '.')
- continue;
- if (strncmp(name,root,rlen) != 0)
- continue;
- foreach(Catenate(pwd,PATHSEP,name,""),name+(rlen+1));
- }
- closedir(dirp);
-
-exit:
- free(pwd);
- free(root);
- return (status);
-}
-
-void Print_Read(char *s, int width)
-{ int i;
-
- if (s[0] < 4)
- { for (i = 0; s[i] != 4; i++)
- { if (i%width == 0 && i != 0)
- printf("\n");
- printf("%d",s[i]);
- }
- printf("\n");
- }
- else
- { for (i = 0; s[i] != '\0'; i++)
- { if (i%width == 0 && i != 0)
- printf("\n");
- printf("%c",s[i]);
- }
- printf("\n");
- }
-}
diff --git a/src/cpp/DB.h b/src/cpp/DB.h
deleted file mode 100644
index 16f57ba..0000000
--- a/src/cpp/DB.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/*******************************************************************************************
- *
- * Compressed data base module. Auxiliary routines to open and manipulate a data base for
- * which the sequence and read information are separated into two separate files, and the
- * sequence is compressed into 2-bits for each base. Support for tracks of additional
- * information, and trimming according to the current partition. Eventually will also
- * support compressed quality information.
- *
- * Author : Gene Myers
- * Date : July 2013
- * Revised: April 2014
- *
- ********************************************************************************************/
-
-#ifndef _HITS_DB
-
-#define _HITS_DB
-
-#include <stdio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned char uint8;
-typedef unsigned short uint16;
-typedef unsigned int uint32;
-typedef unsigned long long uint64;
-typedef signed char int8;
-typedef signed short int16;
-typedef signed int int32;
-typedef signed long long int64;
-typedef float float32;
-typedef double float64;
-
-#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden"
- // Undefine if you don't want this
-
-
-/*******************************************************************************************
- *
- * COMMAND LINE INTERPRETATION MACROS
- *
- ********************************************************************************************/
-
-extern char *Prog_Name; // Name of program
-
-#define SYSTEM_ERROR \
- { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \
- exit (2); \
- }
-
-#define ARG_INIT(name) \
- Prog_Name = Strdup(name,""); \
- for (i = 0; i < 128; i++) \
- flags[i] = 0;
-
-#define ARG_FLAGS(set) \
- for (k = 1; argv[i][k] != '\0'; k++) \
- { if (index(set,argv[i][k]) == NULL) \
- { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \
- exit (1); \
- } \
- flags[(int) argv[i][k]] = 1; \
- }
-
-#define ARG_POSITIVE(var,name) \
- var = strtol(argv[i]+2,&eptr,10); \
- if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
- exit (1); \
- } \
- if (var <= 0) \
- { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \
- exit (1); \
- }
-
-#define ARG_NON_NEGATIVE(var,name) \
- var = strtol(argv[i]+2,&eptr,10); \
- if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
- exit (1); \
- } \
- if (var < 0) \
- { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \
- exit (1); \
- }
-
-#define ARG_REAL(var) \
- var = strtod(argv[i]+2,&eptr); \
- if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \
- exit (1); \
- }
-
-/*******************************************************************************************
- *
- * UTILITIES
- *
- ********************************************************************************************/
-
-// The following general utilities return NULL if any of their input pointers are NULL, or if they
-// could not perform their function (in which case they also print an error to stderr).
-
-void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc
-void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to
-char *Strdup(char *string, char *mesg); // stderr if out of memory
-
-FILE *Fopen(char *path, char *mode); // Open file path for "mode"
-char *PathTo(char *path); // Return path portion of file name "path"
-char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path"
-
-// Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer
-// Numbered_Suffix returns concatenation of left.<num>.right in a *temporary* buffer
-
-char *Catenate(char *path, char *sep, char *root, char *suffix);
-char *Numbered_Suffix(char *left, int num, char *right);
-
-
-// DB-related utilities
-
-void Print_Number(int64 num, int width, FILE *out); // Print readable big integer
-int Number_Digits(int64 num); // Return # of digits in printed number
-
-#define COMPRESSED_LEN(len) (((len)+3) >> 2)
-
-void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form
-void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form
-void Print_Read(char *s, int width);
-
-void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt)
-void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT)
-void Number_Read(char *s); // Convert read from letters to numbers
-
-
-/*******************************************************************************************
- *
- * DB IN-CORE DATA STRUCTURES
- *
- ********************************************************************************************/
-
-#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
-#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
-
-typedef struct
- { int origin; // Well #
- int rlen; // Length of the sequence (Last pulse = fpulse + rlen)
- int fpulse; // First pulse
- int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
- // uncompressed bases in memory block
- int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
- int flags; // QV of read + flags above
- } HITS_READ;
-
-// A track can be of 3 types:
-// data == NULL: there are nreads+1 'anno' records of size 'size'.
-// data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1])
-// contains the variable length data
-// data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1])
-// contains the variable length data
-
-typedef struct _track
- { struct _track *next; // Link to next track
- char *name; // Symbolic name of track
- int size; // Size in bytes of anno records
- void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records
- void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL
- } HITS_TRACK;
-
-
-// The DB record holds all information about the current state of an active DB including an
-// array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which
-// is always a HITS_QV pseudo-track (if the QVs have been loaded).
-
-typedef struct
- { int oreads; // Total number of reads in DB
- int breads; // Total number of reads in trimmed DB (if trimmed set)
- int cutoff; // Minimum read length in block (-1 if not yet set)
- int all; // Consider multiple reads from a given well
- float freq[4]; // frequency of A, C, G, T, respectively
-
- // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)
-
- int maxlen; // length of maximum read (initially over all DB)
- int64 totlen; // total # of bases (initially over all DB)
-
- int nreads; // # of reads in actively loaded portion of DB
- int trimmed; // DB has been trimmed by cutoff/all
- int part; // DB block (if > 0), total DB (if == 0)
- int ofirst; // Index of first read in block (without trimming)
- int bfirst; // Index of first read in block (with trimming)
-
- char *path; // Root name of DB for .bps and tracks
- int loaded; // Are reads loaded in memory?
- void *bases; // file pointer for bases file (to fetch reads from),
- // or memory pointer to uncompressed block of all sequences.
- HITS_READ *reads; // Array [0..nreads] of HITS_READ
- HITS_TRACK *tracks; // Linked list of loaded tracks
- } HITS_DB;
-
-
-/*******************************************************************************************
- *
- * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock
- *
- ********************************************************************************************/
-
-#define MAX_NAME 10000 // Longest file name or fasta header line
-
-#define DB_NFILE "files = %9d\n" // number of files
-#define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name
-#define DB_NBLOCK "blocks = %9d\n" // number of blocks
-#define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well
-#define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed)
-
-
-/*******************************************************************************************
- *
- * DB ROUTINES
- *
- ********************************************************************************************/
-
- // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps,
- // .DB.qvs, and files .DB.<track>.anno and DB.<track>.data where <track> is a track name
- // (not containing a . !).
-
- // A DAM is basically a DB except that:
- // 1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read
- // in the file .<dam>.hdr file
- // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences
- // contain N-separated contigs), and .fpulse the first base of the contig in the
- // fasta entry
-
- // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
- // a part # in it then just the part is opened. The index array is allocated (for all or
- // just the part) and read in.
- // Return status of routine:
- // -1: The DB could not be opened for a reason reported by the routine to stderr
- // 0: Open of DB proceeded without mishap
- // 1: Open of DAM proceeded without mishap
-
-int Open_DB(char *path, HITS_DB *db);
-
- // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
- // of the current DB partition. Reallocate smaller memory blocks for the information kept
- // for the retained reads.
-
-void Trim_DB(HITS_DB *db);
-
- // Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
- // and any open file pointers. The record pointed at by db however remains (the user
- // supplied it and so should free it).
-
-void Close_DB(HITS_DB *db);
-
- // Look up the file and header in the file of the indicated track. Return:
- // 1: Track is for trimmed DB
- // 0: Track is for untrimmed DB
- // -1: Track is not the right size of DB either trimmed or untrimmed
- // -2: Could not find the track
-
-int Check_Track(HITS_DB *db, char *track);
-
- // If track is not already in the db's track list, then allocate all the storage for it,
- // read it in from the appropriate file, add it to the track list, and return a pointer
- // to the newly created HITS_TRACK record. If the track does not exist or cannot be
- // opened for some reason, then NULL is returned.
-
-HITS_TRACK *Load_Track(HITS_DB *db, char *track);
-
- // If track is on the db's track list, then it is removed and all storage associated with it
- // is freed.
-
-void Close_Track(HITS_DB *db, char *track);
-
- // Allocate and return a buffer big enough for the largest read in 'db'.
- // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte
- // are needed by the alignment algorithms.
-
-char *New_Read_Buffer(HITS_DB *db);
-
- // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an
- // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
- // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimiter
- // for traversals in either direction.
-
-void Load_Read(HITS_DB *db, int i, char *read, int ascii);
-
- // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the
- // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
- // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
- // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to
- // the string holding the substring so it has a delimiter for traversals in either direction.
-
-char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);
-
- // Allocate a block big enough for all the uncompressed sequences, read them into it,
- // reset the 'off' in each read record to be its in-memory offset, and set the
- // bases pointer to point at the block after closing the bases file. If ascii is
- // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and
- // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
-
-void Read_All_Sequences(HITS_DB *db, int ascii);
-
- // For the DB or DAM "path" = "prefix/root[.db|.dam]", find all the files for that DB, i.e. all
- // those of the form "prefix/[.]root.part" and call foreach with the complete path to each file
- // pointed at by path, and the suffix of the path by extension. The . precedes the root
- // name if the defined constant HIDE_FILES is set. Always the first call is with the
- // path "prefix/root.db" and extension "db". There will always be calls for
- // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and
- // so this routine gives one a way to know all the tracks associated with a given DB.
- // Return non-zero iff path could not be opened for any reason.
-
-int List_DB_Files(char *path, void foreach(char *path, char *extension));
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // _HITS_DB
diff --git a/src/cpp/DazAlnProvider.cpp b/src/cpp/DazAlnProvider.cpp
index 0300734..c32e9a6 100644
--- a/src/cpp/DazAlnProvider.cpp
+++ b/src/cpp/DazAlnProvider.cpp
@@ -1,38 +1,3 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
#include <stdlib.h>
#include <cstring>
#include <iostream>
@@ -53,7 +18,7 @@ IOException::IOException(const char* desc) : desc_(desc) {}
static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' };
static int BORDER = 10;
-// Should write my own, but for reasons of expediency, this borrows heavily
+// Should write my own, but for reasons of expediency, this borrows heavily
// from LAshow.c
DazAlnProvider::DazAlnProvider(const ProgramOpts& popts) :
popts_(popts),
@@ -89,10 +54,10 @@ DazAlnProvider::DazAlnProvider(const ProgramOpts& popts) :
throw IOException("Failed to read tspace");
int small;
- if (tspace <= TRACE_XOVR) {
+ if (tspace <= TRACE_XOVR) {
small = 1;
tbytes_ = sizeof(uint8);
- } else {
+ } else {
small = 0;
tbytes_ = sizeof(uint16);
}
@@ -130,7 +95,7 @@ bool DazAlnProvider::nextTarget(std::vector<dagcon::Alignment> &dest) {
trg_->getAlignments(dest, popts_.maxHits, popts_.sortCov);
if (dest.size() < popts_.minCov) {
dest.clear();
- skipTarget = true;
+ skipTarget = true;
}
} else {
skipTarget = true;
@@ -147,12 +112,12 @@ bool DazAlnProvider::nextTarget(std::vector<dagcon::Alignment> &dest) {
}
trg_->addRecord(rec, popts_.properOvls);
}
-
+
return covl_ != novl_;
}
bool DazAlnProvider::nextTarget(std::string& targSeq, std::vector<dagcon::Alignment>& dest) {
-
+
bool hasNext = nextTarget(dest);
targSeq.resize(trg_->length);
@@ -162,19 +127,19 @@ bool DazAlnProvider::nextTarget(std::string& targSeq, std::vector<dagcon::Alignm
int i;
for (i = 0; i < trg_->length; i++)
targSeq[i] = ToU[(int)seq[i]];
-
+
return hasNext;
}
void DazAlnProvider::nextRecord(Record& rec) {
Read_Overlap(input_,&rec.ovl);
- int tmax = ((int)1.2*rec.ovl.path.tlen) + 100;
+ int tmax = ((int)1.2*rec.ovl.path.tlen) + 100;
rec.trace.resize(tmax,0);
rec.ovl.path.trace = (void *) &rec.trace.front();
Read_Trace(input_, &rec.ovl, tbytes_);
}
-TargetHit::TargetHit() :
+TargetHit::TargetHit() :
ovlScore(0.0f),
covScore(0.0f),
aread(-1),
@@ -230,15 +195,15 @@ void TargetHit::computeOvlScore(bool proper) {
Path p = rec.ovl.path;
ahlen += p.aepos - p.abpos;
bhlen += p.bepos - p.bbpos;
- diff += std::abs(ahlen - bhlen) + p.diffs;
+ diff += std::abs(ahlen - bhlen) + p.diffs;
}
ovlScore = (1 - diff/(float)ahlen) * ahlen;
if (proper) {
const Path& f = records.front().ovl.path;
- const Path& b = records.back().ovl.path;
- if (f.abpos != 0 && b.bbpos != 0)
+ const Path& b = records.back().ovl.path;
+ if (f.abpos != 0 && b.bbpos != 0)
ovlScore = 0.0f;
if (f.aepos != alen && b.bepos != blen)
ovlScore = 0.0f;
@@ -253,16 +218,16 @@ int TargetHit::aend() {
return records.back().ovl.path.aepos;
}
-// Simplify unit testing, don't burden with malloc'd
+// Simplify unit testing, don't burden with malloc'd
// daligner structures.
Target::Target(): needsFree_(false) { }
-Target::Target(HITS_DB& db, int tspace, int small) :
- db_(db),
- tspace_(tspace),
+Target::Target(HITS_DB& db, int tspace, int small) :
+ db_(db),
+ tspace_(tspace),
small_(small),
needsFree_(true) {
-
+
work_ = New_Work_Data();
abuffer_ = New_Read_Buffer(&db_);
bbuffer_ = New_Read_Buffer(&db_);
@@ -285,7 +250,7 @@ void Target::firstRecord(Record& rec, bool proper) {
if (coverage_.size() < (unsigned int) length)
coverage_.resize(length);
-
+
auto beg = coverage_.begin();
std::for_each(beg, beg+length, [](unsigned int& x){x=0;});
@@ -360,12 +325,12 @@ void Target::getAlignments(std::vector<dagcon::Alignment> &alns, unsigned int ma
if (amin < 0) amin = 0;
amax = ovl.path.aepos + BORDER;
if (amax > aln.alen) amax = aln.alen;
- if (COMP(aln.flags)) {
+ if (COMP(aln.flags)) {
bmin = (aln.blen-ovl.path.bepos) - BORDER;
if (bmin < 0) bmin = 0;
bmax = (aln.blen-ovl.path.bbpos) + BORDER;
if (bmax > aln.blen) bmax = aln.blen;
- } else {
+ } else {
bmin = ovl.path.bbpos - BORDER;
if (bmin < 0) bmin = 0;
bmax = ovl.path.bepos + BORDER;
@@ -377,13 +342,13 @@ void Target::getAlignments(std::vector<dagcon::Alignment> &alns, unsigned int ma
bseq = Load_Subread(&db_, ovl.bread, bmin, bmax, bbuffer_, 0);
aln.aseq = aseq - amin;
- if (COMP(aln.flags)) {
+ if (COMP(aln.flags)) {
Complement_Seq(bseq,bmax-bmin);
aln.bseq = bseq - (aln.blen - bmax);
} else
aln.bseq = bseq - bmin;
- Compute_Trace_PTS(&aln, work_, tspace_);
+ Compute_Trace_PTS(&aln, work_, tspace_, GREEDIEST);
// initialize the dagcon alignment class
dagcon::Alignment dest;
diff --git a/src/cpp/DazAlnProvider.hpp b/src/cpp/DazAlnProvider.hpp
index a059c51..d24dbdf 100644
--- a/src/cpp/DazAlnProvider.hpp
+++ b/src/cpp/DazAlnProvider.hpp
@@ -1,42 +1,4 @@
-
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
-#ifndef __GCON_DAZ_ALN_PROVIDER__
-#define __GCON_DAZ_ALN_PROVIDER__
+#pragma once
#include <stdio.h>
#include <string>
@@ -48,8 +10,10 @@
#include "AlnProvider.hpp"
// Dazzler headers
+extern "C" {
#include "DB.h"
#include "align.h"
+}
// Represents one record from the LAS file, essentially a thin container for
// a dazzler overlap so we can manage things on the stack.
@@ -60,8 +24,8 @@ struct Record {
Record() {}
~Record() {}
- Record(Record &&o) noexcept :
- ovl(std::move(o.ovl)),
+ Record(Record &&o) noexcept :
+ ovl(std::move(o.ovl)),
trace(std::move(o.trace)) {
o.ovl.path.trace = NULL;
}
@@ -87,7 +51,7 @@ struct Record {
};
// Holds information for all the a,b overlaps in a particular direction,
-// either forward or reverse. Overlaps for a particular a,b,strand
+// either forward or reverse. Overlaps for a particular a,b,strand
// combination may come in as multiple overlaps. This class allows us to
// handle them as a unit.
class TargetHit {
@@ -137,7 +101,7 @@ public:
// a 'proper' overlap (more stringent).
void firstRecord(Record& rec, bool proper=false);
- // Adds the next overlap record to this target, possibly scoring as
+ // Adds the next overlap record to this target, possibly scoring as
// a 'proper' overlap (more stringent).
void addRecord(Record& rec, bool proper=false);
@@ -150,7 +114,7 @@ public:
int id;
// Length of the target
- int length;
+ int length;
std::vector<TargetHit> hits;
@@ -165,7 +129,7 @@ private:
///
/// Provides sets of alignments for a given target sequence from a daligner
-/// output file.
+/// output file.
///
class DazAlnProvider : public AlnProvider {
public:
@@ -193,7 +157,7 @@ private:
Target* trg_;
const ProgramOpts popts_;
- // Dazzler-related data
+ // Dazzler-related data
HITS_DB db_;
int64 novl_, covl_;
int tbytes_;
@@ -207,10 +171,10 @@ private:
/// Compares the hits based on (percent id) x (query alignment length)
-bool cmpHitOvlScore(const TargetHit& l, const TargetHit& r);
+bool cmpHitOvlScore(const TargetHit& l, const TargetHit& r);
/// Compares based on coverage score
-bool cmpHitCovScore(const TargetHit& l, const TargetHit& r);
+bool cmpHitCovScore(const TargetHit& l, const TargetHit& r);
float invertedSum(float x, unsigned int y);
@@ -218,5 +182,3 @@ float invertedSum(float x, unsigned int y);
/// should update the alignment graph to process the dazzler alignment
/// directly, but this will be useful for debugging purposes.
void decodeAlignment(Alignment* src, dagcon::Alignment& dest);
-
-#endif //__GCON_DAZ_ALN_PROVIDER__
diff --git a/src/cpp/Makefile b/src/cpp/Makefile
deleted file mode 100644
index f1e7099..0000000
--- a/src/cpp/Makefile
+++ /dev/null
@@ -1,38 +0,0 @@
-include pbi.mk
-include boost.mk
-
-COMMON_OBJECTS := Alignment.o AlnGraphBoost.o
-PBDAGCON_OBJECTS := BlasrM5AlnProvider.o main.o SimpleAligner.o
-DAZCON_OBJECTS := DB.o align.o DazAlnProvider.o dazcon.o
-
-CXXFLAGS = -O3 -std=c++11 -Wall -Wuninitialized -pedantic -I third-party \
- -I $(BOOST_HEADERS)
-
-CFLAGS = -O3 -Wall -Wextra -fno-strict-aliasing
-
-INCDIRS := -I$(PBDATA) -I$(BLASR) $(EXTRA_INCDIRS)
-LDFLAGS := -L$(PBDATA) -L$(BLASR) $(EXTRA_LDFLAGS)
-
-all: pbdagcon
-
-dazcon: LDLIBS = -lpthread
-dazcon: $(COMMON_OBJECTS) $(DAZCON_OBJECTS)
- $(CXX) -Wl,--no-as-needed -o $@ $^ $(LDLIBS)
-
-pbdagcon: LDLIBS = -lpbdata -lblasr -lpthread $(EXTRA_LDLIBS)
-pbdagcon: CXXFLAGS += $(INCDIRS)
-pbdagcon: $(COMMON_OBJECTS) $(PBDAGCON_OBJECTS)
- $(CXX) -Wl,--no-as-needed $(LIBDIRS) -o $@ $^ $(LDFLAGS) $(LDLIBS)
-
-$(COMMON_OBJECTS): $(BOOST_HEADERS)
-
-$(BOOST_HEADERS):
- cd third-party && $(GET_BOOST)
-
-clean:
- $(RM) *.d
- $(RM) *.o
- $(RM) pbdagcon
- $(RM) dazcon
-
-.PHONY: all clean
diff --git a/src/cpp/ProgramOpts.hpp b/src/cpp/ProgramOpts.hpp
index 45242ad..e7110dd 100644
--- a/src/cpp/ProgramOpts.hpp
+++ b/src/cpp/ProgramOpts.hpp
@@ -1,5 +1,4 @@
-#ifndef __GCON_PROGRAMOPTS__
-#define __GCON_PROGRAMOPTS__
+#pragma once
#include <string>
#include <set>
@@ -35,5 +34,3 @@ struct ProgramOpts {
/// Limit correction to these targets
std::set<int> targets;
};
-
-#endif // __GCON_PROGRAMOPTS__
diff --git a/src/cpp/SimpleAligner.cpp b/src/cpp/SimpleAligner.cpp
index 6086187..bb8d16c 100644
--- a/src/cpp/SimpleAligner.cpp
+++ b/src/cpp/SimpleAligner.cpp
@@ -1,39 +1,3 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
#include <vector>
#include <stdint.h>
#include <cstring>
@@ -72,7 +36,7 @@ void SimpleAligner::align(dagcon::Alignment& aln) {
config_.sdpIndel, config_.sdpIndel, config_.indelRate*2,
initialAln, Local);
- GuidedAlign(query, target, initialAln, distScoreFn_,
+ GuidedAlign(query, target, initialAln, distScoreFn_,
config_.bandSize, refinedAln);
std::string queryStr, alignStr, targetStr;
@@ -80,7 +44,7 @@ void SimpleAligner::align(dagcon::Alignment& aln) {
//StickPrintAlignment(initialAln, query, target, std::cout);
//StickPrintAlignment(refinedAln, query, target, std::cout);
- CreateAlignmentStrings(refinedAln, query.seq, target.seq,
+ CreateAlignmentStrings(refinedAln, query.seq, target.seq,
targetStr, alignStr, queryStr, query.length, target.length);
// alignment coordinates may change, update alignment object
diff --git a/src/cpp/SimpleAligner.hpp b/src/cpp/SimpleAligner.hpp
index 209b0dc..08aa1ac 100644
--- a/src/cpp/SimpleAligner.hpp
+++ b/src/cpp/SimpleAligner.hpp
@@ -1,41 +1,5 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
+#pragma once
-
-#ifndef __GCON_SIMPLE_ALIGNER__
-#define __GCON_SIMPLE_ALIGNER__
#include "Types.h"
#include "Enumerations.h"
#include "DNASequence.hpp"
@@ -62,12 +26,10 @@ struct Config {
class SimpleAligner {
public:
SimpleAligner();
- void align(dagcon::Alignment& aln);
+ void align(dagcon::Alignment& aln);
void operator() (dagcon::Alignment& aln);
private:
Aligner::Config config_;
TupleMetrics tupleMetrics_;
DistanceMatrixScoreFunction<DNASequence, FASTQSequence> distScoreFn_;
};
-
-#endif // __GCON_SIMPLE_ALIGNER__
diff --git a/src/cpp/align.c b/src/cpp/align.c
deleted file mode 100644
index b60c6c0..0000000
--- a/src/cpp/align.c
+++ /dev/null
@@ -1,3805 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/*******************************************************************************************
- *
- * Fast alignment discovery and trace generation along with utilites for displaying alignments
- * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic
- * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper.
- * A recent cool idea is to not record all the details of an alignment while discovering it
- * but simply record trace points through which the optimal alignment passes every 100bp,
- * allowing rapid recomputation of the alignment details between trace points.
- *
- * Author : Gene Myers
- * First : June 2013
- * Current: June 1, 2014
- *
- ********************************************************************************************/
-
-// align1: Derived from the original BOA aligner
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <unistd.h>
-#include <math.h>
-#include <limits.h>
-
-#include "DB.h"
-#include "align.h"
-
-#define DELTAS
-
-#undef DEBUG_PASSES // Show forward / backward extension termini for Local_Alignment
-#undef DEBUG_POINTS // Show trace points
-#undef DEBUG_WAVE // Show waves of Local_Alignment
-#undef SHOW_MATCH_WAVE // For waves of Local_Alignment also show # of matches
-#undef SHOW_TRAIL // Show trace at the end of forward and reverse passes
-#undef SHOW_TPS // Show trace points as they are encountered in a wave
-
-#undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap
-
-#undef DEBUG_ALIGN // Show division points of Compute_Trace
-#undef DEBUG_SCRIPT // Show trace additions for Compute_Trace
-#undef DEBUG_AWAVE // Show F/R waves of Compute_Trace
-#define SMALL_BIT 100
-
-#undef SHOW_TRACE // Show full trace for Print_Alignment
-
-#undef WAVE_STATS
-
-
-/****************************************************************************************\
-* *
-* Working Storage Abstraction *
-* *
-\****************************************************************************************/
-
-typedef struct // Hidden from the user, working space for each thread
- { int vecmax;
- void *vector;
- int celmax;
- void *cells;
- int pntmax;
- void *points;
- int tramax;
- void *trace;
- } _Work_Data;
-
-Work_Data *New_Work_Data()
-{ _Work_Data *work;
-
- work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block");
- if (work == NULL)
- exit (1);
- work->vecmax = 0;
- work->vector = NULL;
- work->pntmax = 0;
- work->points = NULL;
- work->tramax = 0;
- work->trace = NULL;
- work->celmax = 0;
- work->cells = NULL;
- return ((Work_Data *) work);
-}
-
-static void enlarge_vector(_Work_Data *work, int newmax)
-{ work->vecmax = ((int) (newmax*1.2)) + 10000;
- work->vector = Realloc(work->vector,work->vecmax,"Enlarging DP vector");
- if (work->vector == NULL)
- exit (1);
-}
-
-static void enlarge_points(_Work_Data *work, int newmax)
-{ work->pntmax = ((int) (newmax*1.2)) + 10000;
- work->points = Realloc(work->points,work->pntmax,"Enlarging point vector");
- if (work->points == NULL)
- exit (1);
-}
-
-static void enlarge_trace(_Work_Data *work, int newmax)
-{ work->tramax = ((int) (newmax*1.2)) + 10000;
- work->trace = Realloc(work->trace,work->tramax,"Enlarging trace vector");
- if (work->trace == NULL)
- exit (1);
-}
-
-void Free_Work_Data(Work_Data *ework)
-{ _Work_Data *work = (_Work_Data *) ework;
- if (work->vector != NULL)
- free(work->vector);
- if (work->cells != NULL)
- free(work->cells);
- if (work->trace != NULL)
- free(work->trace);
- if (work->points != NULL)
- free(work->points);
- free(work);
-}
-
-
-/****************************************************************************************\
-* *
-* ADAPTIVE PATH FINDING *
-* *
-\****************************************************************************************/
-
- // Absolute/Fixed Parameters
-
-#define BVEC uint64 // Can be uint32 if PATH_LEN <= 32
-
-#define TRIM_LEN 15 // Report as the tip, the last wave maximum for which the last
- // 2*TRIM_LEN edits are prefix-positive at rate ave_corr*f(bias)
- // (max value is 20)
-
-#define PATH_LEN 60 // Follow the last PATH_LEN columns/edges (max value is 63)
-
- // Derivative fixed parameters
-
-#define PATH_TOP 0x1000000000000000ll // Must be 1 << PATH_LEN
-#define PATH_INT 0x0fffffffffffffffll // Must be PATH_TOP-1
-#define TRIM_MASK 0x7fff // Must be (1 << TRIM_LEN) - 1
-#define TRIM_MLAG 200 // How far can last trim point be behind best point
-#define WAVE_LAG 30 // How far can worst point be behind the best point
-
-static double Bias_Factor[10] = { .690, .690, .690, .690, .780,
- .850, .900, .933, .966, 1.000 };
-
- // Micro-Sat Band Parameters
-
-#define MICRO_SAT 20
-
-static int Sat_Width[MICRO_SAT+1] =
- { -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
-
-#define SAT_LOW .75
-#define SAT_HGH 1.25
-
- // Adjustable paramters
-
-typedef struct
- { double ave_corr;
- int trace_space;
- float freq[4];
- int ave_path;
- int16 *score;
- int16 *table;
- } _Align_Spec;
-
-/* Fill in bit table: TABLE[x] = 1 iff the alignment modeled by x (1 = match, 0 = mismatch)
- has a non-negative score for every suffix of the alignment under the scoring scheme
- where match = MATCH and mismatch = -1. MATCH is set so that an alignment with TRIM_PCT
- matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ). */
-
-#define FRACTION 1000 // Implicit fractional part of scores, i.e. score = x/FRACTION
-
-typedef struct
- { int mscore;
- int dscore;
- int16 *table;
- int16 *score;
- } Table_Bits;
-
-static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms)
-{ if (bit >= TRIM_LEN)
- { parms->table[prefix] = (int16) (score-max);
- parms->score[prefix] = (int16) score;
- }
- else
- { if (score > max)
- max = score;
- set_table(bit+1,(prefix<<1),score - parms->dscore,max,parms);
- set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,max,parms);
- }
-}
-
-/* Create an alignment specification record including path tip tables & values */
-
-Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq)
-{ _Align_Spec *spec;
- Table_Bits parms;
- double match;
- int bias;
-
- spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification");
- if (spec == NULL)
- exit (1);
-
- spec->ave_corr = ave_corr;
- spec->trace_space = trace_space;
- spec->freq[0] = freq[0];
- spec->freq[1] = freq[1];
- spec->freq[2] = freq[2];
- spec->freq[3] = freq[3];
-
- match = freq[0] + freq[3];
- if (match > .5)
- match = 1.-match;
- bias = (int) ((match+.025)*20.-1.);
- if (match < .2)
- { fprintf(stderr,"Warning: Base bias worse than 80/20%% !\n");
- bias = 3;
- }
-
- spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr)));
- parms.mscore = (int) (FRACTION * Bias_Factor[bias] * (1. - ave_corr));
- parms.dscore = FRACTION - parms.mscore;
-
- parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table");
- if (parms.score == NULL)
- exit (1);
- parms.table = parms.score + (TRIM_MASK+1);
-
- set_table(0,0,0,0,&parms);
-
- spec->table = parms.table;
- spec->score = parms.score;
-
- return ((Align_Spec *) spec);
-}
-
-void Free_Align_Spec(Align_Spec *espec)
-{ _Align_Spec *spec = (_Align_Spec *) espec;
- free(spec->score);
- free(spec);
-}
-
-double Average_Correlation(Align_Spec *espec)
-{ return (((_Align_Spec *) espec)->ave_corr); }
-
-int Trace_Spacing(Align_Spec *espec)
-{ return (((_Align_Spec *) espec)->trace_space); }
-
-float *Base_Frequencies(Align_Spec *espec)
-{ return (((_Align_Spec *) espec)->freq); }
-
-
-/****************************************************************************************\
-* *
-* LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment *
-* *
-\****************************************************************************************/
-
-
-#ifdef WAVE_STATS
-
-static int64 MAX, TOT, NWV;
-static int64 RESTARTS;
-
-void Init_Stats()
-{ MAX = TOT = NWV = 0;
- RESTARTS = 0;
-}
-
-void Print_Stats()
-{ printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,(1.*TOT)/NWV,NWV);
- printf("\nRestarts = %lld\n",RESTARTS);
-}
-
-#endif
-
-
-#ifdef DEBUG_WAVE
-
-static void print_wave(int *V, int *M, int low, int hgh, int besta)
-{ int k, bestk;
-
- (void) M;
- printf(" [%6d,%6d]: ",low,hgh);
- for (k = low; k <= hgh; k++)
- { if (besta == V[k])
- bestk = k;
- // printf(" %3d",(V[k]+k)/2);
- printf(" %3d",besta-V[k]);
- }
- printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2);
-#ifdef SHOW_MATCH_WAVE
- printf(" ");
- for (k = low; k <= hgh; k++)
- printf(" %3d",M[k]);
- printf("\n");
-#endif
- fflush(stdout);
-}
-
-#endif
-
-/* At each furthest reaching point, keep a-coordinate of point (V), bitvector
- recording the last TRIM_LEN columns of the implied alignment (T), and the
- # of matches (1-bits) in the bitvector (M). */
-
-typedef struct
- { int ptr;
- int diag;
-#ifdef DELTAS
- int diff;
-#endif
- int mark;
- } Pebble;
-
-// This (and reverse) take the bulk of the processing time in daligner
-// Isn't easily vectorizable given all the 'while/break' looping
-static int forward_wave(_Work_Data *work, _Align_Spec *spec,
- Alignment *align, Path *bpath,
- int mind, int maxd, int mida)
-{ char *aseq = align->aseq;
- char *bseq = align->bseq;
- Path *apath = align->path;
-
- int hgh, low, dif;
- int minp, maxp;
- int *V, *M;
- BVEC *T;
-
- int *HA, *HB;
- int *NA, *NB;
- Pebble *cells;
- int avail, cmax, boff;
-
- int TRACE_SPACE = spec->trace_space;
- int PATH_AVE = spec->ave_path;
- int16 *SCORE = spec->score;
- int16 *TABLE = spec->table;
-
- int besta, besty;
- int trima, trimy, trimd;
- int trimha, trimhb;
- int morea, morey, mored;
- int moreha, morehb;
- int more, morem, lasta;
- int aclip, bclip;
-
- { int alen = align->alen + 1;
- int blen = align->blen + 1;
- int tlen = alen + blen + 1;
-
- V = ((int *) work->vector) + blen;
- M = V + tlen;
- HA = M + tlen;
- HB = HA + tlen;
- NA = HB + tlen;
- NB = NA + tlen;
- T = ((BVEC *) (NB + alen)) + blen;
-
- cells = (Pebble *) (work->cells);
- cmax = work->celmax;
- avail = 0;
-
- if (COMP(align->flags))
- boff = align->blen % TRACE_SPACE;
- else
- boff = 0;
- }
-
- /* Compute 0-wave starting from mid-line */
-
- hgh = maxd;
- low = mind;
- if (aseq == bseq)
- { if (low < 0)
- { int big = -low;
- int sml = -hgh;
-
- if (big <= MICRO_SAT)
- minp = low - Sat_Width[big];
- else
- minp = -SAT_HGH*big;
- if (sml <= MICRO_SAT)
- maxp = hgh + Sat_Width[sml];
- else
- maxp = -SAT_LOW*sml;
- }
- else
- { if (low <= MICRO_SAT)
- minp = low - Sat_Width[low];
- else
- minp = SAT_LOW*low;
- if (hgh <= MICRO_SAT)
- maxp = hgh + Sat_Width[hgh];
- else
- maxp = SAT_HGH*hgh;
- }
- }
- else
- { minp = -INT32_MAX;
- maxp = INT32_MAX;
- }
- dif = 0;
-
- more = 1;
- aclip = INT32_MAX;
- bclip = -INT32_MAX;
-
- besta = trima = morea = lasta = mida;
- besty = trimy = morey = (mida-hgh) >> 1;
- trimd = mored = 0;
- trimha = moreha = 0;
- trimhb = morehb = 1;
- morem = -1;
-
- { int k;
- char *a;
-
- a = aseq + hgh;
- for (k = hgh; k >= low; k--)
- { int y, c, d;
- int ha, hb;
- int na, nb;
- Pebble *pb;
-
- y = (mida-k) >> 1;
-
- if (avail >= cmax-1)
- { cmax = ((int) (avail*1.2)) + 10000;
- cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
- if (cells == NULL)
- exit (1);
- }
-
- na = ((y+k)/TRACE_SPACE)*TRACE_SPACE;
-#ifdef SHOW_TPS
- printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout);
-#endif
- pb = cells+avail;
- pb->ptr = -1;
- pb->diag = k;
-#ifdef DELTAS
- pb->diff = 0;
-#endif
- pb->mark = na;
- ha = avail++;
- na += TRACE_SPACE;
-
- nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff;
-#ifdef SHOW_TPS
- printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout);
-#endif
- pb = cells+avail;
- pb->ptr = -1;
- pb->diag = k;
-#ifdef DELTAS
- pb->diff = 0;
-#endif
- pb->mark = nb;
- hb = avail++;
- nb += TRACE_SPACE;
-
- while (1)
- { c = bseq[y];
- if (c == 4)
- { more = 0;
- if (bclip < k)
- bclip = k;
- break;
- }
- d = a[y];
- if (c != d)
- { if (d == 4)
- { more = 0;
- aclip = k;
- }
- break;
- }
- y += 1;
- }
- c = (y << 1) + k;
-
- while (y+k >= na)
- { if (avail >= cmax)
- { cmax = ((int) (avail*1.2)) + 10000;
- cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
- if (cells == NULL)
- exit (1);
- }
-#ifdef SHOW_TPS
- printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
-#endif
- pb = cells+avail;
- pb->ptr = ha;
- pb->diag = k;
-#ifdef DELTAS
- pb->diff = 0;
-#endif
- pb->mark = na;
- ha = avail++;
- na += TRACE_SPACE;
- }
- while (y >= nb)
- { if (avail >= cmax)
- { cmax = ((int) (avail*1.2)) + 10000;
- cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
- if (cells == NULL)
- exit (1);
- }
-#ifdef SHOW_TPS
- printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout);
-#endif
- pb = cells+avail;
- pb->ptr = hb;
- pb->diag = k;
-#ifdef DELTAS
- pb->diff = 0;
-#endif
- pb->mark = nb;
- hb = avail++;
- nb += TRACE_SPACE;
- }
-
- if (c > besta)
- { besta = trima = lasta = c;
- besty = trimy = y;
- trimha = ha;
- trimhb = hb;
- }
-
- V[k] = c;
- T[k] = PATH_INT;
- M[k] = PATH_LEN;
- HA[k] = ha;
- HB[k] = hb;
- NA[k] = na;
- NB[k] = nb;
-
- a -= 1;
- }
- }
-
- if (more == 0)
- { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
- more = 1;
- if (hgh >= aclip)
- { hgh = aclip-1;
- if (morem <= M[aclip])
- { morem = M[aclip];
- morea = V[aclip];
- morey = (morea - aclip)/2;
- moreha = HA[aclip];
- morehb = HB[aclip];
- }
- }
- if (low <= bclip)
- { low = bclip+1;
- if (morem <= M[bclip])
- { morem = M[bclip];
- morea = V[bclip];
- morey = (morea - bclip)/2;
- moreha = HA[bclip];
- morehb = HB[bclip];
- }
- }
- aclip = INT32_MAX;
- bclip = -INT32_MAX;
- }
-
-#ifdef DEBUG_WAVE
- printf("\nFORWARD WAVE:\n");
- print_wave(V,M,low,hgh,besta);
-#endif
-
- /* Compute successive waves until no furthest reaching points remain */
-
- while (more && lasta >= besta - TRIM_MLAG)
- { int k, n;
- int ua, ub;
- BVEC t;
- int am, ac, ap;
- char *a;
-
- if (low > minp)
- { low -= 1;
- NA[low] = NA[low+1];
- NB[low] = NB[low+1];
- V[low] = -1;
- }
- if (hgh < maxp)
- { hgh += 1;
- NA[hgh] = NA[hgh-1];
- NB[hgh] = NB[hgh-1];
- V[hgh] = am = -1;
- }
- else
- am = V[hgh];
- dif += 1;
-
- ac = V[hgh+1] = V[low-1] = -1;
- a = aseq + hgh;
- t = PATH_INT;
- n = PATH_LEN;
- ua = ub = -1;
- for (k = hgh; k >= low; k--)
- { int y, m;
- int ha, hb;
- int c, d;
- BVEC b;
- Pebble *pb;
-
- ap = ac;
- ac = am;
- am = V[d = k-1];
-
- if (ac < am)
- if (am < ap)
- { c = ap+1;
- m = n;
- b = t;
- ha = ua;
- hb = ub;
- }
- else
- { c = am+1;
- m = M[d];
- b = T[d];
- ha = HA[d];
- hb = HB[d];
- }
- else
- if (ac < ap)
- { c = ap+1;
- m = n;
- b = t;
- ha = ua;
- hb = ub;
- }
- else
- { c = ac+2;
- m = M[k];
- b = T[k];
- ha = HA[k];
- hb = HB[k];
- }
-
- if ((b & PATH_TOP) != 0)
- m -= 1;
- b <<= 1;
-
- y = (c-k) >> 1;
- while (1)
- { c = bseq[y];
- if (c == 4)
- { more = 0;
- if (bclip < k)
- bclip = k;
- break;
- }
- d = a[y];
- if (c != d)
- { if (d == 4)
- { more = 0;
- aclip = k;
- }
- break;
- }
- y += 1;
- if ((b & PATH_TOP) == 0)
- m += 1;
- b = (b << 1) | 1;
- }
- c = (y << 1) + k;
-
- while (y+k >= NA[k])
- { if (cells[ha].mark < NA[k])
- { if (avail >= cmax)
- { cmax = ((int) (avail*1.2)) + 10000;
- cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
- "Reallocating trace cells");
- if (cells == NULL)
- exit (1);
- }
-#ifdef SHOW_TPS
- printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
-#endif
- pb = cells+avail;
- pb->ptr = ha;
- pb->diag = k;
-#ifdef DELTAS
- pb->diff = dif;
-#endif
- pb->mark = NA[k];
- ha = avail++;
- }
- NA[k] += TRACE_SPACE;
- }
-
- while (y >= NB[k])
- { if (cells[hb].mark < NB[k])
- { if (avail >= cmax)
- { cmax = ((int) (avail*1.2)) + 10000;
- cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
- "Reallocating trace cells");
- if (cells == NULL)
- exit (1);
- }
-#ifdef SHOW_TPS
- printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout);
-#endif
- pb = cells+avail;
- pb->ptr = hb;
- pb->diag = k;
-#ifdef DELTAS
- pb->diff = dif;
-#endif
- pb->mark = NB[k];
- hb = avail++;
- }
- NB[k] += TRACE_SPACE;
- }
-
- if (c > besta)
- { besta = c;
- besty = y;
- if (m >= PATH_AVE)
- { lasta = c;
- if (TABLE[b & TRIM_MASK] >= 0)
- if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
- { trima = c;
- trimy = y;
- trimd = dif;
- trimha = ha;
- trimhb = hb;
- }
- }
- }
-
- t = T[k];
- n = M[k];
- ua = HA[k];
- ub = HB[k];
- V[k] = c;
- T[k] = b;
- M[k] = m;
- HA[k] = ha;
- HB[k] = hb;
-
- a -= 1;
- }
-
- if (more == 0)
- { if (bseq[besty] != 4 && aseq[besta-besty] != 4)
- more = 1;
- if (hgh >= aclip)
- { hgh = aclip-1;
- if (morem <= M[aclip])
- { morem = M[aclip];
- morea = V[aclip];
- morey = (morea - aclip)/2;
- mored = dif;
- moreha = HA[aclip];
- morehb = HB[aclip];
- }
- }
- if (low <= bclip)
- { low = bclip+1;
- if (morem <= M[bclip])
- { morem = M[bclip];
- morea = V[bclip];
- morey = (morea - bclip)/2;
- mored = dif;
- moreha = HA[bclip];
- morehb = HB[bclip];
- }
- }
- aclip = INT32_MAX;
- bclip = -INT32_MAX;
- }
-
- n = besta - WAVE_LAG;
- while (hgh >= low)
- if (V[hgh] < n)
- hgh -= 1;
- else
- { while (V[low] < n)
- low += 1;
- break;
- }
-
-#ifdef WAVE_STATS
- k = (hgh-low)+1;
- if (k > MAX)
- MAX = k;
- TOT += k;
- NWV += 1;
-#endif
-
-#ifdef DEBUG_WAVE
- print_wave(V,M,low,hgh,besta);
-#endif
- }
-
- { uint16 *atrace = (uint16 *) apath->trace;
- uint16 *btrace = (uint16 *) bpath->trace;
- int atlen, btlen;
- int trimx;
- int a, b, k, h;
-#ifdef DELTAS
- int d, e;
-#endif
-
- if (morem >= 0)
- { trimx = morea-morey;
- trimy = morey;
- trimd = mored;
- trimha = moreha;
- trimhb = morehb;
- }
- else
- trimx = trima-trimy;
-
- atlen = btlen = 0;
-
- a = -1;
- for (h = trimha; h >= 0; h = b)
- { b = cells[h].ptr;
- cells[h].ptr = a;
- a = h;
- }
- h = a;
-
-#ifdef DELTAS
-
- k = cells[h].diag;
- b = (mida-k)/2;
- e = 0;
-#ifdef SHOW_TRAIL
- printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout);
-#endif
- for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
- { k = cells[h].diag;
- a = cells[h].mark - k;
- d = cells[h].diff;
- atrace[atlen++] = (uint16) (d-e);
- atrace[atlen++] = (uint16) (a-b);
-#ifdef SHOW_TRAIL
- printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout);
-#endif
- b = a;
- e = d;
- }
- if (b+k != trimx)
- { atrace[atlen++] = (uint16) (trimd-e);
- atrace[atlen++] = (uint16) (trimy-b);
-#ifdef SHOW_TRAIL
- printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
-#endif
- }
- else if (b != trimy)
- { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b));
- atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e));
-#ifdef SHOW_TRAIL
- printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
-#endif
- }
-
-#else // DELTAS
-
- k = cells[h].diag;
- b = (mida-k)/2;
-#ifdef SHOW_TRAIL
- printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout);
-#endif
- for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
- { k = cells[h].diag;
- a = cells[h].mark - k;
- atrace[atlen++] = (uint16) (a-b);
-#ifdef SHOW_TRAIL
- printf(" %4d: (%5d,%5d): %3d\n",h,a+k,a,a-b); fflush(stdout);
-#endif
- b = a;
- }
- if (b+k != trimx)
- { atrace[atlen++] = (uint16) (trimy-b);
-#ifdef SHOW_TRAIL
- printf(" (%5d,%5d): %3d\n",trimx,trimy,trimy-b); fflush(stdout);
-#endif
- }
- else if (b != trimy)
- { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b));
-#ifdef SHOW_TRAIL
- printf(" @ (%5d,%5d): %3d\n",trimx,trimy,trimy-b); fflush(stdout);
-#endif
- }
-
-#endif // DELTAS
-
- a = -1;
- for (h = trimhb; h >= 0; h = b)
- { b = cells[h].ptr;
- cells[h].ptr = a;
- a = h;
- }
- h = a;
-
-#ifdef DELTAS
-
- k = cells[h].diag;
- b = (mida+k)/2;
- e = 0;
- low = k;
-#ifdef SHOW_TRAIL
- printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout);
-#endif
- for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
- { k = cells[h].diag;
- a = cells[h].mark + k;
- d = cells[h].diff;
- btrace[btlen++] = (uint16) (d-e);
- btrace[btlen++] = (uint16) (a-b);
-#ifdef SHOW_TRAIL
- printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout);
-#endif
- b = a;
- e = d;
- }
- if (b-k != trimy)
- { btrace[btlen++] = (uint16) (trimd-e);
- btrace[btlen++] = (uint16) (trimx-b);
-#ifdef SHOW_TRAIL
- printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
-#endif
- }
- else if (b != trimx)
- { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b));
- btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e));
-#ifdef SHOW_TRAIL
- printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
-#endif
- }
-
-#else // DELTAS
-
- k = cells[h].diag;
- b = (mida+k)/2;
- low = k;
-#ifdef SHOW_TRAIL
- printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout);
-#endif
- for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
- { k = cells[h].diag;
- a = cells[h].mark + k;
- btrace[btlen++] = (uint16) (a-b);
-#ifdef SHOW_TRAIL
- printf(" %4d: (%5d,%5d): %3d\n",h,a,a-k,a-b); fflush(stdout);
-#endif
- b = a;
- }
- if (b-k != trimy)
- { btrace[btlen++] = (uint16) (trimx-b);
-#ifdef SHOW_TRAIL
- printf(" (%5d,%5d): %3d\n",trimx,trimy,trimx-b); fflush(stdout);
-#endif
- }
- else if (b != trimx)
- { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b));
-#ifdef SHOW_TRAIL
- printf(" @ (%5d,%5d): %3d\n",trimx,trimy,trimx-b); fflush(stdout);
-#endif
- }
-
-#endif // DELTAS
-
- apath->aepos = trimx;
- apath->bepos = trimy;
- apath->diffs = trimd;
- apath->tlen = atlen;
- if (COMP(align->flags))
- { bpath->abpos = align->blen - apath->bepos;
- bpath->bbpos = align->alen - apath->aepos;
- }
- else
- { bpath->aepos = apath->bepos;
- bpath->bepos = apath->aepos;
- }
- bpath->diffs = trimd;
- bpath->tlen = btlen;
- }
-
- work->cells = (void *) cells;
- work->celmax = cmax;
-
- return (low);
-}
-
-/*** Reverse Wave ***/
-
-/* Extend an alignment in the direction of decreasing coordinates.
-
-   The search starts on the diagonals mind..maxd (a diagonal k is a-position minus
-   b-position) at anti-diagonal mida (a-position plus b-position).  For every diagonal k
-   of the current wave V[k] holds 2*y+k for the furthest point (y+k,y) reached so far,
-   so in this reverse pass a SMALLER value is a better one.  M[k] and T[k] carry the
-   match count and match bit-vector of the path suffix ending on k, HA[k]/HB[k] the index
-   of the most recent trace-point cell recorded for the A- and B-trace of that path, and
-   NA[k]/NB[k] the next trace-point coordinate the path has yet to cross.
-
-   work   supplies the wave vectors (work->vector) and the Pebble cell pool (work->cells,
-          work->celmax).  The pool may be reallocated; the final pointer and capacity are
-          stored back before returning.  The process exits if reallocation fails.
-   spec   supplies trace_space, ave_path and the score/table trimming tables.
-   align  supplies aseq, bseq, alen, blen, flags and the A-path (align->path).
-   bpath  is the B-path updated in parallel with the A-path.
-
-   On return the begin coordinates of both paths are set, the number of differences found
-   here is ADDED to path->diffs, and new trace entries are PREPENDED to each trace: they
-   are written at negative offsets from path->trace, after which trace and tlen are
-   adjusted.  The caller must therefore have left room in front of both trace arrays.
-
-   NOTE(review): the value 4 in a sequence stops extension on that diagonal; presumably
-   it is the end-of-sequence sentinel -- confirm against the sequence loader.           */
-
-static void reverse_wave(_Work_Data *work, _Align_Spec *spec,
-                         Alignment *align, Path *bpath, int mind, int maxd, int mida)
-{ char *aseq = align->aseq - 1;     /* biased by one: index y addresses the symbol before y */
-  char *bseq = align->bseq - 1;
-  Path *apath = align->path;
-
-  int hgh, low, dif;
-  int minp, maxp;
-  int *V, *M;
-  BVEC *T;
-
-  int *HA, *HB;
-  int *NA, *NB;
-  Pebble *cells;
-  int avail, cmax, boff;
-
-  int TRACE_SPACE = spec->trace_space;
-  int PATH_AVE = spec->ave_path;
-  int16 *SCORE = spec->score;
-  int16 *TABLE = spec->table;
-
-  int besta, besty;
-  int trima, trimy, trimd;
-  int trimha, trimhb;
-  int morea, morey, mored;
-  int moreha, morehb;
-  int more, morem, lasta;
-  int aclip, bclip;
-
-  /* Carve the seven per-diagonal arrays out of work->vector; each is offset by blen so
-     that negative diagonal numbers index validly.                                      */
-
-  { int alen = align->alen + 1;
-    int blen = align->blen + 1;
-    int tlen = alen + blen + 1;
-
-    V = ((int *) work->vector) + blen;
-    M = V + tlen;
-    HA = M + tlen;
-    HB = HA + tlen;
-    NA = HB + tlen;
-    NB = NA + tlen;
-    T = ((BVEC *) (NB + alen)) + blen;
-
-    cells = (Pebble *) (work->cells);
-    cmax = work->celmax;
-    avail = 0;
-
-    if (COMP(align->flags))
-      boff = align->blen % TRACE_SPACE;
-    else
-      boff = 0;
-  }
-
-  /* When a sequence is compared against itself the band [minp,maxp] limits how far the
-     wave may widen; otherwise the band is unbounded.                                   */
-
-  hgh = maxd;
-  low = mind;
-  if (aseq == bseq)
-    { if (low < 0)
-        { int big = -low;
-          int sml = -hgh;
-
-          if (big <= MICRO_SAT)
-            minp = low - Sat_Width[big];
-          else
-            minp = -SAT_HGH*big;
-          if (sml <= MICRO_SAT)
-            maxp = hgh + Sat_Width[sml];
-          else
-            maxp = -SAT_LOW*sml;
-        }
-      else
-        { if (low <= MICRO_SAT)
-            minp = low - Sat_Width[low];
-          else
-            minp = SAT_LOW*low;
-          if (hgh <= MICRO_SAT)
-            maxp = hgh + Sat_Width[hgh];
-          else
-            maxp = SAT_HGH*hgh;
-        }
-    }
-  else
-    { minp = -INT32_MAX;
-      maxp = INT32_MAX;
-    }
-  dif = 0;
-
-  more = 1;
-  aclip = -INT32_MAX;
-  bclip = INT32_MAX;
-
-  besta = trima = morea = lasta = mida;
-  besty = trimy = morey = (mida-hgh) >> 1;
-  trimd = mored = 0;
-  trimha = moreha = 0;
-  trimhb = morehb = 1;
-  morem = -1;
-
-  /* Wave 0: slide down every starting diagonal for free, seeding two cells (one for the
-     A-trace, one for the B-trace) per diagonal.                                        */
-
-  { int k;
-    char *a;
-
-    a = aseq + low;
-    for (k = low; k <= hgh; k++)
-      { int y, c, d;
-        int ha, hb;
-        int na, nb;
-        Pebble *pb;
-
-        y = (mida-k) >> 1;
-
-        if (avail >= cmax-1)     /* two cells are appended below without a further check */
-          { cmax = ((int) (avail*1.2)) + 10000;
-            cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
-            if (cells == NULL)
-              exit (1);
-          }
-
-        na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE;
-#ifdef SHOW_TPS
-        printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout);
-#endif
-        pb = cells+avail;
-        pb->ptr = -1;
-        pb->diag = k;
-#ifdef DELTAS
-        pb->diff = 0;
-#endif
-        pb->mark = y+k;
-        ha = avail++;
-
-        nb = ((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff;
-#ifdef SHOW_TPS
-        printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout);
-#endif
-        pb = cells+avail;
-        pb->ptr = -1;
-        pb->diag = k;
-#ifdef DELTAS
-        pb->diff = 0;
-#endif
-        pb->mark = y;
-        hb = avail++;
-
-        while (1)
-          { c = bseq[y];
-            if (c == 4)
-              { more = 0;
-                if (bclip > k)
-                  bclip = k;
-                break;
-              }
-            d = a[y];
-            if (c != d)
-              { if (d == 4)
-                  { more = 0;
-                    aclip = k;
-                  }
-                break;
-              }
-            y -= 1;
-          }
-        c = (y << 1) + k;
-
-        while (y+k <= na)
-          { if (avail >= cmax)
-              { cmax = ((int) (avail*1.2)) + 10000;
-                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
-                if (cells == NULL)
-                  exit (1);
-              }
-#ifdef SHOW_TPS
-            printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
-#endif
-            pb = cells+avail;
-            pb->ptr = ha;
-            pb->diag = k;
-#ifdef DELTAS
-            pb->diff = 0;
-#endif
-            pb->mark = na;
-            ha = avail++;
-            na -= TRACE_SPACE;
-          }
-        while (y <= nb)
-          { if (avail >= cmax)
-              { cmax = ((int) (avail*1.2)) + 10000;
-                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
-                if (cells == NULL)
-                  exit (1);
-              }
-#ifdef SHOW_TPS
-            printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout);
-#endif
-            pb = cells+avail;
-            pb->ptr = hb;
-            pb->diag = k;
-#ifdef DELTAS
-            pb->diff = 0;
-#endif
-            pb->mark = nb;
-            hb = avail++;
-            nb -= TRACE_SPACE;
-          }
-
-        if (c < besta)
-          { besta = trima = lasta = c;
-            besty = trimy = y;
-            trimha = ha;
-            trimhb = hb;
-          }
-
-        V[k] = c;
-        T[k] = PATH_INT;
-        M[k] = PATH_LEN;
-        HA[k] = ha;
-        HB[k] = hb;
-        NA[k] = na;
-        NB[k] = nb;
-
-        a += 1;
-      }
-  }
-
-  /* Some diagonal hit a value-4 symbol: shrink the wave past the clipped diagonals and
-     remember the clipped end point with the most matches as the "more" candidate.      */
-
-  if (more == 0)
-    { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
-        more = 1;
-      if (low <= aclip)
-        { low = aclip+1;
-          if (morem <= M[aclip])
-            { morem = M[aclip];
-              morea = V[aclip];
-              morey = (morea - aclip)/2;
-              moreha = HA[aclip];
-              morehb = HB[aclip];
-            }
-        }
-      if (hgh >= bclip)
-        { hgh = bclip-1;
-          if (morem <= M[bclip])
-            { morem = M[bclip];
-              morea = V[bclip];
-              morey = (morea - bclip)/2;
-              moreha = HA[bclip];
-              morehb = HB[bclip];
-            }
-        }
-      aclip = -INT32_MAX;
-      bclip = INT32_MAX;
-    }
-
-#ifdef DEBUG_WAVE
-  printf("\nREVERSE WAVE:\n");
-  print_wave(V,M,low,hgh,besta);
-#endif
-
-  /* Compute successive waves, one more difference each, until extension is blocked or
-     the last qualifying point lags the best point by more than TRIM_MLAG.              */
-
-  while (more && lasta <= besta + TRIM_MLAG)
-    { int k, n;
-      int ua, ub;
-      BVEC t;
-      int am, ac, ap;
-      char *a;
-
-      if (low > minp)
-        { low -= 1;
-          NA[low] = NA[low+1];
-          NB[low] = NB[low+1];
-          V[low] = ap = INT32_MAX;
-        }
-      else
-        ap = V[low];
-      if (hgh < maxp)
-        { hgh += 1;
-          NA[hgh] = NA[hgh-1];
-          NB[hgh] = NB[hgh-1];
-          V[hgh] = INT32_MAX;
-        }
-      dif += 1;
-
-      ac = V[hgh+1] = V[low-1] = INT32_MAX;
-      a = aseq + low;
-      t = PATH_INT;
-      n = PATH_LEN;
-      ua = ub = -1;
-      for (k = low; k <= hgh; k++)
-        { int y, m;
-          int ha, hb;
-          int c, d;
-          BVEC b;
-          Pebble *pb;
-
-          /* am, ac, ap are the previous wave's values on diagonals k-1, k, k+1.  The old
-             state of k-1 was saved in n, t, ua, ub before being overwritten.           */
-
-          am = ac;
-          ac = ap;
-          ap = V[d = k+1];
-
-          if (ac > ap)
-            if (ap > am)
-              { c = am-1;
-                m = n;
-                b = t;
-                ha = ua;
-                hb = ub;
-              }
-            else
-              { c = ap-1;
-                m = M[d];
-                b = T[d];
-                ha = HA[d];
-                hb = HB[d];
-              }
-          else
-            if (ac > am)
-              { c = am-1;
-                m = n;
-                b = t;
-                ha = ua;
-                hb = ub;
-              }
-            else
-              { c = ac-2;
-                m = M[k];
-                b = T[k];
-                ha = HA[k];
-                hb = HB[k];
-              }
-
-          if ((b & PATH_TOP) != 0)     /* the bit about to be shifted out was a match */
-            m -= 1;
-          b <<= 1;
-
-          y = (c-k) >> 1;
-          while (1)
-            { c = bseq[y];
-              if (c == 4)
-                { more = 0;
-                  if (bclip > k)
-                    bclip = k;
-                  break;
-                }
-              d = a[y];
-              if (c != d)
-                { if (d == 4)
-                    { more = 0;
-                      aclip = k;
-                    }
-                  break;
-                }
-              y -= 1;
-              if ((b & PATH_TOP) == 0)
-                m += 1;
-              b = (b << 1) | 1;
-            }
-          c = (y << 1) + k;
-
-          /* Record every trace point crossed; a cell is added only if the inherited
-             chain has not already recorded that same trace point.                      */
-
-          while (y+k <= NA[k])
-            { if (cells[ha].mark > NA[k])
-                { if (avail >= cmax)
-                    { cmax = ((int) (avail*1.2)) + 10000;
-                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
-                                                 "Reallocating trace cells");
-                      if (cells == NULL)
-                        exit (1);
-                    }
-#ifdef SHOW_TPS
-                  printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
-#endif
-                  pb = cells+avail;
-                  pb->ptr = ha;
-                  pb->diag = k;
-#ifdef DELTAS
-                  pb->diff = dif;
-#endif
-                  pb->mark = NA[k];
-                  ha = avail++;
-                }
-              NA[k] -= TRACE_SPACE;
-            }
-          while (y <= NB[k])
-            { if (cells[hb].mark > NB[k])
-                { if (avail >= cmax)
-                    { cmax = ((int) (avail*1.2)) + 10000;
-                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
-                                                 "Reallocating trace cells");
-                      if (cells == NULL)
-                        exit (1);
-                    }
-#ifdef SHOW_TPS
-                  printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout);
-#endif
-                  pb = cells+avail;
-                  pb->ptr = hb;
-                  pb->diag = k;
-#ifdef DELTAS
-                  pb->diff = dif;
-#endif
-                  pb->mark = NB[k];
-                  hb = avail++;
-                }
-              NB[k] -= TRACE_SPACE;
-            }
-
-          if (c < besta)
-            { besta = c;
-              besty = y;
-              if (m >= PATH_AVE)
-                { lasta = c;
-                  if (TABLE[b & TRIM_MASK] >= 0)
-                    if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
-                      { trima = c;
-                        trimy = y;
-                        trimd = dif;
-                        trimha = ha;
-                        trimhb = hb;
-                      }
-                }
-            }
-
-          t = T[k];
-          n = M[k];
-          ua = HA[k];
-          ub = HB[k];
-          V[k] = c;
-          T[k] = b;
-          M[k] = m;
-          HA[k] = ha;
-          HB[k] = hb;
-
-          a += 1;
-        }
-
-      if (more == 0)
-        { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
-            more = 1;
-          if (low <= aclip)
-            { low = aclip+1;
-              if (morem <= M[aclip])
-                { morem = M[aclip];
-                  morea = V[aclip];
-                  morey = (morea - aclip)/2;
-                  mored = dif;
-                  moreha = HA[aclip];
-                  morehb = HB[aclip];
-                }
-            }
-          if (hgh >= bclip)
-            { hgh = bclip-1;
-              if (morem <= M[bclip])
-                { morem = M[bclip];
-                  morea = V[bclip];
-                  morey = (morea - bclip)/2;
-                  mored = dif;
-                  moreha = HA[bclip];
-                  morehb = HB[bclip];
-                }
-            }
-          aclip = -INT32_MAX;
-          bclip = INT32_MAX;
-        }
-
-      /* Drop diagonals at either end of the wave that lag the best by more than WAVE_LAG */
-
-      n = besta + WAVE_LAG;
-      while (hgh >= low)
-        if (V[hgh] > n)
-          hgh -= 1;
-        else
-          { while (V[low] > n)
-              low += 1;
-            break;
-          }
-
-#ifdef WAVE_STATS
-      k = (hgh-low)+1;
-      if (k > MAX)
-        MAX = k;
-      TOT += k;
-      NWV += 1;
-#endif
-
-#ifdef DEBUG_WAVE
-      print_wave(V,M,low,hgh,besta);
-#endif
-    }
-
-  /* Convert the cell chains of the chosen end point into trace entries, prepending them
-     to the entries already present in each path.                                       */
-
-  { uint16 *atrace = (uint16 *) apath->trace;
-    uint16 *btrace = (uint16 *) bpath->trace;
-    int atlen, btlen;
-    int trimx;
-    int a, b, k, h;
-#ifdef DELTAS
-    int d, e;
-#endif
-
-    if (morem >= 0)     /* a clipped end point was recorded: it overrides the trim point */
-      { trimx = morea-morey;
-        trimy = morey;
-        trimd = mored;
-        trimha = moreha;
-        trimhb = morehb;
-      }
-    else
-      trimx = trima-trimy;
-
-    atlen = btlen = 0;
-
-    /* Reverse the A chain in place so it can be walked from the seed outwards */
-
-    a = -1;
-    for (h = trimha; h >= 0; h = b)
-      { b = cells[h].ptr;
-        cells[h].ptr = a;
-        a = h;
-      }
-    h = a;
-
-#ifdef DELTAS
-
-    k = cells[h].diag;
-    b = cells[h].mark - k;
-    e = 0;
-#ifdef SHOW_TRAIL
-    printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout);
-#endif
-    if ((b+k)%TRACE_SPACE != 0)     /* seed is not on a trace point: first segment is partial */
-      { h = cells[h].ptr;
-        if (h < 0)
-          { a = trimy;
-            d = trimd;
-          }
-        else
-          { k = cells[h].diag;
-            a = cells[h].mark - k;
-            d = cells[h].diff;
-          }
-#ifdef SHOW_TRAIL
-        printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
-#endif
-        if (apath->tlen == 0)
-          { atrace[--atlen] = (uint16) (b-a);
-            atrace[--atlen] = (uint16) (d-e);
-          }
-        else
-          { atrace[1] = (uint16) (atrace[1] + (b-a));
-            atrace[0] = (uint16) (atrace[0] + (d-e));
-          }
-        b = a;
-        e = d;
-      }
-    if (h >= 0)
-      { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
-          { k = cells[h].diag;
-            a = cells[h].mark - k;
-            atrace[--atlen] = (uint16) (b-a);
-            d = cells[h].diff;
-            atrace[--atlen] = (uint16) (d-e);
-#ifdef SHOW_TRAIL
-            printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
-#endif
-            b = a;
-            e = d;
-          }
-        if (b+k != trimx)
-          { atrace[--atlen] = (uint16) (b-trimy);
-            atrace[--atlen] = (uint16) (trimd-e);
-#ifdef SHOW_TRAIL
-            printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
-#endif
-          }
-        else if (b != trimy)
-          { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy));
-            atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e));
-#ifdef SHOW_TRAIL
-            printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
-#endif
-          }
-      }
-
-#else // DELTAS
-
-    k = cells[h].diag;
-    b = cells[h].mark - k;
-#ifdef SHOW_TRAIL
-    printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout);
-#endif
-    if ((b+k)%TRACE_SPACE != 0)
-      { h = cells[h].ptr;
-        if (h < 0)
-          a = trimy;
-        else
-          { k = cells[h].diag;
-            a = cells[h].mark - k;
-          }
-#ifdef SHOW_TRAIL
-        printf(" +%4d: (%5d,%5d): %3d\n",h,a+k,a,b-a); fflush(stdout);
-#endif
-        if (apath->tlen == 0)
-          atrace[--atlen] = (uint16) (b-a);
-        else
-          atrace[0] = (uint16) (atrace[0] + (b-a));
-        b = a;
-      }
-    if (h >= 0)
-      { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
-          { k = cells[h].diag;
-            a = cells[h].mark - k;
-            atrace[--atlen] = (uint16) (b-a);
-#ifdef SHOW_TRAIL
-            printf(" %4d: (%5d,%5d): %3d\n",h,a+k,a,b-a); fflush(stdout);
-#endif
-            b = a;
-          }
-        if (b+k != trimx)
-          { atrace[--atlen] = (uint16) (b-trimy);
-#ifdef SHOW_TRAIL
-            printf(" (%5d,%5d): %3d\n",trimx,trimy,b-trimy); fflush(stdout);
-#endif
-          }
-        else if (b != trimy)
-          { atrace[atlen] = (uint16) (atrace[atlen] + (b-trimy));
-#ifdef SHOW_TRAIL
-            printf(" @ (%5d,%5d): %3d\n",trimx,trimy,b-trimy); fflush(stdout);
-#endif
-          }
-      }
-
-#endif // DELTAS
-
-    /* Same again for the B chain, measuring in a-coordinates (mark + k) */
-
-    a = -1;
-    for (h = trimhb; h >= 0; h = b)
-      { b = cells[h].ptr;
-        cells[h].ptr = a;
-        a = h;
-      }
-    h = a;
-
-#ifdef DELTAS
-
-    k = cells[h].diag;
-    b = cells[h].mark + k;
-    e = 0;
-#ifdef SHOW_TRAIL
-    printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout);
-#endif
-    if ((b-k)%TRACE_SPACE != boff)
-      { h = cells[h].ptr;
-        if (h < 0)
-          { a = trimx;
-            d = trimd;
-          }
-        else
-          { k = cells[h].diag;
-            a = cells[h].mark + k;
-            d = cells[h].diff;
-          }
-#ifdef SHOW_TRAIL
-        printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout);
-#endif
-        if (bpath->tlen == 0)
-          { btrace[--btlen] = (uint16) (b-a);
-            /* The diff-count half of the pair is d-e, as in the A-trace code and the
-               else-branch below; it was previously written as a second copy of b-a.    */
-            btrace[--btlen] = (uint16) (d-e);
-          }
-        else
-          { btrace[1] = (uint16) (btrace[1] + (b-a));
-            btrace[0] = (uint16) (btrace[0] + (d-e));
-          }
-        b = a;
-        e = d;
-      }
-
-    if (h >= 0)
-      { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
-          { k = cells[h].diag;
-            a = cells[h].mark + k;
-            btrace[--btlen] = (uint16) (b-a);
-            d = cells[h].diff;
-            btrace[--btlen] = (uint16) (d-e);
-#ifdef SHOW_TRAIL
-            printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout);
-#endif
-            b = a;
-            e = d;
-          }
-        if (b-k != trimy)
-          { btrace[--btlen] = (uint16) (b-trimx);
-            btrace[--btlen] = (uint16) (trimd-e);
-#ifdef SHOW_TRAIL
-            printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout);
-#endif
-          }
-        else if (b != trimx)
-          { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx));
-            btrace[btlen] = (uint16) (btrace[btlen] + (trimd-e));
-#ifdef SHOW_TRAIL
-            printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout);
-#endif
-          }
-      }
-
-#else // DELTAS
-
-    k = cells[h].diag;
-    b = cells[h].mark + k;
-#ifdef SHOW_TRAIL
-    printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout);
-#endif
-    if ((b-k)%TRACE_SPACE != boff)
-      { h = cells[h].ptr;
-        if (h < 0)
-          a = trimx;
-        else
-          { k = cells[h].diag;
-            a = cells[h].mark + k;
-          }
-#ifdef SHOW_TRAIL
-        printf(" +%4d: (%5d,%5d): %3d\n",h,a,a-k,b-a); fflush(stdout);
-#endif
-        if (bpath->tlen == 0)
-          btrace[--btlen] = (uint16) (b-a);
-        else
-          btrace[0] = (uint16) (btrace[0] + (b-a));
-        b = a;
-      }
-
-    if (h >= 0)
-      { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
-          { k = cells[h].diag;
-            a = cells[h].mark + k;
-            btrace[--btlen] = (uint16) (b-a);
-#ifdef SHOW_TRAIL
-            printf(" %4d: (%5d,%5d): %3d\n",h,a,a-k,b-a); fflush(stdout);
-#endif
-            b = a;
-          }
-        if (b-k != trimy)
-          { btrace[--btlen] = (uint16) (b-trimx);
-#ifdef SHOW_TRAIL
-            printf(" (%5d,%5d): %3d\n",trimx,trimy,b-trimx); fflush(stdout);
-#endif
-          }
-        else if (b != trimx)
-          { btrace[btlen] = (uint16) (btrace[btlen] + (b-trimx));
-#ifdef SHOW_TRAIL
-            printf(" @ (%5d,%5d): %3d\n",trimx,trimy,b-trimx); fflush(stdout);
-#endif
-          }
-      }
-
-#endif // DELTAS
-
-    /* atlen and btlen are <= 0 here: they count the entries prepended above */
-
-    apath->abpos = trimx;
-    apath->bbpos = trimy;
-    apath->diffs = apath->diffs + trimd;
-    apath->tlen = apath->tlen - atlen;
-    apath->trace = atrace + atlen;
-    if (COMP(align->flags))
-      { bpath->aepos = align->blen - apath->bbpos;
-        bpath->bepos = align->alen - apath->abpos;
-      }
-    else
-      { bpath->abpos = apath->bbpos;
-        bpath->bbpos = apath->abpos;
-      }
-    bpath->diffs = bpath->diffs + trimd;
-    bpath->tlen = bpath->tlen - btlen;
-    bpath->trace = btrace + btlen;
-  }
-
-  work->cells = (void *) cells;
-  work->celmax = cmax;
-}
-
-
-/* Find a local alignment between align->aseq and align->bseq seeded on the a-interval
-   [xlow,xhgh] at b-position ycnt: a forward wave is run from the diagonals
-   xlow-ycnt .. xhgh-ycnt at anti-diagonal (xlow+xhgh)/2+ycnt, and a reverse wave is then
-   run from the single diagonal the forward pass returns.  The A-path is left in
-   align->path; the returned B-path lives in work->points and is overwritten by the next
-   call.  See the associated .h file for the precise definition of the interface.
-*/
-
-Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
-                      int xlow, int xhgh, int ycnt)
-{ _Work_Data *work = ( _Work_Data *) ework;
-  _Align_Spec *spec = (_Align_Spec *) espec;
-
-  Path *apath, *bpath;
-
-  /* Size the work buffers and lay out the two trace arrays inside work->points */
-
-  { int alen = align->alen;
-    int blen = align->blen;
-    int longer, maxtp, wsize;
-
-    wsize = (6*sizeof(int) + sizeof(BVEC))*(alen+blen+3);
-    if (wsize >= work->vecmax)
-      enlarge_vector(work,wsize);
-
-    longer = (alen < blen) ? blen : alen;
-    maxtp = 2*(longer/spec->trace_space+2);
-    wsize = 4*maxtp*sizeof(uint16) + sizeof(Path);
-    if (wsize > work->pntmax)
-      enlarge_points(work,wsize);
-
-    apath = align->path;
-    bpath = (Path *) work->points;
-
-    /* Each trace pointer starts maxtp entries into its own 2*maxtp region, leaving room
-       in front of it as well as behind it.                                             */
-
-    apath->trace = ((uint16 *) (bpath+1)) + maxtp;
-    bpath->trace = ((uint16 *) apath->trace) + 2*maxtp;
-  }
-
-#ifdef DEBUG_PASSES
-  printf("\n");
-#endif
-
-  { int dlow = xlow-ycnt;              /* lowest seed diagonal  */
-    int dhgh = xhgh-ycnt;              /* highest seed diagonal */
-    int anti = (xlow+xhgh)/2+ycnt;     /* seed anti-diagonal    */
-    int dmid;
-
-    dmid = forward_wave(work,spec,align,bpath,dlow,dhgh,anti);
-#ifdef DEBUG_PASSES
-    printf("F1 (%d-%d,%d) => (%d,%d) %d\n",xlow,xhgh,ycnt,apath->aepos,apath->bepos,apath->diffs);
-#endif
-
-    reverse_wave(work,spec,align,bpath,dmid,dmid,anti);
-#ifdef DEBUG_PASSES
-    printf("R1 (%d,%d) => (%d,%d) %d\n",dmid,ycnt,apath->abpos,apath->bbpos,apath->diffs);
-#endif
-  }
-
-  /* For a complemented B-read the B-trace is reversed (pairwise under DELTAS) */
-
-  if (COMP(align->flags))
-    { uint16 *trace = (uint16 *) bpath->trace;
-      uint16 swap;
-      int head, tail;
-
-#ifdef DELTAS
-      for (tail = bpath->tlen-2, head = 0; head < tail; tail -= 2, head += 2)
-        { swap = trace[tail];
-          trace[tail] = trace[head];
-          trace[head] = swap;
-          swap = trace[tail+1];
-          trace[tail+1] = trace[head+1];
-          trace[head+1] = swap;
-        }
-#else
-      for (tail = bpath->tlen-1, head = 0; head < tail; tail -= 1, head += 1)
-        { swap = trace[tail];
-          trace[tail] = trace[head];
-          trace[head] = swap;
-        }
-#endif
-    }
-
-#ifdef DEBUG_POINTS
-  { uint16 *trace = (uint16 *) apath->trace;
-    int pos, h;
-
-    printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos);
-    printf(" %c\n",(COMP(align->flags) ? 'c' : 'n'));
-    pos = apath->bbpos;
-#ifdef DELTAS
-    for (h = 1; h < apath->tlen; h += 2)
-      { int dif = trace[h-1];
-        int del = trace[h];
-        pos += del;
-        printf(" %d / %d (%d)\n",dif,del,pos);
-      }
-#else
-    for (h = 0; h < apath->tlen; h++)
-      { int del = trace[h];
-        pos += del;
-        printf(" %d (%d)\n",del,pos);
-      }
-#endif
-  }
-
-  { uint16 *trace = (uint16 *) bpath->trace;
-    int pos, h;
-
-    printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos);
-    printf(" %c [%d,%d]\n",(COMP(align->flags) ? 'c' : 'n'),align->blen,align->alen);
-    pos = bpath->bbpos;
-#ifdef DELTAS
-    for (h = 1; h < bpath->tlen; h += 2)
-      { int dif = trace[h-1];
-        int del = trace[h];
-        pos += del;
-        printf(" %d / %d (%d)\n",dif,del,pos);
-      }
-#else
-    for (h = 0; h < bpath->tlen; h++)
-      { int del = trace[h];
-        pos += del;
-        printf(" %d (%d)\n",del,pos);
-      }
-#endif
-  }
-#endif
-
-  return (bpath);
-}
-
-
-/****************************************************************************************\
-* *
-* OVERLAP MANIPULATION *
-* *
-\****************************************************************************************/
-
-static int64 PtrSize = sizeof(void *);   /* leading bytes of an Overlap skipped by Read_Overlap/Write_Overlap */
-static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *);   /* bytes of an Overlap actually read or written */
-
-/* Read one overlap record from input into *ovl, leaving the first PtrSize bytes of the
-   structure untouched.  Returns 0 on success and 1 if a full record could not be read. */
-
-int Read_Overlap(FILE *input, Overlap *ovl)
-{ char *dest = ((char *) ovl) + PtrSize;
-
-  return (fread(dest, OvlIOSize, 1, input) != 1);
-}
-
-/* Read the ovl->path.tlen trace entries of tbytes bytes each into ovl->path.trace.
-   Nothing is read when either count is not positive.  Returns 0 on success (or when
-   there was nothing to read) and 1 if the read came up short.                          */
-
-int Read_Trace(FILE *input, Overlap *ovl, int tbytes)
-{ if (tbytes <= 0 || ovl->path.tlen <= 0)
-    return (0);
-
-  if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1)
-    return (1);
-  return (0);
-}
-
-/* Write the overlap record *ovl (minus its first PtrSize bytes) to output, followed by
-   its trace of ovl->path.tlen entries of tbytes bytes each when the trace pointer is
-   not NULL.  The results of fwrite are not examined, so write errors go unreported.    */
-
-void Write_Overlap(FILE *output, Overlap *ovl, int tbytes)
-{ char *record = ((char *) ovl) + PtrSize;
-
-  fwrite(record, OvlIOSize, 1, output);
-  if (ovl->path.trace == NULL)
-    return;
-  fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output);
-}
-
-/* Narrow the trace of *ovl in place from uint16 entries to uint8 entries.  The two views
-   share one buffer, so the copy must run from the first entry upwards.                 */
-
-void Compress_TraceTo8(Overlap *ovl)
-{ uint16 *wide = (uint16 *) ovl->path.trace;
-  uint8 *narrow = (uint8 *) ovl->path.trace;
-  int idx = 0;
-
-  while (idx < ovl->path.tlen)
-    { narrow[idx] = (uint8) (wide[idx]);
-      idx += 1;
-    }
-}
-
-/* Widen the trace of *ovl in place from uint8 entries to uint16 entries.  The two views
-   share one buffer, so the copy must run from the last entry downwards.                */
-
-void Decompress_TraceTo16(Overlap *ovl)
-{ uint16 *wide = (uint16 *) ovl->path.trace;
-  uint8 *narrow = (uint8 *) ovl->path.trace;
-  int idx = ovl->path.tlen-1;
-
-  while (idx >= 0)
-    { wide[idx] = narrow[idx];
-      idx -= 1;
-    }
-}
-
-/* Print a description of overlap *ovl to output, each line indented by indent columns:
-   the two read numbers (the B-read shown as c(n) when complemented), the aligned
-   intervals with the difference count, and then the trace as a running sum of B-read
-   positions starting from path.bbpos.  tbytes == 1 selects a uint8 trace, any other
-   value a uint16 trace.
-
-   The trace is printed only if the pointer is not NULL and it holds at least one
-   complete entry (one value, or one diff/delta pair under DELTAS), so an empty trace
-   no longer causes reads of entries that are not there.
-
-   NOTE(review): under DELTAS the loop index i is always odd, so the line-wrap test
-   i%10 == 0 never fires; left unchanged because the intended wrap width is unclear.    */
-
-void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent)
-{ int i;
-  int need;     /* minimum tlen for which the first trace entry exists */
-
-#ifdef DELTAS
-  need = 2;
-#else
-  need = 1;
-#endif
-
-  fprintf(output,"%*s%d vs. ",indent,"",ovl->aread);
-  if (COMP(ovl->flags))
-    fprintf(output,"c(%d)\n",ovl->bread);
-  else
-    fprintf(output,"%d\n",ovl->bread);
-  fprintf(output,"%*s [%d,%d] vs [%d,%d] w. %d diffs\n",indent,"",
-                 ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs);
-
-  if (tbytes == 1)
-    { uint8 *trace = (uint8 *) (ovl->path.trace);
-      if (trace != NULL && ovl->path.tlen >= need)
-        { int p;
-
-#ifdef DELTAS
-          p = ovl->path.bbpos + trace[1];
-          fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p);
-          for (i = 3; i < ovl->path.tlen; i += 2)
-            { if (i%10 == 0)
-                fprintf(output,"\n%*s",indent+6,"");
-              p += trace[i];
-              fprintf(output," %3d/%5d",trace[i-1],p);
-            }
-#else
-          p = ovl->path.bbpos + trace[0];
-          fprintf(output,"%*sTrace: %5d",indent,"",p);
-          for (i = 1; i < ovl->path.tlen; i++)
-            { if (i%10 == 0)
-                fprintf(output,"\n%*s",indent+6,"");
-              p += trace[i];
-              fprintf(output," %5d",p);
-            }
-#endif
-          fprintf(output,"\n");
-        }
-    }
-  else
-    { uint16 *trace = (uint16 *) (ovl->path.trace);
-      if (trace != NULL && ovl->path.tlen >= need)
-        { int p;
-
-#ifdef DELTAS
-          p = ovl->path.bbpos + trace[1];
-          fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p);
-          for (i = 3; i < ovl->path.tlen; i += 2)
-            { if (i%10 == 0)
-                fprintf(output,"\n%*s",indent+6,"");
-              p += trace[i];
-              fprintf(output," %3d/%5d",trace[i-1],p);
-            }
-#else
-          p = ovl->path.bbpos + trace[0];
-          fprintf(output,"%*sTrace: %5d",indent,"",p);
-          for (i = 1; i < ovl->path.tlen; i++)
-            { if (i%10 == 0)
-                fprintf(output,"\n%*s",indent+6,"");
-              p += trace[i];
-              fprintf(output," %5d",p);
-            }
-#endif
-          fprintf(output,"\n");
-        }
-    }
-}
-
-/* Sanity-check the trace of *ovl for trace-point spacing tspace.  Two things are
-   verified: that the number of trace entries matches the number of tspace-wide panels
-   the A-interval [abpos,aepos) touches, and that the trace deltas sum from path.bbpos
-   to path.bepos.  The trace is read as uint8 when tspace <= TRACE_XOVR and as uint16
-   otherwise.  Returns 0 if both checks pass, 1 otherwise; when verbose is set a message
-   naming fname is written to stderr on failure.                                        */
-
-int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname)
-{ int step;       /* trace entries per trace point: 2 under DELTAS, else 1 */
-  int panels;     /* trace-point panels crossed after the first            */
-  int i, sum;
-
-#ifdef DELTAS
-  step = 2;
-#else
-  step = 1;
-#endif
-
-  panels = (ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace;
-  if (panels*step != ovl->path.tlen-step)
-    { if (verbose)
-        fprintf(stderr," %s: Wrong number of trace points\n",fname);
-      return (1);
-    }
-
-  /* The delta is the last value of each entry, hence the start index step-1 */
-
-  sum = ovl->path.bbpos;
-  if (tspace <= TRACE_XOVR)
-    { uint8 *trace8 = (uint8 *) ovl->path.trace;
-
-      for (i = step-1; i < ovl->path.tlen; i += step)
-        sum += trace8[i];
-    }
-  else
-    { uint16 *trace16 = (uint16 *) ovl->path.trace;
-
-      for (i = step-1; i < ovl->path.tlen; i += step)
-        sum += trace16[i];
-    }
-
-  if (sum == ovl->path.bepos)
-    return (0);
-  if (verbose)
-    fprintf(stderr," %s: Trace point sum != aligned interval\n",fname);
-  return (1);
-}
-
-
-/* Exchange the roles of the A- and B-sequence in *align: the sequence pointers and
-   lengths are swapped and the path end points are rewritten to match.  When the
-   alignment is complemented the coordinates are also reflected about the sequence ends.
-   If full is non-zero the trace, read as an int array of path->tlen entries, is
-   converted too: negated in the uncomplemented case, and re-based and reversed in the
-   complemented case.                                                                   */
-
-void Flip_Alignment(Alignment *align, int full)
-{ char *aseq = align->aseq;
-  char *bseq = align->bseq;
-  int alen = align->alen;
-  int blen = align->blen;
-  Path *path = align->path;
-
-  int *trace = (int *) path->trace;
-  int tlen = path->tlen;
-
-  int ab = path->abpos;     /* end points as they were on entry */
-  int ae = path->aepos;
-  int bb = path->bbpos;
-  int be = path->bepos;
-
-  if (COMP(align->flags))
-    { path->abpos = blen - be;
-      path->bepos = alen - ab;
-      path->aepos = blen - bb;
-      path->bbpos = alen - ae;
-
-      if (full)
-        { int abase = alen + 2;
-          int bbase = blen + 2;
-          int head, tail, v;
-
-          for (head = 0; head < tlen; head++)
-            { v = trace[head];
-              trace[head] = (v < 0) ? (abase + v) : (v - bbase);
-            }
-
-          for (head = 0, tail = tlen-1; head < tail; head++, tail--)
-            { v = trace[tail];
-              trace[tail] = trace[head];
-              trace[head] = v;
-            }
-        }
-    }
-  else
-    { path->abpos = bb;
-      path->bbpos = ab;
-      path->aepos = be;
-      path->bepos = ae;
-
-      if (full)
-        { int idx;
-
-          for (idx = 0; idx < tlen; idx++)
-            trace[idx] = - (trace[idx]);
-        }
-    }
-
-  align->aseq = bseq;
-  align->bseq = aseq;
-  align->alen = blen;
-  align->blen = alen;
-}
-
-
-/****************************************************************************************\
-* *
-* ALIGNMENT PRINTING *
-* *
-\****************************************************************************************/
-
-/* Complement the sequence in fragment aseq. The operation does the
- complementation/reversal in place. Calling it a second time on a
- given fragment restores it to its original state. */
-
-void Complement_Seq(char *aseq, int len)
-{ char *s, *t;
- int c;
-
- s = aseq;
- t = aseq + (len-1);
- while (s < t)
- { c = 3 - *s;
- *s++ = (char) (3 - *t);
- *t-- = (char) c;
- }
- if (s == t)
- *s = (char) (3 - *s);
-}
-
-/* Print an alignment to file between a and b given in trace (unpacked).
- Prefix gives the length of the initial prefix of a that is unaligned. */
-
-static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' };
-static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' };
-
-void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
- int indent, int width, int border, int upper, int coord)
-{ _Work_Data *work = (_Work_Data *) ework;
- int *trace = align->path->trace;
- int tlen = align->path->tlen;
-
- char *Abuf, *Bbuf, *Dbuf;
- int i, j, o;
- char *a, *b;
- char mtag, dtag;
- int prefa, prefb;
- int aend, bend;
- int sa, sb;
- int match, diff;
- char *N2A;
-
- if (trace == NULL) return;
-
-#ifdef SHOW_TRACE
- fprintf(file,"\nTrace:\n");
- for (i = 0; i < tlen; i++)
- fprintf(file," %3d\n",trace[i]);
-#endif
-
- o = sizeof(char)*3*(width+1);
- if (o > work->vecmax)
- enlarge_vector(work,o);
-
- if (upper)
- N2A = ToU;
- else
- N2A = ToL;
-
- Abuf = (char *) work->vector;
- Bbuf = Abuf + (width+1);
- Dbuf = Bbuf + (width+1);
-
- aend = align->path->aepos;
- bend = align->path->bepos;
-
- Abuf[width] = Bbuf[width] = Dbuf[width] = '\0';
- /* buffer/output next column */
-#define COLUMN(x,y) \
-{ int u, v; \
- if (o >= width) \
- { fprintf(file,"\n"); \
- fprintf(file,"%*s",indent,""); \
- if (coord > 0) \
- { if (sa <= aend) \
- fprintf(file," %*d",coord,sa); \
- else \
- fprintf(file," %*s",coord,""); \
- fprintf(file," %s\n",Abuf); \
- fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf); \
- fprintf(file,"%*s",indent,""); \
- if (sb <= bend) \
- fprintf(file," %*d",coord,sb); \
- else \
- fprintf(file," %*s",coord,""); \
- fprintf(file," %s",Bbuf); \
- } \
- else \
- { fprintf(file," %s\n",Abuf); \
- fprintf(file,"%*s %s\n",indent,"",Dbuf); \
- fprintf(file,"%*s %s",indent,"",Bbuf); \
- } \
- fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \
- o = 0; \
- sa = i; \
- sb = j; \
- match = diff = 0; \
- } \
- u = (x); \
- v = (y); \
- if (u == 4 || v == 4) \
- Dbuf[o] = ' '; \
- else if (u == v) \
- Dbuf[o] = mtag; \
- else \
- Dbuf[o] = dtag; \
- Abuf[o] = N2A[u]; \
- Bbuf[o] = N2A[v]; \
- o += 1; \
-}
-
- a = align->aseq - 1;
- b = align->bseq - 1;
-
- o = 0;
- i = j = 1;
-
- prefa = align->path->abpos;
- prefb = align->path->bbpos;
-
- if (prefa > border)
- { i = prefa-(border-1);
- prefa = border;
- }
- if (prefb > border)
- { j = prefb-(border-1);
- prefb = border;
- }
-
- sa = i;
- sb = j;
- mtag = ':';
- dtag = ':';
-
- while (prefa > prefb)
- { COLUMN(a[i],4)
- i += 1;
- prefa -= 1;
- }
- while (prefb > prefa)
- { COLUMN(4,b[j])
- j += 1;
- prefb -= 1;
- }
- while (prefa > 0)
- { COLUMN(a[i],b[j])
- i += 1;
- j += 1;
- prefa -= 1;
- }
-
- mtag = '[';
- if (prefb > 0)
- COLUMN(5,5)
-
- mtag = '|';
- dtag = '*';
-
- match = diff = 0;
-
- { int p, c; /* Output columns of alignment til reach trace end */
-
- for (c = 0; c < tlen; c++)
- if ((p = trace[c]) < 0)
- { p = -p;
- while (i != p)
- { COLUMN(a[i],b[j])
- if (a[i] == b[j])
- match += 1;
- else
- diff += 1;
- i += 1;
- j += 1;
- }
- COLUMN(7,b[j])
- j += 1;
- diff += 1;
- }
- else
- { while (j != p)
- { COLUMN(a[i],b[j])
- if (a[i] == b[j])
- match += 1;
- else
- diff += 1;
- i += 1;
- j += 1;
- }
- COLUMN(a[i],7)
- i += 1;
- diff += 1;
- }
- p = align->path->aepos;
- while (i <= p)
- { COLUMN(a[i],b[j])
- if (a[i] == b[j])
- match += 1;
- else
- diff += 1;
- i += 1;
- j += 1;
- }
- }
-
- { int c; /* Output remaining column including unaligned suffix */
-
- mtag = ']';
- if (a[i] != 4 && b[j] != 4 && border > 0)
- COLUMN(6,6)
-
- mtag = ':';
- dtag = ':';
-
- c = 0;
- while (c < border && (a[i] != 4 || b[j] != 4))
- { if (a[i] != 4)
- if (b[j] != 4)
- { COLUMN(a[i],b[j])
- i += 1;
- j += 1;
- }
- else
- { COLUMN(a[i],4)
- i += 1;
- }
- else
- { COLUMN(4,b[j])
- j += 1;
- }
- c += 1;
- }
- }
-
- /* Print remainder of buffered col.s */
-
- fprintf(file,"\n");
- fprintf(file,"%*s",indent,"");
- if (coord > 0)
- { if (sa <= aend)
- fprintf(file," %*d",coord,sa);
- else
- fprintf(file," %*s",coord,"");
- fprintf(file," %.*s\n",o,Abuf);
- fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
- fprintf(file,"%*s",indent,"");
- if (sb <= bend)
- fprintf(file," %*d",coord,sb);
- else
- fprintf(file," %*s",coord,"");
- fprintf(file," %.*s",o,Bbuf);
- }
- else
- { fprintf(file," %.*s\n",o,Abuf);
- fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf);
- fprintf(file,"%*s %.*s",indent,"",o,Bbuf);
- }
- if (diff+match > 0)
- fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));
- else
- fprintf(file,"\n");
-
- fflush(file);
-}
-
-void Print_Reference(FILE *file, Alignment *align, Work_Data *ework,
- int indent, int block, int border, int upper, int coord)
-{ _Work_Data *work = (_Work_Data *) ework;
- int *trace = align->path->trace;
- int tlen = align->path->tlen;
-
- char *Abuf, *Bbuf, *Dbuf;
- int i, j, o;
- char *a, *b;
- char mtag, dtag;
- int prefa, prefb;
- int aend, bend;
- int sa, sb, s0;
- int match, diff;
- char *N2A;
- int vmax;
-
- if (trace == NULL) return;
-
-#ifdef SHOW_TRACE
- fprintf(file,"\nTrace:\n");
- for (i = 0; i < tlen; i++)
- fprintf(file," %3d\n",trace[i]);
-#endif
-
- vmax = work->vecmax/3;
- o = sizeof(char)*6*(block+1);
- if (o > vmax)
- { enlarge_vector(work,3*o);
- vmax = work->vecmax/3;
- }
-
- Abuf = (char *) work->vector;
- Bbuf = Abuf + vmax;
- Dbuf = Bbuf + vmax;
-
- if (upper)
- N2A = ToU;
- else
- N2A = ToL;
-
- aend = align->path->aepos;
- bend = align->path->bepos;
-
-#define BLOCK(x,y) \
-{ int u, v; \
- if (i%block == 1 && i != s0 && x != 7 && o > 0) \
- { fprintf(file,"\n"); \
- fprintf(file,"%*s",indent,""); \
- if (coord > 0) \
- { if (sa <= aend) \
- fprintf(file," %*d",coord,sa); \
- else \
- fprintf(file," %*s",coord,""); \
- fprintf(file," %.*s\n",o,Abuf); \
- fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \
- fprintf(file,"%*s",indent,""); \
- if (sb <= bend) \
- fprintf(file," %*d",coord,sb); \
- else \
- fprintf(file," %*s",coord,""); \
- fprintf(file," %.*s",o,Bbuf); \
- } \
- else \
- { fprintf(file," %.*s\n",o,Abuf); \
- fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); \
- fprintf(file,"%*s %.*s",indent,"",o,Bbuf); \
- } \
- fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \
- o = 0; \
- sa = i; \
- sb = j; \
- match = diff = 0; \
- } \
- u = (x); \
- v = (y); \
- if (u == 4 || v == 4) \
- Dbuf[o] = ' '; \
- else if (u == v) \
- Dbuf[o] = mtag; \
- else \
- Dbuf[o] = dtag; \
- Abuf[o] = N2A[u]; \
- Bbuf[o] = N2A[v]; \
- o += 1; \
- if (o >= vmax) \
- { enlarge_vector(work,3*o); \
- vmax = work->vecmax/3; \
- memmove(work->vector+2*vmax,Dbuf,o); \
- memmove(work->vector+vmax,Bbuf,o); \
- memmove(work->vector,Abuf,o); \
- Abuf = (char *) work->vector; \
- Bbuf = Abuf + vmax; \
- Dbuf = Bbuf + vmax; \
- } \
-}
-
- a = align->aseq - 1;
- b = align->bseq - 1;
-
- o = 0;
- i = j = 1;
-
- prefa = align->path->abpos;
- prefb = align->path->bbpos;
-
- if (prefa > border)
- { i = prefa-(border-1);
- prefa = border;
- }
- if (prefb > border)
- { j = prefb-(border-1);
- prefb = border;
- }
-
- s0 = i;
- sa = i;
- sb = j;
- mtag = ':';
- dtag = ':';
-
- while (prefa > prefb)
- { BLOCK(a[i],4)
- i += 1;
- prefa -= 1;
- }
- while (prefb > prefa)
- { BLOCK(4,b[j])
- j += 1;
- prefb -= 1;
- }
- while (prefa > 0)
- { BLOCK(a[i],b[j])
- i += 1;
- j += 1;
- prefa -= 1;
- }
-
- mtag = '[';
- if (prefb > 0)
- BLOCK(5,5)
-
- mtag = '|';
- dtag = '*';
-
- match = diff = 0;
-
- { int p, c; /* Output columns of alignment til reach trace end */
-
- for (c = 0; c < tlen; c++)
- if ((p = trace[c]) < 0)
- { p = -p;
- while (i != p)
- { BLOCK(a[i],b[j])
- if (a[i] == b[j])
- match += 1;
- else
- diff += 1;
- i += 1;
- j += 1;
- }
- BLOCK(7,b[j])
- j += 1;
- diff += 1;
- }
- else
- { while (j != p)
- { BLOCK(a[i],b[j])
- if (a[i] == b[j])
- match += 1;
- else
- diff += 1;
- i += 1;
- j += 1;
- }
- BLOCK(a[i],7)
- i += 1;
- diff += 1;
- }
- p = align->path->aepos;
- while (i <= p)
- { BLOCK(a[i],b[j])
- if (a[i] == b[j])
- match += 1;
- else
- diff += 1;
- i += 1;
- j += 1;
- }
- }
-
- { int c; /* Output remaining column including unaligned suffix */
-
- mtag = ']';
- if (a[i] != 4 && b[j] != 4 && border > 0)
- BLOCK(6,6)
-
- mtag = ':';
- dtag = ':';
-
- c = 0;
- while (c < border && (a[i] != 4 || b[j] != 4))
- { if (a[i] != 4)
- if (b[j] != 4)
- { BLOCK(a[i],b[j])
- i += 1;
- j += 1;
- }
- else
- { BLOCK(a[i],4)
- i += 1;
- }
- else
- { BLOCK(4,b[j])
- j += 1;
- }
- c += 1;
- }
- }
-
- /* Print remainder of buffered col.s */
-
- fprintf(file,"\n");
- fprintf(file,"%*s",indent,"");
- if (coord > 0)
- { if (sa <= aend)
- fprintf(file," %*d",coord,sa);
- else
- fprintf(file," %*s",coord,"");
- fprintf(file," %.*s\n",o,Abuf);
- fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
- fprintf(file,"%*s",indent,"");
- if (sb <= bend)
- fprintf(file," %*d",coord,sb);
- else
- fprintf(file," %*s",coord,"");
- fprintf(file," %.*s",o,Bbuf);
- }
- else
- { fprintf(file," %.*s\n",o,Abuf);
- fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf);
- fprintf(file,"%*s %.*s",indent,"",o,Bbuf);
- }
- if (diff+match > 0)
- fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));
- else
- fprintf(file,"\n");
-
- fflush(file);
-}
-
-/* Print an ASCII representation of the overlap in align between fragments
- a and b to given file. */
-
-static inline void repchar(FILE *file, int symbol, int rep)
-{ while (rep-- > 0)
- fputc(symbol,file);
-}
-
-void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord)
-{ int alen = align->alen;
- int blen = align->blen;
- Path *path = align->path;
- int comp = COMP(align->flags);
- int w;
-
- fprintf(file,"%*s",indent,"");
- if (path->abpos > 0)
- fprintf(file," %*d ",coord,path->abpos);
- else
- fprintf(file,"%*s",coord+5,"");
- if (path->aepos < alen)
- fprintf(file,"%*s%d",coord+8,"",alen-path->aepos);
- fprintf(file,"\n");
-
- fprintf(file,"%*s",indent,"");
- if (path->abpos > 0)
- { fprintf(file,"A ");
- w = Number_Digits((int64) path->abpos);
- repchar(file,' ',coord-w);
- repchar(file,'=',w+3);
- fputc('+',file);
- repchar(file,'-',coord+5);
- }
- else
- { fprintf(file,"A %*s",coord+4,"");
- repchar(file,'-',coord+5);
- }
-
- if (path->aepos < alen)
- { fputc('+',file);
- w = Number_Digits((int64) (alen-path->aepos));
- repchar(file,'=',w+2);
- fputc('>',file);
- repchar(file,' ',w);
- }
- else
- { fputc('>',file);
- repchar(file,' ',coord+3);
- }
-
- { int asub, bsub;
-
- asub = path->aepos - path->abpos;
- bsub = path->bepos - path->bbpos;
- fprintf(file," dif/(len1+len2) = %d/(%d+%d) = %5.2f%%\n",
- path->diffs,asub,bsub,(200.*path->diffs)/(asub+bsub));
- }
-
- { int sym1e, sym2e;
- int sym1p, sym2p;
-
- if (comp > 0)
- { sym1p = '<'; sym2p = '-'; sym1e = '<'; sym2e = '='; }
- else
- { sym1p = '-'; sym2p = '>'; sym1e = '='; sym2e = '>'; }
-
- fprintf(file,"%*s",indent,"");
- if (path->bbpos > 0)
- { fprintf(file,"B ");
- w = Number_Digits((int64) path->bbpos);
- repchar(file,' ',coord-w);
- fputc(sym1e,file);
- repchar(file,'=',w+2);
- fputc('+',file);
- repchar(file,'-',coord+5);
- }
- else
- { fprintf(file,"B ");
- repchar(file,' ',coord+3);
- fputc(sym1p,file);
- repchar(file,'-',coord+5);
- }
- if (path->bepos < blen)
- { fprintf(file,"+");
- w = Number_Digits((int64) (blen-path->bepos));
- repchar(file,'=',w+2);
- fprintf(file,"%c\n",sym2e);
- }
- else
- fprintf(file,"%c\n",sym2p);
- }
-
- fprintf(file,"%*s",indent,"");
- if (path->bbpos > 0)
- fprintf(file," %*d ",coord,path->bbpos);
- else
- fprintf(file,"%*s",coord+5,"");
- if (path->bepos < blen)
- fprintf(file,"%*s%d",coord+8,"",blen-path->bepos);
- fprintf(file,"\n");
-
- fflush(file);
-}
-
-
-/****************************************************************************************\
-* *
-* O(ND) trace algorithm *
-* *
-\****************************************************************************************/
-
-#ifdef DEBUG_AWAVE
-
-static void print_awave(int *V, int low, int hgh)
-{ int k;
-
- printf(" [%6d,%6d]: ",low,hgh);
- for (k = low; k <= hgh; k++)
- printf(" %3d",V[k]);
- printf("\n");
- fflush(stdout);
-}
-
-#endif
-
-#ifdef DEBUG_ALIGN
-
-static int depth = 0;
-
-#endif
-
-typedef struct
- { int *Stop; // Ongoing stack of alignment indels
- char *Aabs, *Babs; // Absolute base of A and B sequences
-
- int **PVF, **PHF; // List of waves for iterative np algorithms
- int mida, midb; // mid point division for mid-point algorithms
-
- int *VF, *VB; // Forward/Reverse waves for nd algorithms
- // (defunct: were used for O(nd) algorithms)
- } Trace_Waves;
-
-static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
-{ int x, y;
- int D;
-
-#ifdef DEBUG_ALIGN
- printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
-#endif
-
- if (M <= 0)
- { x = (wave->Aabs-A)-1;
- for (y = 1; y <= N; y++)
- { *wave->Stop++ = x;
-#ifdef DEBUG_SCRIPT
- printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1);
-#endif
- }
- return (N);
- }
- if (N <= 0)
- { y = (B-wave->Babs)+1;
- for (x = 1; x <= M; x++)
- { *wave->Stop++ = y;
-#ifdef DEBUG_SCRIPT
- printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1);
-#endif
- }
- return (M);
- }
-
- { int *VF = wave->VF;
- int *VB = wave->VB;
- int flow; // fhgh == D !
- int blow, bhgh;
- char *a;
-
- y = 0;
- if (N < M)
- while (y < N && B[y] == A[y])
- y += 1;
- else
- { while (y < M && B[y] == A[y])
- y += 1;
- if (y >= M && N == M)
- return (0);
- }
-
- flow = 0;
- VF[0] = y;
- VF[-1] = -2;
-
- x = N-M;
- a = A-x;
- y = N-1;
- if (N > M)
- while (y >= x && B[y] == a[y])
- y -= 1;
- else
- while (y >= 0 && B[y] == a[y])
- y -= 1;
-
- blow = bhgh = -x;
- VB += x;
- VB[blow] = y;
- VB[blow-1] = N+1;
-
- for (D = 1; 1; D += 1)
- { int k, r;
- int am, ac, ap;
-
- // Forward wave
-
- flow -= 1;
- am = ac = VF[flow-1] = -2;
-
- a = A + D;
- x = M - D;
- for (k = D; k >= flow; k--)
- { ap = ac;
- ac = am+1;
- am = VF[k-1];
-
- if (ac < am)
- if (ap < am)
- y = am;
- else
- y = ap;
- else
- if (ap < ac)
- y = ac;
- else
- y = ap;
-
- if (blow <= k && k <= bhgh)
- { r = VB[k];
- if (y > r)
- { D = (D<<1)-1;
- if (ap > r)
- y = ap;
- else if (ac > r)
- y = ac;
- else
- y = r+1;
- x = k+y;
- goto OVERLAP2;
- }
- }
-
- if (N < x)
- while (y < N && B[y] == a[y])
- y += 1;
- else
- while (y < x && B[y] == a[y])
- y += 1;
-
- VF[k] = y;
- a -= 1;
- x += 1;
- }
-
-#ifdef DEBUG_AWAVE
- print_awave(VF,flow,D);
-#endif
-
- // Reverse Wave
-
- bhgh += 1;
- blow -= 1;
- am = ac = VB[blow-1] = N+1;
-
- a = A + bhgh;
- x = -bhgh;
- for (k = bhgh; k >= blow; k--)
- { ap = ac+1;
- ac = am;
- am = VB[k-1];
-
- if (ac > am)
- if (ap > am)
- y = am;
- else
- y = ap;
- else
- if (ap > ac)
- y = ac;
- else
- y = ap;
-
- if (flow <= k && k <= D)
- { r = VF[k];
- if (y <= r)
- { D = (D << 1);
- if (ap <= r)
- y = ap;
- else if (ac <= r)
- y = ac;
- else
- y = r;
- x = k+y;
- goto OVERLAP2;
- }
- }
-
- y -= 1;
- if (x > 0)
- while (y >= x && B[y] == a[y])
- y -= 1;
- else
- while (y >= 0 && B[y] == a[y])
- y -= 1;
-
- VB[k] = y;
- a -= 1;
- x += 1;
- }
-
-#ifdef DEBUG_AWAVE
- print_awave(VB,blow,bhgh);
-#endif
- }
- }
-
-OVERLAP2:
-
-#ifdef DEBUG_ALIGN
- printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);
- fflush(stdout);
-#endif
- if (D > 1)
- {
-#ifdef DEBUG_ALIGN
- depth += 2;
-#endif
- dandc_nd(A,x,B,y,wave);
- dandc_nd(A+x,M-x,B+y,N-y,wave);
-#ifdef DEBUG_ALIGN
- depth -= 2;
-#endif
- }
- else if (D == 1)
- { if (M > N)
- { *wave->Stop++ = (B-wave->Babs)+y+1;
-#ifdef DEBUG_SCRIPT
- printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1);
-#endif
- }
- else if (M < N)
- { *wave->Stop++ = (wave->Aabs-A)-x-1;
-#ifdef DEBUG_SCRIPT
- printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1);
-#endif
- }
-#ifdef DEBUG_SCRIPT
- else
- printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y);
-#endif
- }
-
- return (D);
-}
-
-
-static void Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework)
-{ _Work_Data *work = (_Work_Data *) ework;
- Trace_Waves wave;
-
- int L, D;
- int asub, bsub;
- Path *path;
- int *trace;
-
- path = align->path;
- asub = path->aepos-path->abpos;
- bsub = path->bepos-path->bbpos;
-
- if (asub < bsub)
- L = bsub;
- else
- L = asub;
- L *= sizeof(int);
- if (L > work->tramax)
- enlarge_trace(work,L);
-
- trace = wave.Stop = ((int *) work->trace);
-
- D = 2*(path->diffs + 4)*sizeof(int);
- if (D > work->vecmax)
- enlarge_vector(work,D);
-
- D = (path->diffs+3)/2;
- wave.VF = ((int *) work->vector) + (D+1);
- wave.VB = wave.VF + (2*D+1);
-
- wave.Aabs = align->aseq;
- wave.Babs = align->bseq;
-
- path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos,
- align->bseq+path->bbpos,path->bepos-path->bbpos,&wave);
- path->trace = trace;
- path->tlen = wave.Stop - trace;
-}
-
-
-/****************************************************************************************\
-* *
-* O(NP) tracing algorithms *
-* *
-\****************************************************************************************/
-
-/* Iterative O(np) algorithm for finding the alignment between two substrings (specified
- by a Path record). The variation includes handling substitutions and guarantees
- to find left-most alignments so that low complexity runs are always aligned in
- the same way.
-*/
-
-#ifdef DEBUG_ALIGN
-
-static int ToA[4] = { 'a', 'c', 'g', 't' };
-
-#endif
-
-static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave)
-{ int **PVF = wave->PVF;
- int **PHF = wave->PHF;
- int D;
- int del = M-N;
-
- { int *F0, *F1, *F2;
- int *HF;
- int low, hgh, pos;
-
-#ifdef DEBUG_ALIGN
- printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
- printf("%*s A = ",depth,"");
- for (D = 0; D < M; D++)
- printf("%c",ToA[(int) A[D]]);
- printf("\n");
- printf("%*s B = ",depth,"");
- for (D = 0; D < N; D++)
- printf("%c",ToA[(int) B[D]]);
- printf("\n");
-#endif
-
- if (del >= 0)
- { low = 0;
- hgh = del;
- }
- else
- { low = del;
- hgh = 0;
- }
- if (wave->Aabs == wave->Babs)
- pos = B-A;
- else
- pos = -INT32_MAX;
-
- F1 = PVF[-2];
- F0 = PVF[-1];
-
- for (D = low-1; D <= hgh+1; D++)
- F1[D] = F0[D] = -2;
- F0[0] = -1;
-
- low += 1;
- hgh -= 1;
-
- for (D = 0; 1; D += 1)
- { int k, i, j;
- int am, ac, ap;
- char *a;
-
- F2 = F1;
- F1 = F0;
- F0 = PVF[D];
- HF = PHF[D];
-
- if ((D & 0x1) == 0)
- { hgh += 1;
- low -= 1;
- if (low <= pos)
- low += 1;
- }
- F0[hgh+1] = F0[low-1] = -2;
-
-#define FS_MOVE(mdir,pdir) \
- ac = F1[k]+1; \
- if (ac < am) \
- if (ap < am) \
- { HF[k] = mdir; \
- j = am; \
- } \
- else \
- { HF[k] = pdir; \
- j = ap; \
- } \
- else \
- if (ap < ac) \
- { HF[k] = 0; \
- j = ac; \
- } \
- else \
- { HF[k] = pdir; \
- j = ap; \
- } \
- \
- if (N < i) \
- while (j < N && B[j] == a[j]) \
- j += 1; \
- else \
- while (j < i && B[j] == a[j]) \
- j += 1; \
- F0[k] = j;
-
- j = -2;
- a = A + hgh;
- i = M - hgh;
- for (k = hgh; k > del; k--)
- { ap = j+1;
- am = F2[k-1];
- FS_MOVE(-1,4)
- a -= 1;
- i += 1;
- }
-
- j = -2;
- a = A + low;
- i = M - low;
- for (k = low; k < del; k++)
- { ap = F2[k+1]+1;
- am = j;
- FS_MOVE(2,1)
- a += 1;
- i -= 1;
- }
-
- ap = F0[del+1]+1;
- am = j;
- FS_MOVE(2,4)
-
-#ifdef DEBUG_AWAVE
- print_awave(F0,low,hgh);
- print_awave(HF,low,hgh);
-#endif
-
- if (F0[del] >= N)
- break;
- }
- }
-
- { int k, h, m, e, c;
- char *a;
- int ap = (wave->Aabs-A)-1;
- int bp = (B-wave->Babs)+1;
-
- PHF[0][0] = 3;
-
- c = N;
- k = del;
- e = PHF[D][k];
- PHF[D][k] = 3;
- while (e != 3)
- { h = k+e;
- if (e > 1)
- h -= 3;
- else if (e == 0)
- D -= 1;
- else
- D -= 2;
- if (h < k) // => e = -1 or 2
- { a = A + k;
- if (k < 0)
- m = -k;
- else
- m = 0;
- if (PVF[D][h] <= c)
- c = PVF[D][h]-1;
- while (c >= m && a[c] == B[c])
- c -= 1;
- if (e < 1) // => edge is 2, others are 1, and 0
- { if (c <= PVF[D+2][k+1])
- { e = 4;
- h = k+1;
- D = D+2;
- }
- else if (c == PVF[D+1][k])
- { e = 0;
- h = k;
- D = D+1;
- }
- else
- PVF[D][h] = c+1;
- }
- else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
- { if (k == del)
- m = D;
- else
- m = D-2;
- if (c <= PVF[m][k+1])
- { if (k == del)
- e = 4;
- else
- e = 1;
- h = k+1;
- D = m;
- }
- else if (c == PVF[D-1][k])
- { e = 0;
- h = k;
- D = D-1;
- }
- else
- PVF[D][h] = c+1;
- }
- }
- m = PHF[D][h];
- PHF[D][h] = e;
- e = m;
-
- k = h;
- }
-
- k = D = 0;
- e = PHF[D][k];
- while (e != 3)
- { h = k-e;
- c = PVF[D][k];
- if (e > 1)
- h += 3;
- else if (e == 0)
- D += 1;
- else
- D += 2;
- if (h > k)
- *wave->Stop++ = bp+c;
- else if (h < k)
- *wave->Stop++ = ap-(c+k);
- k = h;
- e = PHF[D][h];
- }
-
-#ifdef DEBUG_SCRIPT
- k = D = 0;
- e = PHF[D][k];
- while (e != 3)
- { h = k-e;
- c = PVF[D][k];
- if (e > 1)
- h += 3;
- else if (e == 0)
- D += 1;
- else
- D += 2;
- if (h > k)
- printf("%*s D %d(%d)\n",depth,"",(c-k)-(ap-1),c+bp);
- else if (h < k)
- printf("%*s I %d(%d)\n",depth,"",c+(bp-1),(c+k)-ap);
- else
- printf("%*s %d S %d\n",depth,"",(c+k)-(ap+1),c+(bp-1));
- k = h;
- e = PHF[D][h];
- }
-#endif
- }
-
- return (D + abs(del));
-}
-
-static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave)
-{ int **PVF = wave->PVF;
- int **PHF = wave->PHF;
- int D;
- int del = M-N;
-
- { int *F0, *F1, *F2;
- int *HF;
- int low, hgh, pos;
-
-#ifdef DEBUG_ALIGN
- printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
- printf("%*s A = ",depth,"");
- for (D = 0; D < M; D++)
- printf("%c",ToA[(int) A[D]]);
- printf("\n");
- printf("%*s B = ",depth,"");
- for (D = 0; D < N; D++)
- printf("%c",ToA[(int) B[D]]);
- printf("\n");
-#endif
-
- if (del >= 0)
- { low = 0;
- hgh = del;
- }
- else
- { low = del;
- hgh = 0;
- }
- if (wave->Aabs == wave->Babs)
- pos = B-A;
- else
- pos = -INT32_MAX;
-
- F1 = PVF[-2];
- F0 = PVF[-1];
-
- for (D = low-1; D <= hgh+1; D++)
- F1[D] = F0[D] = -2;
- F0[0] = -1;
-
- low += 1;
- hgh -= 1;
-
- for (D = 0; 1; D += 1)
- { int k, i, j;
- int am, ac, ap;
- char *a;
-
- F2 = F1;
- F1 = F0;
- F0 = PVF[D];
- HF = PHF[D];
-
- if ((D & 0x1) == 0)
- { hgh += 1;
- low -= 1;
- if (low <= pos)
- low += 1;
- }
- F0[hgh+1] = F0[low-1] = -2;
-
- j = -2;
- a = A + hgh;
- i = M - hgh;
- for (k = hgh; k > del; k--)
- { ap = j+1;
- am = F2[k-1];
- FS_MOVE(-1,4)
- a -= 1;
- i += 1;
- }
-
- j = -2;
- a = A + low;
- i = M - low;
- for (k = low; k < del; k++)
- { ap = F2[k+1]+1;
- am = j;
- FS_MOVE(2,1)
- a += 1;
- i -= 1;
- }
-
- ap = F0[del+1]+1;
- am = j;
- FS_MOVE(2,4)
-
-#ifdef DEBUG_AWAVE
- print_awave(F0,low,hgh);
- print_awave(HF,low,hgh);
-#endif
-
- if (F0[del] >= N)
- break;
- }
- }
-
- { int k, h, m, e, c;
- int d, f;
- char *a;
-
- d = D + abs(del);
- c = N;
- k = del;
- for (f = d/2; d > f; d--)
- { e = PHF[D][k];
- h = k+e;
- if (e > 1)
- h -= 3;
- else if (e == 0)
- D -= 1;
- else
- D -= 2;
- if (h < k) // => e = -1 or 2
- { a = A + k;
- if (k < 0)
- m = -k;
- else
- m = 0;
- if (PVF[D][h] <= c)
- c = PVF[D][h]-1;
- while (c >= m && a[c] == B[c])
- c -= 1;
- if (e < 1) // => edge is 2, others are 1, and 0
- { if (c <= PVF[D+2][k+1])
- { e = 4;
- h = k+1;
- D = D+2;
- }
- else if (c == PVF[D+1][k])
- { e = 0;
- h = k;
- D = D+1;
- }
- else
- PVF[D][h] = c+1;
- }
- else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
- { if (k == del)
- m = D;
- else
- m = D-2;
- if (c <= PVF[m][k+1])
- { if (k == del)
- e = 4;
- else
- e = 1;
- h = k+1;
- D = m;
- }
- else if (c == PVF[D-1][k])
- { e = 0;
- h = k;
- D = D-1;
- }
- else
- PVF[D][h] = c+1;
- }
- }
- k = h;
- }
-
- wave->midb = (B-wave->Babs) + PVF[D][k];
- wave->mida = (A-wave->Aabs) + k + PVF[D][k];
- }
-
- return (1);
-}
-
-
-/****************************************************************************************\
-* *
-* COMPUTE_TRACE FLAVORS *
-* *
-\****************************************************************************************/
-
-void Compute_Trace_ALL(Alignment *align, Work_Data *ework)
-{ _Work_Data *work = (_Work_Data *) ework;
- Trace_Waves wave;
-
- Path *path;
- char *aseq, *bseq;
- int M, N;
-
- path = align->path;
- aseq = align->aseq;
- bseq = align->bseq;
-
- M = path->aepos-path->abpos;
- N = path->bepos-path->bbpos;
-
- { int64 s;
- int d;
- int dmax;
- int **PVF, **PHF;
-
- if (M < N)
- s = N;
- else
- s = M;
- s *= sizeof(int);
- if (s > work->tramax)
- enlarge_trace(work,s);
-
- dmax = path->diffs - abs(M-N);
-
- s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *));
-
- if (s > 256000000)
- { Compute_Trace_ND_ALL(align,ework);
- return;
- }
-
- if (s > work->vecmax)
- enlarge_vector(work,s);
-
- wave.PVF = PVF = ((int **) (work->vector)) + 2;
- wave.PHF = PHF = PVF + (dmax+3);
-
- s = M+N+3;
- PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1);
- for (d = -1; d <= dmax; d++)
- PVF[d] = PVF[d-1] + s;
- PHF[-2] = PVF[dmax] + s;
- for (d = -1; d <= dmax; d++)
- PHF[d] = PHF[d-1] + s;
- }
-
- wave.Stop = ((int *) work->trace);
- wave.Aabs = aseq;
- wave.Babs = bseq;
-
- path->diffs = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave);
- path->trace = work->trace;
- path->tlen = wave.Stop - ((int *) path->trace);
-}
-
-void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing)
-{ _Work_Data *work = (_Work_Data *) ework;
- Trace_Waves wave;
-
- Path *path;
- char *aseq, *bseq;
- uint16 *points;
- int tlen;
- int ab, bb;
- int ae, be;
- int diffs;
-
- path = align->path;
- aseq = align->aseq;
- bseq = align->bseq;
- tlen = path->tlen;
- points = (uint16 *) path->trace;
-
- { int64 s;
- int d;
- int M, N;
- int dmax, nmax;
- int **PVF, **PHF;
-
- M = path->aepos-path->abpos;
- N = path->bepos-path->bbpos;
- if (M < N)
- s = N*sizeof(int);
- else
- s = M*sizeof(int);
- if (s > work->tramax)
- enlarge_trace(work,s);
-
- nmax = 0;
-#ifdef DELTAS
- dmax = 0;
- for (d = 1; d < tlen; d += 2)
- { if (points[d-1] > dmax)
- dmax = points[d-1];
-#else
- for (d = 0; d < tlen; d++)
- {
-#endif
- if (points[d] > nmax)
- nmax = points[d];
- }
- if (tlen <= 1)
- nmax = N;
-#ifdef DELTAS
- if (points[d-1] > dmax)
- dmax = points[d-1];
-#else
- dmax = nmax;
-#endif
-
- s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *));
-
- if (s > work->vecmax)
- enlarge_vector(work,s);
-
- wave.PVF = PVF = ((int **) (work->vector)) + 2;
- wave.PHF = PHF = PVF + (dmax+3);
-
- s = trace_spacing+nmax+3;
- PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1);
- for (d = -1; d <= dmax; d++)
- PVF[d] = PVF[d-1] + s;
- PHF[-2] = PVF[dmax] + s;
- for (d = -1; d <= dmax; d++)
- PHF[d] = PHF[d-1] + s;
- }
-
- wave.Stop = (int *) (work->trace);
- wave.Aabs = aseq;
- wave.Babs = bseq;
-
- { int i;
-
- diffs = 0;
- ab = path->abpos;
- ae = (ab/trace_spacing)*trace_spacing;
- bb = path->bbpos;
-#ifdef DELTAS
- tlen -= 2;
- for (i = 1; i < tlen; i += 2)
-#else
- tlen -= 1;
- for (i = 0; i < tlen; i++)
-#endif
- { ae = ae + trace_spacing;
- be = bb + points[i];
- diffs += iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave);
- ab = ae;
- bb = be;
- }
- ae = path->aepos;
- be = path->bepos;
- diffs += iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave);
- }
-
- path->trace = work->trace;
- path->tlen = wave.Stop - ((int *) path->trace);
- path->diffs = diffs;
-}
-
-void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing)
-{ _Work_Data *work = (_Work_Data *) ework;
- Trace_Waves wave;
-
- Path *path;
- char *aseq, *bseq;
- uint16 *points;
- int tlen;
- int ab, bb;
- int ae, be;
- int diffs;
-
- path = align->path;
- aseq = align->aseq;
- bseq = align->bseq;
- tlen = path->tlen;
- points = (uint16 *) path->trace;
-
- { int64 s;
- int d;
- int M, N;
- int dmax, nmax;
- int **PVF, **PHF;
-
- M = path->aepos-path->abpos;
- N = path->bepos-path->bbpos;
- if (M < N)
- s = N*sizeof(int);
- else
- s = M*sizeof(int);
- if (s > work->tramax)
- enlarge_trace(work,s);
-
- nmax = 0;
-#ifdef DELTAS
- dmax = 0;
- for (d = 1; d < tlen; d += 2)
- { if (points[d-1] > dmax)
- dmax = points[d-1];
-#else
- for (d = 0; d < tlen; d++)
- {
-#endif
- if (points[d] > nmax)
- nmax = points[d];
- }
- if (tlen <= 1)
- nmax = N;
-#ifdef DELTAS
- if (points[d-1] > dmax)
- dmax = points[d-1];
-#else
- dmax = nmax;
-#endif
-
- s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *));
-
- if (s > work->vecmax)
- enlarge_vector(work,s);
-
- wave.PVF = PVF = ((int **) (work->vector)) + 2;
- wave.PHF = PHF = PVF + (dmax+3);
-
- s = trace_spacing+nmax+3;
- PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1);
- for (d = -1; d <= dmax; d++)
- PVF[d] = PVF[d-1] + s;
- PHF[-2] = PVF[dmax] + s;
- for (d = -1; d <= dmax; d++)
- PHF[d] = PHF[d-1] + s;
- }
-
- wave.Stop = ((int *) work->trace);
- wave.Aabs = aseq;
- wave.Babs = bseq;
-
- { int i;
- int as, bs;
- int af, bf;
-
- diffs = 0;
- ab = as = af = path->abpos;
- ae = (ab/trace_spacing)*trace_spacing;
- bb = bs = bf = path->bbpos;
-#ifdef DELTAS
- tlen -= 2;
- for (i = 1; i < tlen; i += 2)
-#else
- tlen -= 1;
- for (i = 0; i < tlen; i++)
-#endif
- { ae = ae + trace_spacing;
- be = bb + points[i];
- if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave))
- { af = wave.mida;
- bf = wave.midb;
- diffs += iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave);
- ab = ae;
- bb = be;
- as = af;
- bs = bf;
- }
- }
- ae = path->aepos;
- be = path->bepos;
- if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave))
- { af = wave.mida;
- bf = wave.midb;
- diffs += iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave);
- as = af;
- bs = bf;
- }
- diffs += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave);
- }
-
- path->trace = work->trace;
- path->tlen = wave.Stop - ((int *) path->trace);
- path->diffs = diffs;
-}
diff --git a/src/cpp/align.h b/src/cpp/align.h
deleted file mode 100644
index d0b2d5b..0000000
--- a/src/cpp/align.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/*******************************************************************************************
- *
- * Local alignment module. Routines for finding local alignments given a seed position,
- * representing such an l.a. with its interval and a set of pass-thru points, so that
- * a detailed alignment can be efficiently computed on demand.
- *
- * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C,
- * 2 for G, and 3 for T.
- *
- * Author: Gene Myers
- * Date : July 2013
- *
- ********************************************************************************************/
-
-#include "DB.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define TRACE_XOVR 125 // If the trace spacing is not more than this value, then can
- // and do compress traces pts to 8-bit unsigned ints
-
-#ifndef _A_MODULE
-
-#define _A_MODULE
-
-/*** PATH ABSTRACTION:
-
- Coordinates are *between* characters where 0 is the tick just before the first char,
- 1 is the tick between the first and second character, and so on. Our data structure
- is called a Path refering to its conceptualization in an edit graph.
-
- A local alignment is specified by the point '(abpos,bbpos)' at which its path in
- the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends.
- In otherwords A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is
- the *first* character of X).
-
- There are 'diffs' differences in an optimal local alignment between the beginning and
- end points of the alignment (if computed by Compute_Trace), or nearly so (if computed
- by Local_Alignment).
-
- Optionally, a Path can have additional information about the exact nature of the
- aligned substrings if the field 'trace' is not NULL. Trace points to either an
- array of integers (if computed by a Compute_Trace routine), or an array of unsigned
- short integers (if computed by Local_Alignment).
-
- If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short
- values:
-
- d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n
-
- to be interpreted as follows. The alignment from (abpos,bbpos) to (aepos,bepos)
- passes through the n trace points for i in [1,n]:
-
- (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS
- and b_i = bbpos + (b_0 + b_1 + b_i-1)
-
- where also let a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos. That is, the
- interior (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of
- the aread where TS is the "trace spacing" employed when finding the alignment (see
- New_Align_Spec). Typically TS is 100. Then d_i is the number of differences in the
- portion of the alignment between (a_i,b_i) and (a_i+1,b_i+1). These trace points allow
- the Compute_Trace routines to efficiently compute the exact alignment between the two
- reads by efficiently computing exact alignments between consecutive pairs of trace points.
- Moreover, the diff values give one an idea of the quality of the alignment along every
- segment of TS symbols of the aread.
-
- If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers
- < i1, i2, ... in > that encodes an exact alignment as follows. A negative number j
- indicates that a dash should be placed before A[-j] and a positive number k indicates
- that a dash should be placed before B[k], where A and B are the two sequences of the
- overlap. The indels occur in the trace in the order in which they occur along the
- alignment. For a good example of how to "decode" a trace into an alignment, see the
- code for the routine Print_Alignment.
-
-***/
-
-typedef struct
- { void *trace;
- int tlen;
- int diffs;
- int abpos, bbpos;
- int aepos, bepos;
- } Path;
-
-
-/*** ALIGNMENT ABSTRACTION:
-
- An alignment is modeled by an Alignment record, which in addition to a *pointer* to a
- 'path', gives pointers to the A and B sequences, their lengths, and indicates whether
- the B-sequence needs to be complemented ('comp' non-zero if so). The 'trace' pointer
- of the 'path' subrecord can be either NULL, a list of pass-through points, or an exact
- trace depending on what routines have been called on the record.
-
- One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' = NULL,
- or using the sequence of pass-through points in trace, (2) print an ASCII representation
- of an alignment, or (3) reverse the roles of A and B, and (4) complement a sequence
- (which is a reversible process).
-
- If the alignment record shows the B sequence as complemented, *** THEN IT IS THE
- RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of
- the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements
- the sequence a of length n. The operation does the complementation/reversal in place.
- Calling it a second time on a given fragment restores it to its original state.
-***/
-
-#define COMP(x) ((x) & 0x1)
-
-#define COMP_FLAG 0x1
-
-typedef struct
- { Path *path;
- uint32 flags; /* Pipeline status and complementation flags */
- char *aseq; /* Pointer to A sequence */
- char *bseq; /* Pointer to B sequence */
- int alen; /* Length of A sequence */
- int blen; /* Length of B sequence */
- } Alignment;
-
-void Complement_Seq(char *a, int n);
-
- /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working
- storage that is more efficiently reused with each call, rather than being allocated anew
- with each call. Each *thread* can create a Work_Data object with New_Work_Data and this
- object holds and retains the working storage for routines of this module between calls
- to the routines. Free_Work_Data frees a Work_Data object and all working storage
- held by it.
- */
-
- typedef void Work_Data;
-
- Work_Data *New_Work_Data();
-
- void Free_Work_Data(Work_Data *work);
-
- /* Local_Alignment seeks local alignments of a quality determined by a number of parameters.
- These are coded in an Align_Spec object that can be created with New_Align_Spec and
- freed with Free_Align_Spec when no longer needed. There are 4 essential parameters:
-
- ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio
- data we set this to .70 assuming an average of 15% error in each read.
- trace_space: the spacing interval for keeping trace points and segment differences (see
- description of 'trace' for Paths above)
- freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C),
- freq[2] = f(G), and freq[3] = f(T). This vector is part of the header
- of every HITS database (see db.h).
-
- If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e.
- overlap), then the last/first 30 columns of the alignment are guaranteed to be
- suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically
- measured function that increases from 1 as the entropy of freq decreases.
-
- You can get back the original parameters used to create an Align_Spec with the simple
- utility functions below.
- */
-
- typedef void Align_Spec;
-
- Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq);
-
- void Free_Align_Spec(Align_Spec *spec);
-
- int Trace_Spacing (Align_Spec *spec);
- double Average_Correlation(Align_Spec *spec);
- float *Base_Frequencies (Align_Spec *spec);
-
- /* Local_Alignment finds the longest significant local alignment between the sequences in
- 'align' subject to the alignment criterion given by the Align_Spec 'spec' that passes
- through one of the points '(xlow-xhgh,y)' within the underlying dynamic programming matrix.
- The path record of 'align' has its 'trace' filled from the point of view of an overlap between
- the aread and the bread. In addition a Path record from the point of view of the bread
- versus the aread is returned by the function, with this Path's 'trace' filled in
- appropriately. The space for the returned path and the two 'trace's are in the
- working storage supplied by the Work_Data packet and this space is reused with each call,
- so if one wants to retain the bread-path and the two trace point sequences, then they
- must be copied to user-allocated storage before calling the routine again.
- */
-
- Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec,
- int xlow, int xhgh, int y);
-
- /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment.
- If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points
- and diff levels computed by Local_Alignment. In either case 'path.trace' is set
- to point at an integer array within the storage of the Work_Data packet encoding an
- exact optimal trace from the start to end points. If the trace is needed beyond the
- next call to a routine that sets it, then it should be copied to an array allocated
- and managed by the caller.
-
- Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the
- best alignment between (path->abpos,path->bbpos) and (path->aepos,path->bepos) in the
- edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the
- trace between successive pass through points. It is much, much faster than Compute_Trace_ALL
- but at the tradeoff of not necessarily being optimal as pass-through points are not all
- perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points
- of alignments between two adjacent pairs of pass through points. It is generally twice as
- slow as Compute_Trace_PTS, but it produces nearer optimal alignments.
- */
-
- void Compute_Trace_ALL(Alignment *align, Work_Data *work);
- void Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing);
- void Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing);
-
- /* Alignment_Cartoon prints an ASCII representation of the overlap relationhip between the
- two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls
- the display width of numbers, it must be not less than the width of any number to be
- displayed.
-
- If the alignment trace is an exact trace, then one can ask Print_Alignment to print an
- ASCII representation of the alignment 'align' to the file 'file'. Indent the display
- by "indent" spaces and put "width" columns per line in the display. Show "border"
- characters of sequence on each side of the aligned region. If upper is non-zero then
- display bases in upper case. If coord is greater than 0, then the positions of the
- first character in A and B in the given row is displayed with a field width given by
- coord's value.
-
- Print_Reference is like Print_Alignment but rather than printing exaclty "width" columns
- per segment, it prints "block" characters of the A sequence in each segment. This results
- in segments of different lengths, but is convenient when looking at two alignments involving
- A as segments are guaranteed to cover the same interval of A in a segment.
-
- Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then
- the trace is ignored, otherwise the trace must be to a full alignment trace and this trace
- is also appropriately inverted.
- */
-
- void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord);
-
- void Print_Alignment(FILE *file, Alignment *align, Work_Data *work,
- int indent, int width, int border, int upper, int coord);
-
- void Print_Reference(FILE *file, Alignment *align, Work_Data *work,
- int indent, int block, int border, int upper, int coord);
-
- void Flip_Alignment(Alignment *align, int full);
-
-
-/*** OVERLAP ABSTRACTION:
-
- Externally, between modules an Alignment is modeled by an "Overlap" record, which
- (a) replaces the pointers to the two sequences with their ID's in the HITS data bases,
- (b) does not contain the length of the 2 sequences (must fetch from DB), and
- (c) contains its path as a subrecord rather than as a pointer (indeed, typically the
- corresponding Alignment record points at the Overlap's path sub-record). The trace pointer
- is always to a sequence of trace points and can be either compressed (uint8) or
- uncompressed (uint16). One can read and write binary records of an "Overlap".
-***/
-
-typedef struct {
- Path path; /* Path: begin- and end-point of alignment + diffs */
- uint32 flags; /* Pipeline status and complementation flags */
- int aread; /* Id # of A sequence */
- int bread; /* Id # of B sequence */
-} Overlap;
-
-
- /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace
- (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace
- into the memory pointed at by the trace field of 'ovl'. It is assumed to be big enough to
- accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16).
-
- Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that
- occupies 'tbytes' bytes per value.
-
- Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output'
- where the trace occupes 'tbytes' per value and the print out is indented from the left
- margin by 'indent' spaces.
-
- Compress_TraceTo8 converts a trace fo 16-bit values to 8-bit values in place, and
- Decompress_TraceTo16 does the reverse conversion.
-
- Check_Trace_Points checks that the number of trace points is correct and that the sum
- of the b-read displacements equals the b-read alignment interval, assuming the trace
- spacing is 'tspace'. It reports an error message if there is a problem and 'verbose'
- is non-zero. The 'ovl' came from the file names 'fname'.
- */
-
- int Read_Overlap(FILE *input, Overlap *ovl);
- int Read_Trace(FILE *innput, Overlap *ovl, int tbytes);
-
- void Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
- void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent);
-
- void Compress_TraceTo8(Overlap *ovl);
- void Decompress_TraceTo16(Overlap *ovl);
-
- int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // _A_MODULE
diff --git a/src/cpp/boost.mk b/src/cpp/boost.mk
deleted file mode 100644
index aebea4d..0000000
--- a/src/cpp/boost.mk
+++ /dev/null
@@ -1,18 +0,0 @@
-# A URI location for a copy of boost headers only
-URI := https://www.dropbox.com/s/g22iayi83p5gbbq/boost_1_58_0-headersonly.tbz2\?dl\=0
-# Obtain the boost URI and extract it into PWD
-GET_BOOST := curl -L $(URI) | tar xjf -
-
-MYPATH__ := $(dir $(CURDIR)/$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))
-
-# Three ways to boost: 1) Internal PBI repot, 2) URI, 3) User specified.
-
-ifdef boost
-BOOST_HEADERS := $(boost)
-else
-BOOST_HEADERS := $(MYPATH__)third-party/boost_1_58_0-headersonly
-endif
-
-ifneq ($(wildcard $(PREBUILT)/boost/boost_1_58_0-headersonly/*),)
-BOOST_HEADERS := $(PREBUILT)/boost/boost_1_58_0-headersonly
-endif
diff --git a/src/cpp/dazcon.cpp b/src/cpp/dazcon.cpp
index 2f70ba6..345b2ee 100644
--- a/src/cpp/dazcon.cpp
+++ b/src/cpp/dazcon.cpp
@@ -1,39 +1,3 @@
-
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
#include <cstdint>
#include <iostream>
#include <sstream>
@@ -57,7 +21,7 @@
#include "DazAlnProvider.hpp"
#include "BoundedBuffer.hpp"
-INITIALIZE_EASYLOGGINGPP
+INITIALIZE_NULL_EASYLOGGINGPP
ProgramOpts popts;
@@ -82,7 +46,7 @@ void Reader(TrgBuf& trgBuf, AlnProvider* ap) {
if (! td.alns.empty())
trgBuf.push(td);
} while (hasNext);
- }
+ }
catch (PacBio::DagCon::IOException& e) {
std::cerr << e.what();
exit(1);
@@ -95,6 +59,7 @@ void Reader(TrgBuf& trgBuf, AlnProvider* ap) {
}
void Consensus(int id, TrgBuf& trgBuf, CnsBuf& cnsBuf) {
+ int fake_well_counter = 0; // starts at 0; just to avoid too many reads in the same bin
TargetData td;
trgBuf.pop(&td);
std::vector<CnsResult> seqs;
@@ -109,11 +74,11 @@ void Consensus(int id, TrgBuf& trgBuf, CnsBuf& cnsBuf) {
CLOG(INFO, "Consensus") << msg % id % td.alns[0].id % td.alns.size();
AlnGraphBoost ag(td.targSeq);
- AlnVec alns = td.alns;
+ AlnVec alns = td.alns;
for (auto it = alns.begin(); it != alns.end(); ++it) {
if (it->qstr.length() < popts.minLen) continue;
dagcon::Alignment aln = normalizeGaps(*it);
- // XXX: Shouldn't be needed for dazcon, but causes some infinite
+ // XXX: Shouldn't be needed for dazcon, but causes some infinite
// loops in the current consensus code.
trimAln(aln, popts.trim);
ag.addAln(aln);
@@ -124,10 +89,14 @@ void Consensus(int id, TrgBuf& trgBuf, CnsBuf& cnsBuf) {
ag.consensus(seqs, popts.minCov, popts.minLen);
for (auto it = seqs.begin(); it != seqs.end(); ++it) {
CnsResult result = *it;
- boost::format fasta(">%s/%d_%d\n%s\n");
- fasta % alns[0].id % result.range[0] % result.range[1];
- fasta % result.seq;
- cnsBuf.push(fasta.str());
+ boost::format fasta(">%s/%d/%d_%d\n%s\n");
+ fasta % alns[0].id
+ % fake_well_counter
+ % result.range[0]
+ % result.range[1]
+ % result.seq;
+ cnsBuf.push(fasta.str());
+ ++fake_well_counter;
}
trgBuf.pop(&td);
}
@@ -143,7 +112,7 @@ void Writer(CnsBuf& cnsBuf) {
int sentinelCount = 0;
while (true) {
std::cout << cns;
- if (cns == "" && ++sentinelCount == popts.threads)
+ if (cns == "" && ++sentinelCount == popts.threads)
break;
cnsBuf.pop(&cns);
@@ -200,11 +169,11 @@ void parseArgs(int argc, char **argv) {
"Turns on verbose logging", cmd, false);
TCLAP::UnlabeledMultiArg<int> targetArgs(
- "targets", "Limit consensus to list of target ids",
+ "targets", "Limit consensus to list of target ids",
false, "list of ints", cmd);
cmd.parse(argc, argv);
-
+
popts.minCov = minCovArg.getValue();
popts.minLen = minLenArg.getValue();
popts.trim = trimArg.getValue();
@@ -223,9 +192,18 @@ void parseArgs(int argc, char **argv) {
}
int main(int argc, char* argv[]) {
-
- START_EASYLOGGINGPP(argc, argv);
parseArgs(argc, argv);
+#if ELPP_ASYNC_LOGGING
+ el::base::elStorage.reset(
+ new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder()),
+ new el::base::AsyncDispatchWorker())
+ );
+#else
+ el::base::elStorage.reset(
+ new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder()))
+ );
+#endif // ELPP_ASYNC_LOGGING
+ START_EASYLOGGINGPP(argc, argv);
LOG(INFO) << "Initializing alignment provider";
DazAlnProvider* ap;
@@ -240,7 +218,7 @@ int main(int argc, char* argv[]) {
std::thread ct(Consensus, i, std::ref(trgBuf), std::ref(cnsBuf));
cnsThreads.push_back(std::move(ct));
}
-
+
std::thread readerThread(Reader, std::ref(trgBuf), ap);
writerThread.join();
diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp
index d64de00..8a8c34c 100644
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@@ -1,39 +1,3 @@
-// Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-//
-// * Neither the name of Pacific Biosciences nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-
#include <cstdint>
#include <cassert>
#include <iostream>
@@ -60,7 +24,7 @@
#include "SimpleAligner.hpp"
#include "ProgramOpts.hpp"
-INITIALIZE_EASYLOGGINGPP
+INITIALIZE_NULL_EASYLOGGINGPP
ProgramOpts popts;
@@ -76,8 +40,8 @@ class Reader {
size_t minCov_;
int nCnsThreads_;
public:
- Reader(AlnBuf* b, const std::string fpath, size_t minCov) :
- alnBuf_(b),
+ Reader(AlnBuf* b, const std::string fpath, size_t minCov) :
+ alnBuf_(b),
fpath_(fpath),
minCov_(minCov)
{ }
@@ -90,7 +54,7 @@ public:
el::Logger* logger = el::Loggers::getLogger("Reader");
try {
AlnProvider* ap;
- if (fpath_ == "-") {
+ if (fpath_ == "-") {
ap = new BlasrM5AlnProvider(&std::cin);
} else {
ap = new BlasrM5AlnProvider(fpath_);
@@ -102,7 +66,7 @@ public:
size_t cov = alns.size();
if (cov == 0) continue;
if (cov < minCov_) {
- logger->debug("Coverage requirement not met for %v, coverage: %v",
+ logger->debug("Coverage requirement not met for %v, coverage: %v",
alns[0].id, alns.size());
continue;
}
@@ -111,12 +75,12 @@ public:
logger->debug(msg.str());
alnBuf_->push(alns);
}
- }
+ }
catch (M5Exception::FileOpenError) {
logger->error("Error opening file: %s", fpath_);
}
catch (M5Exception::FormatError err) {
- logger->error("Format error. Input: %s, Error: %s",
+ logger->error("Format error. Input: %s, Error: %s",
fpath_, err.msg);
}
catch (M5Exception::SortError err) {
@@ -137,8 +101,8 @@ class Consensus {
int minWeight_;
SimpleAligner aligner;
public:
- Consensus(AlnBuf* ab, CnsBuf* cb, size_t minLen, int minWeight) :
- alnBuf_(ab),
+ Consensus(AlnBuf* ab, CnsBuf* cb, size_t minLen, int minWeight) :
+ alnBuf_(ab),
cnsBuf_(cb),
minLen_(minLen),
minWeight_(minWeight)
@@ -160,8 +124,8 @@ public:
msg % alns.size();
logger->info(msg.str());
- if (AlignFirst)
- for_each(alns.begin(), alns.end(), aligner);
+ if (AlignFirst)
+ for_each(alns.begin(), alns.end(), aligner);
AlnGraphBoost ag(alns[0].tlen);
for (auto it = alns.begin(); it != alns.end(); ++it) {
@@ -177,7 +141,7 @@ public:
boost::format fasta(">%s/%d_%d\n%s\n");
fasta % alns[0].id % result.range[0] % result.range[1];
fasta % result.seq;
- cnsBuf_->push(fasta.str());
+ cnsBuf_->push(fasta.str());
}
alnBuf_->pop(&alns);
@@ -192,7 +156,7 @@ class Writer {
int nCnsThreads_;
public:
Writer(CnsBuf* cb) : cnsBuf_(cb) {}
-
+
void setNumCnsThreads(int n) {
nCnsThreads_ = n;
}
@@ -203,7 +167,7 @@ public:
int sentinelCount = 0;
while (true) {
std::cout << cns;
- if (cns == "" && ++sentinelCount == nCnsThreads_)
+ if (cns == "" && ++sentinelCount == nCnsThreads_)
break;
cnsBuf_->pop(&cns);
@@ -243,11 +207,11 @@ void parseArgs(int argc, char **argv) {
"Turns on verbose logging", cmd, false);
TCLAP::UnlabeledValueArg<std::string> inputArg(
- "input", "Input data",
+ "input", "Input data",
true, "-","either file path or stdin", cmd);
cmd.parse(argc, argv);
-
+
popts.minCov = minCovArg.getValue();
popts.minLen = minLenArg.getValue();
popts.trim = trimArg.getValue();
@@ -261,8 +225,18 @@ void parseArgs(int argc, char **argv) {
}
int main(int argc, char* argv[]) {
- START_EASYLOGGINGPP(argc, argv);
parseArgs(argc, argv);
+#if ELPP_ASYNC_LOGGING
+ el::base::elStorage.reset(
+ new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder()),
+ new el::base::AsyncDispatchWorker())
+ );
+#else
+ el::base::elStorage.reset(
+ new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder()))
+ );
+#endif // ELPP_ASYNC_LOGGING
+ START_EASYLOGGINGPP(argc, argv);
el::Logger* logger = el::Loggers::getLogger("default");
@@ -275,9 +249,9 @@ int main(int argc, char* argv[]) {
CnsBuf cnsBuf(30);
if (popts.threads > 1) {
- logger->info("Multi-threaded. Input: %v, Threads: %v",
+ logger->info("Multi-threaded. Input: %v, Threads: %v",
popts.input, popts.threads);
-
+
Writer writer(&cnsBuf);
writer.setNumCnsThreads(popts.threads);
std::thread writerThread(writer);
@@ -296,7 +270,7 @@ int main(int argc, char* argv[]) {
std::vector<std::thread>::iterator it;
for (it = cnsThreads.begin(); it != cnsThreads.end(); ++it)
it->join();
-
+
readerThread.join();
} else {
logger->info("Single-threaded. Input: %v", popts.input);
@@ -309,6 +283,6 @@ int main(int argc, char* argv[]) {
cns();
writer();
}
-
+
return 0;
}
diff --git a/src/cpp/makefile b/src/cpp/makefile
new file mode 100644
index 0000000..1fbfbb1
--- /dev/null
+++ b/src/cpp/makefile
@@ -0,0 +1,76 @@
+all:
+
+THISDIR:=$(dir $(lastword ${MAKEFILE_LIST}))
+-include ${CURDIR}/../../defines.mk
+
+COMMON_OBJECTS := Alignment.o AlnGraphBoost.o
+PBDAGCON_OBJECTS := BlasrM5AlnProvider.o main.o SimpleAligner.o
+DAZCON_OBJECTS := DB.o QV.o align.o DazAlnProvider.o dazcon.o
+
+CPPFLAGS += -MMD -MP
+CXXFLAGS = -O3 -std=c++11 -Wall -Wuninitialized -pedantic
+CFLAGS = -O3 -Wall -Wextra -fno-strict-aliasing
+
+INCDIRS := \
+ ${DAZZ_DB_INCLUDE} \
+ ${DALIGNER_INCLUDE} \
+ ${LIBBLASR_INCLUDE} \
+ ${LIBPBDATA_INCLUDE} \
+ ${LIBPBIHDF_INCLUDE} \
+ ${PBBAM_INCLUDE} \
+ ${HDF5_INCLUDE} \
+ ${HTSLIB_INCLUDE}
+SYS_INCDIRS := \
+ ${BOOST_INCLUDE} \
+ ${THISDIR}/third-party
+LIBDIRS := \
+ ${LIBBLASR_LIB} \
+ ${LIBPBDATA_LIB} \
+ ${LIBPBIHDF_LIB} \
+ ${PBBAM_LIB} \
+ ${HDF5_LIB} \
+ ${HTSLIB_LIB} \
+ ${GCC_LIB} \
+ ${ZLIB_LIB}
+LDLIBS+= \
+ ${LIBBLASR_LIBFLAGS} \
+ ${LIBPBDATA_LIBFLAGS} \
+ ${LIBPBIHDF_LIBFLAGS} \
+ ${PBBAM_LIBFLAGS} \
+ ${HDF5_LIBFLAGS} \
+ ${HTSLIB_LIBFLAGS} \
+ ${ZLIB_LIBFLAGS} \
+ ${PTHREAD_LIBFLAGS} \
+ ${DL_LIBFLAGS}
+CPPFLAGS+=$(patsubst %,-I%,${INCDIRS}) $(patsubst %,-isystem%,${SYS_INCDIRS})
+LDFLAGS+=$(patsubst %,-L %,${LIBDIRS})
+LDFLAGS += ${EXTRA_LDFLAGS}
+
+vpath align.c ${DALIGNER_SRC}
+vpath DB.c ${DAZZ_DB_SRC}
+vpath QV.c ${DAZZ_DB_SRC}
+
+all: pbdagcon dazcon
+
+# Technically does not need pbdata or blasr, but so what?
+dazcon: $(COMMON_OBJECTS) $(DAZCON_OBJECTS)
+ $(CXX) -o $@ $^ $(LDFLAGS) $(LDLIBS)
+
+pbdagcon: $(COMMON_OBJECTS) $(PBDAGCON_OBJECTS)
+ $(CXX) -o $@ $^ $(LDFLAGS) $(LDLIBS)
+
+$(COMMON_OBJECTS) $(PBDAGCON_OBJECTS):
+
+clean:
+ $(RM) *.d
+ $(RM) *.o
+ $(RM) pbdagcon
+ $(RM) dazcon
+
+.PHONY: all clean
+# vpath-built objects (DB.o, QV.o, align.o) may have no .c in THISDIR: add their .d files; sort de-duplicates.
+SRCS:= $(notdir $(wildcard ${THISDIR}/*.c))
+CPP_SRCS:=$(notdir $(wildcard ${THISDIR}/*.cpp))
+DEPS:=$(patsubst %.c,%.d,${SRCS}) $(patsubst %.o,%.d,${DAZCON_OBJECTS})
+CPP_DEPS:=$(patsubst %.cpp,%.d,${CPP_SRCS})
+-include $(sort ${DEPS} ${CPP_DEPS})
diff --git a/src/cpp/pbdagcon_wf.sh b/src/cpp/pbdagcon_wf.sh
index 76912da..97e8fbc 100755
--- a/src/cpp/pbdagcon_wf.sh
+++ b/src/cpp/pbdagcon_wf.sh
@@ -1,43 +1,4 @@
#!/bin/bash
-
-#################################################################################$$
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-
# Simple pbdagcon workflow script. Written for the benefit of running via
# smrtpipe so I can communicate pipe errors to the task. We're overcoming
# the limitation of smrtpipe forcing tasks to run serially, enabling a new
diff --git a/src/cpp/pbi.mk b/src/cpp/pbi.mk
deleted file mode 100644
index db3a232..0000000
--- a/src/cpp/pbi.mk
+++ /dev/null
@@ -1,22 +0,0 @@
-# Darwin/Clang is unhappy with -L for a non-existent directory, so we
-# cannot use this file to build on OSX.
-# Instead of a bunch of ifdefs, we can conditionally include this file.
-
-# Defines some variables specific to the PBI build env using relative paths
-BASEDIR ?= ../../../../../..
-PREBUILT := $(BASEDIR)/smrtanalysis/prebuilt.out
-BIFX := $(BASEDIR)/smrtanalysis/bioinformatics
-BLASR ?= $(BIFX)/lib/cpp/alignment
-PBDATA ?= $(BIFX)/lib/cpp/pbdata
-PBBAM := $(BIFX)/staging/PostPrimary/pbbam
-ZLIB := $(PREBUILT)/zlib/zlib-1.2.5/$(OS_STRING2)/lib
-
-ifneq ($(wildcard $(PBBAM)/*),)
-EXTRA_INCDIRS := -I$(PBBAM)/include -I$(PBBAM)/third-party/htslib
-EXTRA_LDFLAGS := -L$(PBBAM)/lib -L$(PBBAM)/third-party/htslib -L$(ZLIB)
-
-# We are moving to BAM, which requires extra lib support when compiling against
-# libblasr. This conditional allows backward compatable compilations with
-# PacificBiosciences/blasr_libcpp.
-EXTRA_LDLIBS = -lpbbam -lhts -lz -lpthread
-endif
diff --git a/src/q-sense.py b/src/q-sense.py
index 72e51f1..7ca68c7 100755
--- a/src/q-sense.py
+++ b/src/q-sense.py
@@ -1,43 +1,4 @@
#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-
import sys
import os
import logging
diff --git a/test/cpp/.gitignore b/test/cpp/.gitignore
new file mode 100644
index 0000000..a5f12e7
--- /dev/null
+++ b/test/cpp/.gitignore
@@ -0,0 +1,2 @@
+/xml/
+/test-*
diff --git a/test/cpp/AlignmentTest.cpp b/test/cpp/AlignmentTest.cpp
index 6cd3bc8..8f4883f 100644
--- a/test/cpp/AlignmentTest.cpp
+++ b/test/cpp/AlignmentTest.cpp
@@ -1,3 +1,4 @@
+#include <cstdlib>
#include <iostream>
#include <fstream>
#include <string>
@@ -7,6 +8,14 @@
using namespace dagcon;
+// Returns the directory that holds the test fixtures, always terminated by
+// '/' so that callers can append a bare file name (dataDir() + "basic.m5").
+// The directory is read from the PBDAGCON_TEST_DATA_DIR environment variable;
+// when that is unset or empty the current directory is used.
+std::string dataDir() {
+    char const* val = getenv("PBDAGCON_TEST_DATA_DIR");
+    std::string dir = (val && *val) ? val : ".";
+    // dir is never empty here, so back() is safe. Without the separator the
+    // fallback would produce ".basic.m5" instead of "./basic.m5".
+    if (dir.back() != '/') {
+        dir += '/';
+    }
+    return dir;
+}
+
TEST(Alignment, Normalize) {
Alignment a, b;
a.start = 1;
@@ -24,10 +33,10 @@ TEST(Alignment, Normalize) {
b = normalizeGaps(b);
EXPECT_EQ("CCG--T", b.qstr);
EXPECT_EQ("CCGACT", b.tstr);
-
+
// another gap reference push
- a.tstr = "ATATTA---GGC";
- a.qstr = "ATAT-AGCCGGC";
+ a.tstr = "ATATTA---GGC";
+ a.qstr = "ATAT-AGCCGGC";
b = a;
b = normalizeGaps(b);
@@ -38,7 +47,9 @@ TEST(Alignment, Normalize) {
}
TEST(Alignment, ParseBasic) {
- std::ifstream file("basic.m5");
+ std::string fn = dataDir() + "basic.m5";
+ std::ifstream file(fn.c_str());
+ ASSERT_TRUE(file.good());
Alignment aln;
file >> aln;
EXPECT_EQ(1, aln.start);
@@ -52,7 +63,9 @@ TEST(Alignment, ParseBasic) {
}
TEST(Alignment, ParseQuery) {
- std::ifstream file("parsequery.m5");
+ std::string fn = dataDir() + "parsequery.m5";
+ std::ifstream file(fn.c_str());
+ ASSERT_TRUE(file.good());
Alignment aln;
Alignment::groupByTarget = false;
file >> aln;
@@ -64,15 +77,67 @@ TEST(Alignment, ParseQuery) {
EXPECT_EQ("CTGCA--CT", aln.qstr.substr(0,9));
}
-TEST(AlnGraphBoostTest, Trim) {
+TEST(Alignment, Trim) {
+ std::string const t = "ACG-TCA-GCA";
+ std::string const q = "AC-C-C-T---";
+ {
+ dagcon::Alignment aln;
+ aln.tstr = t;
+ aln.qstr = q;
+ aln.start = 1;
+ aln.strand = '-';
+
+ trimAln(aln, 0);
+ EXPECT_EQ(1, aln.start);
+ EXPECT_EQ(t, aln.tstr);
+ EXPECT_EQ(q, aln.qstr);
+ }
+ {
+ dagcon::Alignment aln;
+ aln.tstr = t;
+ aln.qstr = q;
+ aln.start = 1;
+ aln.strand = '-';
+
+ trimAln(aln, 3);
+ EXPECT_EQ(4, aln.start);
+ EXPECT_EQ("-TCA-", aln.tstr);
+ EXPECT_EQ("C-C-T", aln.qstr);
+ }
+ {
+ dagcon::Alignment aln;
+ aln.tstr = t;
+ aln.qstr = q;
+ aln.start = 1;
+ aln.strand = '-';
+
+ trimAln(aln, 4);
+ EXPECT_EQ(5, aln.start);
+ EXPECT_EQ("C", aln.tstr);
+ EXPECT_EQ("C", aln.qstr);
+ }
+ {
+ dagcon::Alignment aln;
+ aln.tstr = t;
+ aln.qstr = q;
+ aln.start = 1;
+ aln.strand = '-';
+
+ trimAln(aln, 5);
+ EXPECT_EQ(6, aln.start);
+ EXPECT_EQ("", aln.tstr);
+ EXPECT_EQ("", aln.qstr);
+ }
+ {
dagcon::Alignment aln;
- aln.tstr = "GATGAAGCCGGGG---TTACAGGCATGGATGTGGATAACGTCGAGTA-C-AGTC-GTA----TTC--C-TGCAGGGGTTGACG-TTTT-CCACCATCGCACGCCGGGACCATCACCGT-TAT-GAAGATTCAGCA-CCGGGG-GCGGCTGAATGATTTTCTCCTGATGGCCATTGACGGAGGATGATTGCCCGGCCGGAC";
- aln.qstr = "GATGAAGC-GGGGGCGTTACAG-CATGGATGT------CG---AGTAACG-GTCAGTACCAGTTCATCCTGCAGG--TTGACGGTTTTTCCACCATCGCACGCCGGGACCATCACCGTGTATCGAAGATTCA-CAAC-GGGGC-CG-CTGAATGATTTTCTC-TG-TG-CCAT-GACGGAGGATGAT-GCCCG-CCGGA-";
+ aln.tstr = t;
+ aln.qstr = q;
aln.start = 1;
aln.strand = '-';
- trimAln(aln, 14);
- EXPECT_EQ("TACAGGCATGGATGTGGATAACGTCGAGTA-C-AGTC-GTA----TTC--C-TGCAGGGGTTGACG-TTTT-CCACCATCGCACGCCGGGACCATCACCGT-TAT-GAAGATTCAGCA-CCGGGG-GCGGCTGAATGATTTTCTCCTGATGGCCATTGACGGAGGATGA", aln.tstr);
- EXPECT_EQ("TACAG-CATGGATGT------CG---AGTAACG-GTCAGTACCAGTTCATCCTGCAGG--TTGACGGTTTTTCCACCATCGCACGCCGGGACCATCACCGTGTATCGAAGATTCA-CAAC-GGGGC-CG-CTGAATGATTTTCTC-TG-TG-CCAT-GACGGAGGATGA", aln.qstr);
- EXPECT_EQ(15, aln.start);
+ trimAln(aln, 500);
+ // EXPECT_EQ(1 + 9, aln.start); // start could be anything, really
+ EXPECT_EQ("", aln.tstr);
+ EXPECT_EQ("", aln.qstr);
+ }
}
diff --git a/test/cpp/AlnGraphBoostTest.cpp b/test/cpp/AlnGraphBoostTest.cpp
index 8e2fee8..7704ce9 100644
--- a/test/cpp/AlnGraphBoostTest.cpp
+++ b/test/cpp/AlnGraphBoostTest.cpp
@@ -1,6 +1,7 @@
#include <string>
#include <sstream>
#include <map>
+#include <memory>
#include <vector>
#include <gtest/gtest.h>
#include <boost/graph/adjacency_list.hpp>
@@ -9,9 +10,9 @@
TEST(AlnGraphBoostTest, RawConsensus) {
std::string backbone = "ATATTAGGC";
- AlnGraphBoost ag(backbone);
- dagcon::Alignment *algs = new dagcon::Alignment[5];
-
+ AlnGraphBoost ag(backbone);
+ std::unique_ptr<dagcon::Alignment[]> algs(new dagcon::Alignment[5]);
+
algs[0].tstr = "ATATTA---GGC";
algs[0].qstr = "ATAT-AGCCGGC";
@@ -46,7 +47,7 @@ TEST(AlnGraphBoostTest, RawConsensus) {
}
TEST(AlnGraphBoostTest, DanglingNodes) {
- AlnGraphBoost ag(12);
+ AlnGraphBoost ag(12);
dagcon::Alignment a;
a.tstr = "C-GCGGA-T-G-";
a.qstr = "CCGCGG-G-A-T";
diff --git a/test/cpp/Makefile b/test/cpp/Makefile
deleted file mode 100644
index 5a72813..0000000
--- a/test/cpp/Makefile
+++ /dev/null
@@ -1,77 +0,0 @@
-.PHONY: all check test_target_hit test_alngraph test_alignment test_simple_aligner
-# project source code
-SRCDIR := ../../src/cpp
-
-include $(SRCDIR)/pbi.mk
-include $(SRCDIR)/boost.mk
-include gtest.mk
-
-GTEST_CPPFLAGS += -isystem $(GTEST_DIR)/include
-GTEST_CXXFLAGS += -g -Wall -Wextra -pthread
-
-INCDIRS := -I$(PBDATA) -I$(BLASR) -I$(BOOST_HEADERS) -I$(GTEST_DIR)/include -I$(GTEST_DIR) $(EXTRA_INCDIRS)
-LDFLAGS := -L$(PBDATA) -L$(BLASR) $(EXTRA_LDFLAGS)
-LDLIBS := -lpbdata -lblasr -lpthread ${EXTRA_LDLIBS}
-
-CXXFLAGS := -O3 -std=c++11 $(INCDIRS) -I$(SRCDIR)
-
-GTEST_OBJECTS := gtest-all.o gtest_main.o
-DAZCON_OBJECTS := $(SRCDIR)/DB.o $(SRCDIR)/align.o $(SRCDIR)/Alignment.o \
- $(SRCDIR)/DazAlnProvider.o
-
-PBDAGCON_OBJECTS := $(SRCDIR)/AlnGraphBoost.o $(SRCDIR)/Alignment.o \
- $(SRCDIR)/SimpleAligner.o
-vpath %.cc $(gtest_version)/src
-
-BUILDMSG = "=== Building $@ ==="
-
-all: check
-
-check: test_target_hit test_alngraph test_alignment test_simple_aligner
-
-# XXX: need to mock out db_ in order to run test_target
-
-test_target_hit: $(GTEST_OBJECTS) $(DAZCON_OBJECTS) TargetHitTest.o
- @echo $(BUILDMSG)
- $(CXX) $^ -o $@ -lpthread
- ./$@
-
-test_target: $(GTEST_OBJECTS) $(DAZCON_OBJECTS) TargetTest.o
- @echo $(BUILDMSG)
- $(CXX) $^ -o $@ -lpthread
- ./$@
-
-test_alngraph: $(GTEST_OBJECTS) $(PBDAGCON_OBJECTS) AlnGraphBoostTest.o
- @echo $(BUILDMSG)
- $(CXX) $^ -static -o $@ $(LDFLAGS) $(LDLIBS)
- ./$@
-
-test_alignment: $(GTEST_OBJECTS) $(PBDAGCON_OBJECTS) AlignmentTest.o
- @echo $(BUILDMSG)
- $(CXX) $^ -static -o $@ $(LDFLAGS) $(LDLIBS)
- ./$@
-
-test_simple_aligner: $(GTEST_OBJECTS) $(PBDAGCON_OBJECTS) SimpleAlignerTest.o
- @echo $(BUILDMSG)
- $(CXX) $^ -static -o $@ $(LDFLAGS) $(LDLIBS)
- ./$@
-
-$(SRCDIR)/AlnGraphBoost.o: $(BOOST_HEADERS)
-
-$(GTEST_OBJECTS): $(GTEST_DIR)
- $(CXX) $(GTEST_CPPFLAGS) -I$(GTEST_DIR) $(GTEST_CXXFLAGS) -c $</src/$(@:.o=.cc)
-
-$(BOOST_HEADERS):
- @echo Fetching boost headers from $(URI)
- cd $(SRCDIR)/third-party && $(GET_BOOST)
-
-$(GTEST_DIR):
- @echo Fetching gtest from $(gtest_uri)
- $(get_gtest)
- unzip -q $(gtest_version).zip
- $(RM) $(gtest_version).zip
-
-clean:
- $(RM) $(SRCDIR)/*.o
- $(RM) *.o
- $(RM) test_*
diff --git a/test/cpp/TargetHitTest.cpp b/test/cpp/TargetHitTest.cpp
index 758251a..5b477aa 100644
--- a/test/cpp/TargetHitTest.cpp
+++ b/test/cpp/TargetHitTest.cpp
@@ -12,13 +12,13 @@ TEST(TargetHitTest, single_overlap_perfect) {
path.bbpos = 1231;
path.bepos = 8217;
path.diffs = 0;
-
+
rec.ovl.path = path;
TargetHit th;
th.add(rec);
th.computeOvlScore();
- EXPECT_FLOAT_EQ(6986, th.ovlScore);
+ EXPECT_FLOAT_EQ(6986, th.ovlScore);
}
TEST(TargetHitTest, single_overlap_inaccurate) {
@@ -32,13 +32,13 @@ TEST(TargetHitTest, single_overlap_inaccurate) {
path.bbpos = 2000;
path.bepos = 6000;
path.diffs = 230;
-
+
rec.ovl.path = path;
TargetHit th;
th.add(rec);
th.computeOvlScore();
- EXPECT_FLOAT_EQ(3770, th.ovlScore);
+ EXPECT_FLOAT_EQ(3770, th.ovlScore);
}
TEST(TargetHitTest, multi_overlap_inaccurate) {
@@ -65,13 +65,13 @@ TEST(TargetHitTest, multi_overlap_inaccurate) {
p2.bepos = 7995;
p2.diffs = 53;
r2.ovl.path = p2;
-
+
TargetHit th;
th.add(r1);
th.computeOvlScore();
- EXPECT_FLOAT_EQ(3770, th.ovlScore);
+ EXPECT_FLOAT_EQ(3770, th.ovlScore);
th.add(r2);
th.computeOvlScore();
- EXPECT_FLOAT_EQ(4721, th.ovlScore);
+ EXPECT_FLOAT_EQ(4721, th.ovlScore);
}
diff --git a/test/cpp/TargetTest.cpp b/test/cpp/TargetTest.cpp
index f06f526..5c7c347 100644
--- a/test/cpp/TargetTest.cpp
+++ b/test/cpp/TargetTest.cpp
@@ -8,11 +8,11 @@ TEST(TargetTest, add_record) {
o1.aread = 1; o1.bread = 3; o1.flags = 0;
o2.aread = 1; o2.bread = 3; o2.flags = 0;
o3.aread = 2; o3.bread = 3; o3.flags = 1;
-
+
r1.ovl = o1;
r2.ovl = o2;
r3.ovl = o3;
-
+
Target t;
t.addRecord(r1);
t.addRecord(r2);
diff --git a/test/cpp/gtest.mk b/test/cpp/gtest.mk
deleted file mode 100644
index c5b2fee..0000000
--- a/test/cpp/gtest.mk
+++ /dev/null
@@ -1,10 +0,0 @@
-# user-specified location of gtest
-ifdef gtest
-GTEST_DIR := $(gtest)
-else
-# download/unpack a version from the inter-web
-gtest_version := gtest-1.7.0
-gtest_uri := https://googletest.googlecode.com/files/$(gtest_version).zip
-get_gtest := curl -O $(gtest_uri)
-GTEST_DIR := $(gtest_version)
-endif
diff --git a/test/cpp/makefile b/test/cpp/makefile
new file mode 100644
index 0000000..e17cb46
--- /dev/null
+++ b/test/cpp/makefile
@@ -0,0 +1,117 @@
+# Goals that are commands rather than files. The per-executable "%-gtestrun"
+# goals are deliberately not listed: GNU make skips implicit-rule search for
+# phony targets, which would bypass the pattern rule that runs them.
+.PHONY: all check gtest-run gtest-build clean
+all:
+
+THISDIR:=$(dir $(lastword ${MAKEFILE_LIST}))
+-include ${CURDIR}/../../defines.mk
+SRCDIR := ${THISDIR}
+
+# Header search directories; each entry becomes a -I flag in CPPFLAGS below.
+# The ${..._INCLUDE} values are not assigned in this file.
+# NOTE(review): presumably they come from the optionally included defines.mk -- confirm.
+INCDIRS := . \
+ ${SRCDIR} \
+ ${SRCDIR}/../../src/cpp \
+ ${DAZZ_DB_INCLUDE} \
+ ${DALIGNER_INCLUDE} \
+ ${LIBBLASR_INCLUDE} \
+ ${LIBPBDATA_INCLUDE} \
+ ${LIBPBIHDF_INCLUDE} \
+ ${PBBAM_INCLUDE} \
+ ${HDF5_INCLUDE} \
+ ${HTSLIB_INCLUDE} \
+ ${BOOST_INCLUDE} \
+ ${GTEST_INCLUDE} \
+ third-party
+
+# Library search directories; each entry becomes a "-L <dir>" pair in LDFLAGS
+# below and one component of the run-time library path (ldp).
+LIBDIRS := \
+ ${LIBBLASR_LIB} \
+ ${LIBPBDATA_LIB} \
+ ${LIBPBIHDF_LIB} \
+ ${PBBAM_LIB} \
+ ${HDF5_LIB} \
+ ${HTSLIB_LIB} \
+ ${GCC_LIB} \
+ ${ZLIB_LIB}
+
+# Libraries handed to the linker; -lpthread is always part of the list.
+PTHREAD_LIBFLAGS:=-lpthread
+LDLIBS+= \
+ ${LIBBLASR_LIBFLAGS} \
+ ${LIBPBDATA_LIBFLAGS} \
+ ${LIBPBIHDF_LIBFLAGS} \
+ ${PBBAM_LIBFLAGS} \
+ ${HDF5_LIBFLAGS} \
+ ${HTSLIB_LIBFLAGS} \
+ ${ZLIB_LIBFLAGS} \
+ ${PTHREAD_LIBFLAGS} \
+ ${DL_LIBFLAGS}
+# Turn the directory lists above into preprocessor and linker flags.
+CPPFLAGS+=$(patsubst %,-I%,${INCDIRS})
+LDFLAGS+=$(patsubst %,-L %,${LIBDIRS})
+
+# For fused-src, gtest-all.cc includes gtest/*.
+# For non-fused, gtest-all.cc includes src/gtest-*.
+# So we add -I ${GTEST_SRC}/.. for the latter case.
+GTEST_CPPFLAGS += -isystem $(GTEST_INCLUDE) -isystem ${GTEST_SRC}/..
+GTEST_CXXFLAGS += -g -Wall -Wextra -pthread
+
+CXXFLAGS := -O3 -std=c++11
+
+GTEST_OBJECTS := gtest-all.o gtest_main.o
+DAZCON_OBJECTS := DB.o QV.o align.o Alignment.o \
+ DazAlnProvider.o
+
+PBDAGCON_OBJECTS := AlnGraphBoost.o Alignment.o \
+ SimpleAligner.o
+pbdagcon_testexes := test-alngraph \
+ test-alignment \
+ test-simple_aligner \
+ ${null}
+dazcon_testexes := test-target_hit \
+ test-target \
+ ${null}
+# Remove the test from the broken_testexes variable, once fixed.
+broken_testexes := test-target \
+ ${null}
+testexes := ${pbdagcon_testexes} ${dazcon_testexes}
+# Remove broken tests exes from the testexes list
+testexes := $(filter-out ${broken_testexes},${testexes})
+
+# Build a colon-separated library path: subst replaces every space between
+# the LIBDIRS entries with ':' and the caller's LD_LIBRARY_PATH is appended.
+# "space" holds a single blank, written as two empty expansions around it.
+# The %-gtestrun recipe passes ${ldp} to each test as LD_LIBRARY_PATH.
+empty:=
+space:=${empty} ${empty}
+ldp+=$(subst ${space},:,${LIBDIRS}):${LD_LIBRARY_PATH}
+#export LD_LIBRARY_PATH
+
+# Source search paths for prerequisites that are not in the current directory:
+# project .c/.cpp files are looked up in src/cpp, gtest's .cc files in
+# GTEST_SRC, align.c in DALIGNER_SRC, and DB.c/QV.c in DAZZ_DB_SRC.
+vpath %.c ${THISDIR}/../../src/cpp
+vpath %.cpp ${THISDIR}/../../src/cpp
+vpath %.cc ${GTEST_SRC}
+vpath align.c ${DALIGNER_SRC}
+vpath DB.c ${DAZZ_DB_SRC}
+vpath QV.c ${DAZZ_DB_SRC}
+
+#VALGRIND?=/mnt/software/v/valgrind/3.10.1/bin/valgrind --leak-check=full
+BUILDMSG = "=== Building $@ ==="
+
+${GTEST_OBJECTS}: CXXFLAGS+=${GTEST_CXXFLAGS}
+${GTEST_OBJECTS}: CPPFLAGS+=${GTEST_CPPFLAGS}
+
+all: check
+check: gtest-run
+gtest-run: $(testexes:%=%-gtestrun)
+gtest-build: ${testexes}
+# Directory that receives the gtest XML reports.
+xml:
+ mkdir xml
+
+# Run one test executable: "<exe>-gtestrun" depends on <exe>, with xml as an
+# order-only prerequisite so the report directory exists before the run.
+# The test is started with LD_LIBRARY_PATH set to ${ldp} and
+# PBDAGCON_TEST_DATA_DIR set to this makefile's directory, optionally under
+# ${VALGRIND}, and writes its report to xml/<exe>.xml.
+%-gtestrun: % | xml
+ LD_LIBRARY_PATH=${ldp} PBDAGCON_TEST_DATA_DIR=${THISDIR} ${VALGRIND} ./$< --gtest_output=xml:xml/${<F}.xml
+# Link a test executable from all of its prerequisites; the prerequisite
+# lists come from the recipe-less test-* rules further down.
+test-%:
+ ${CXX} -o $@ $^ ${LDFLAGS} ${LDLIBS}
+
+# dazcon test executables:
+test-target_hit: ${GTEST_OBJECTS} ${DAZCON_OBJECTS} TargetHitTest.o
+test-target: ${GTEST_OBJECTS} ${DAZCON_OBJECTS} TargetTest.o
+
+# pbdagcon test executables:
+test-alngraph: ${GTEST_OBJECTS} ${PBDAGCON_OBJECTS} AlnGraphBoostTest.o
+test-alignment: ${GTEST_OBJECTS} ${PBDAGCON_OBJECTS} AlignmentTest.o
+test-simple_aligner: ${GTEST_OBJECTS} ${PBDAGCON_OBJECTS} SimpleAlignerTest.o
+
+# Remove everything this makefile generates: object files, the test-*
+# executables and the gtest reports under xml/. test_* is still removed as
+# well, to sweep up binaries left behind by the previous Makefile.
+clean:
+	${RM} ${SRCDIR}/*.o
+	${RM} *.o
+	${RM} test-* test_*
+	${RM} -r xml
diff --git a/travis.sh b/travis.sh
new file mode 100755
index 0000000..5762d0a
--- /dev/null
+++ b/travis.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# CI entry point: configure, build and test in one go.
+# -e aborts on the first failing command, -x echoes each command as it runs.
+set -ex
+
+# NOTE(review): the flags presumably enable the boost, gtest and submodule
+# handling in configure.py -- confirm against that script.
+./configure.py --boost --gtest --sub
+# -j without a number places no limit on the number of parallel jobs.
+make -j init-submodule
+# --debug=b prints basic debugging output, --debug=v the verbose level.
+make --debug=b -j
+make --debug=v -j check
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbdagcon.git
More information about the debian-med-commit
mailing list