[med-svn] [stacks] 01/04: New upstream version 1.44
Andreas Tille
tille at debian.org
Sat Oct 22 19:21:13 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository stacks.
commit 1d2ea27394ba2790e59f527d8c088b96437582b2
Author: Andreas Tille <tille at debian.org>
Date: Sat Oct 22 21:03:22 2016 +0200
New upstream version 1.44
---
ChangeLog | 20 +
acinclude.m4 | 583 +++++++++--
configure | 914 ++++++++++++++++-
configure.ac | 4 +-
scripts/load_sequences.pl | 2 +-
scripts/ref_map.pl | 5 +-
src/BamI.h | 449 +++++----
src/BamUnalignedI.h | 89 +-
src/BowtieI.h | 19 +-
src/BustardI.h | 38 +-
src/DNANSeq.cc | 131 +--
src/DNANSeq.h | 88 +-
src/DNASeq.cc | 328 +++---
src/DNASeq.h | 24 +-
src/FastaI.h | 40 +-
src/FastqI.h | 24 +-
src/GappedAln.h | 80 +-
src/PopMap.h | 8 +-
src/PopSum.h | 122 +--
src/SamI.h | 340 ++++---
src/Tsv.h | 20 +-
src/Vcf.cc | 2 +-
src/aln_utils.cc | 34 +-
src/bootstrap.h | 446 ++++-----
src/catalog_utils.cc | 40 +-
src/clean.cc | 712 ++++++-------
src/clean.h | 488 ++++-----
src/clone_filter.cc | 96 +-
src/clone_filter.h | 16 +-
src/cmb.cc | 20 +-
src/cmb.h | 4 +-
src/constants.h | 12 +-
src/cstacks.cc | 1661 +++++++++++++++---------------
src/estacks.cc | 576 +++++------
src/estacks.h | 2 +-
src/export_formats.cc | 252 ++---
src/file_io.cc | 1493 +++++++++++++--------------
src/file_io.h | 16 +-
src/genotypes.cc | 2448 ++++++++++++++++++++++-----------------------
src/genotypes.h | 2 +-
src/gzFasta.h | 86 +-
src/gzFastq.h | 62 +-
src/hstacks.cc | 372 +++----
src/input.cc | 215 ++--
src/input.h | 31 +-
src/kmer_filter.cc | 2134 +++++++++++++++++++--------------------
src/kmer_filter.h | 2 +-
src/kmers.cc | 70 +-
src/locus.cc | 47 +-
src/locus.h | 38 +-
src/log_utils.cc | 4 +-
src/models.cc | 558 +++++------
src/mst.cc | 26 +-
src/mst.h | 4 +-
src/mstack.cc | 71 +-
src/ordered.h | 330 +++---
src/phasedstacks.cc | 2010 ++++++++++++++++++-------------------
src/phasedstacks.h | 124 +--
src/populations.cc | 356 ++++---
src/process_radtags.cc | 1509 ++++++++++++++--------------
src/process_radtags.h | 30 +-
src/process_shortreads.cc | 1406 +++++++++++++-------------
src/process_shortreads.h | 30 +-
src/pstacks.cc | 188 ++--
src/pstacks.h | 7 +-
src/rxstacks.cc | 2001 ++++++++++++++++++------------------
src/rxstacks.h | 10 +-
src/smoothing.h | 168 ++--
src/smoothing_utils.h | 38 +-
src/sql_utilities.h | 22 +-
src/sstacks.cc | 1145 ++++++++++-----------
src/stacks.cc | 8 +-
src/stacks.h | 105 +-
src/ustacks.cc | 321 +++---
src/ustacks.h | 9 +-
src/utils.cc | 96 +-
src/utils.h | 4 +-
src/write.cc | 268 ++---
src/write.h | 2 +-
79 files changed, 13662 insertions(+), 11893 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 0ae3d94..8b22833 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,25 @@
+Stacks 1.44 - Oct 11, 2016
+--------------------------
+ Bugfix: corrected an error in pstacks where '=' and 'X' symbols were not recognized properly in SAM/BAM
+ CIGAR strings.
+ Bugfix: corrected some typos in pstacks/populations help output.
+
+Stacks 1.43 - Oct 05, 2016
+--------------------------
+ Feature: added alignment controls to pstacks, allowing the program to discard secondary alignments
+ and to discard alignments where a significant portion of the read was not aligned (soft-masked).
+ Bugfix: corrected a very small memory leak in the gapped alignment code, found by Valgrind.
+ Feature: updated configure test to check if compiler can handle c++11 standard.
+ Bugfix: rxstacks was not generating model files.
+ Bugfix: corrected an uncaught exception in cstacks when processing gapped alignments. In some cases when a
+ multiple alignment had to be recomputed the initial CIGAR string was not parsed properly leading to the
+ catalog and query sequences coming out of sync in their length (which could throw the exception).
+ Feature: reduced memory usage in ustacks and pstacks by not retaining all reads from a collapsed locus.
+ Bugfix: corrected -V option for populations, which was causing a crash (although --in_vcf worked).
+
Stacks 1.42 - Aug 05, 2016
--------------------------
+ Feature: Added Csp6I restriction enzyme.
Feature: populations program is now able to calculate populations statistics using arbitrary VCF files
as input.
Feature: Upgraded to the latest release of HTSLib (1.3.1) for reading BAM files. Embedded the library
diff --git a/acinclude.m4 b/acinclude.m4
index af37acd..2c18e49 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -1,133 +1,562 @@
-# ============================================================================
-# http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
-# ============================================================================
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html
+# ===========================================================================
#
# SYNOPSIS
#
-# AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional])
+# AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional])
#
# DESCRIPTION
#
-# Check for baseline language coverage in the compiler for the C++11
-# standard; if necessary, add switches to CXXFLAGS to enable support.
+# Check for baseline language coverage in the compiler for the specified
+# version of the C++ standard. If necessary, add switches to CXX and
+# CXXCPP to enable support. VERSION may be '11' (for the C++11 standard)
+# or '14' (for the C++14 standard).
#
-# The first argument, if specified, indicates whether you insist on an
+# The second argument, if specified, indicates whether you insist on an
# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
# -std=c++11). If neither is specified, you get whatever works, with
# preference for an extended mode.
#
-# The second argument, if specified 'mandatory' or if left unspecified,
-# indicates that baseline C++11 support is required and that the macro
-# should error out if no mode with that support is found. If specified
-# 'optional', then configuration proceeds regardless, after defining
-# HAVE_CXX11 if and only if a supporting mode is found.
+# The third argument, if specified 'mandatory' or if left unspecified,
+# indicates that baseline support for the specified C++ standard is
+# required and that the macro should error out if no mode with that
+# support is found. If specified 'optional', then configuration proceeds
+# regardless, after defining HAVE_CXX${VERSION} if and only if a
+# supporting mode is found.
#
# LICENSE
#
# Copyright (c) 2008 Benjamin Kosnik <bkoz at redhat.com>
# Copyright (c) 2012 Zack Weinberg <zackw at panix.com>
# Copyright (c) 2013 Roy Stogner <roystgnr at ices.utexas.edu>
+# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov at google.com>
+# Copyright (c) 2015 Paul Norman <penorman at mac.com>
+# Copyright (c) 2015 Moritz Klammler <moritz at klammler.eu>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
-# and this notice are preserved. This file is offered as-is, without any
+# and this notice are preserved. This file is offered as-is, without any
# warranty.
-#serial 3
+#serial 4
-m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [
- template <typename T>
- struct check
- {
- static_assert(sizeof(int) <= sizeof(T), "not big enough");
- };
-
- typedef check<check<bool>> right_angle_brackets;
-
- int a;
- decltype(a) b;
+dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
+dnl (serial version number 13).
- typedef check<int> check_type;
- check_type c;
- check_type&& cr = static_cast<check_type&&>(c);
-
- auto d = a;
-])
-
-AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
- m4_if([$1], [], [],
- [$1], [ext], [],
- [$1], [noext], [],
- [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
- m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
- [$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
- [$2], [optional], [ax_cxx_compile_cxx11_required=false],
- [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])dnl
+AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
+ m4_if([$1], [11], [],
+ [$1], [14], [],
+ [$1], [17], [m4_fatal([support for C++17 not yet implemented in AX_CXX_COMPILE_STDCXX])],
+ [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl
+ m4_if([$2], [], [],
+ [$2], [ext], [],
+ [$2], [noext], [],
+ [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl
+ m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true],
+ [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true],
+ [$3], [optional], [ax_cxx_compile_cxx$1_required=false],
+ [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])])
AC_LANG_PUSH([C++])dnl
ac_success=no
- AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
- ax_cv_cxx_compile_cxx11,
- [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
- [ax_cv_cxx_compile_cxx11=yes],
- [ax_cv_cxx_compile_cxx11=no])])
- if test x$ax_cv_cxx_compile_cxx11 = xyes; then
+ AC_CACHE_CHECK(whether $CXX supports C++$1 features by default,
+ ax_cv_cxx_compile_cxx$1,
+ [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+ [ax_cv_cxx_compile_cxx$1=yes],
+ [ax_cv_cxx_compile_cxx$1=no])])
+ if test x$ax_cv_cxx_compile_cxx$1 = xyes; then
ac_success=yes
fi
- m4_if([$1], [noext], [], [dnl
+ m4_if([$2], [noext], [], [dnl
if test x$ac_success = xno; then
- for switch in -std=gnu++11 -std=gnu++0x; do
- cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
- AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
+ for switch in -std=gnu++$1 -std=gnu++0x; do
+ cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+ AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
$cachevar,
- [ac_save_CXXFLAGS="$CXXFLAGS"
- CXXFLAGS="$CXXFLAGS $switch"
- AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
+ [ac_save_CXX="$CXX"
+ CXX="$CXX $switch"
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
[eval $cachevar=yes],
[eval $cachevar=no])
- CXXFLAGS="$ac_save_CXXFLAGS"])
+ CXX="$ac_save_CXX"])
if eval test x\$$cachevar = xyes; then
- CXXFLAGS="$CXXFLAGS $switch"
+ CXX="$CXX $switch"
+ if test -n "$CXXCPP" ; then
+ CXXCPP="$CXXCPP $switch"
+ fi
ac_success=yes
break
fi
done
fi])
- m4_if([$1], [ext], [], [dnl
+ m4_if([$2], [ext], [], [dnl
if test x$ac_success = xno; then
- for switch in -std=c++11 -std=c++0x; do
- cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
- AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
+ dnl HP's aCC needs +std=c++11 according to:
+ dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
+ dnl Cray's crayCC needs "-h std=c++11"
+ for switch in -std=c++$1 -std=c++0x +std=c++$1 "-h std=c++$1"; do
+ cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+ AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
$cachevar,
- [ac_save_CXXFLAGS="$CXXFLAGS"
- CXXFLAGS="$CXXFLAGS $switch"
- AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
+ [ac_save_CXX="$CXX"
+ CXX="$CXX $switch"
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
[eval $cachevar=yes],
[eval $cachevar=no])
- CXXFLAGS="$ac_save_CXXFLAGS"])
+ CXX="$ac_save_CXX"])
if eval test x\$$cachevar = xyes; then
- CXXFLAGS="$CXXFLAGS $switch"
+ CXX="$CXX $switch"
+ if test -n "$CXXCPP" ; then
+ CXXCPP="$CXXCPP $switch"
+ fi
ac_success=yes
break
fi
done
fi])
AC_LANG_POP([C++])
- if test x$ax_cxx_compile_cxx11_required = xtrue; then
+ if test x$ax_cxx_compile_cxx$1_required = xtrue; then
if test x$ac_success = xno; then
- AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.])
+ AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.])
fi
+ fi
+ if test x$ac_success = xno; then
+ HAVE_CXX$1=0
+ AC_MSG_NOTICE([No compiler with C++$1 support was found])
else
- if test x$ac_success = xno; then
- HAVE_CXX11=0
- AC_MSG_NOTICE([No compiler with C++11 support was found])
- else
- HAVE_CXX11=1
- AC_DEFINE(HAVE_CXX11,1,
- [define if the compiler supports basic C++11 syntax])
- fi
-
- AC_SUBST(HAVE_CXX11)
+ HAVE_CXX$1=1
+ AC_DEFINE(HAVE_CXX$1,1,
+ [define if the compiler supports basic C++$1 syntax])
fi
+ AC_SUBST(HAVE_CXX$1)
])
+
+
+dnl Test body for checking C++11 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11],
+ _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+)
+
+
+dnl Test body for checking C++14 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
+ _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+ _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
+)
+
+
+dnl Tests for new features in C++11
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
+
+// If the compiler admits that it is not ready for C++11, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201103L
+
+#error "This is not a C++11 compiler"
+
+#else
+
+namespace cxx11
+{
+
+ namespace test_static_assert
+ {
+
+ template <typename T>
+ struct check
+ {
+ static_assert(sizeof(int) <= sizeof(T), "not big enough");
+ };
+
+ }
+
+ namespace test_final_override
+ {
+
+ struct Base
+ {
+ virtual void f() {}
+ };
+
+ struct Derived : public Base
+ {
+ virtual void f() override {}
+ };
+
+ }
+
+ namespace test_double_right_angle_brackets
+ {
+
+ template < typename T >
+ struct check {};
+
+ typedef check<void> single_type;
+ typedef check<check<void>> double_type;
+ typedef check<check<check<void>>> triple_type;
+ typedef check<check<check<check<void>>>> quadruple_type;
+
+ }
+
+ namespace test_decltype
+ {
+
+ int
+ f()
+ {
+ int a = 1;
+ decltype(a) b = 2;
+ return a + b;
+ }
+
+ }
+
+ namespace test_type_deduction
+ {
+
+ template < typename T1, typename T2 >
+ struct is_same
+ {
+ static const bool value = false;
+ };
+
+ template < typename T >
+ struct is_same<T, T>
+ {
+ static const bool value = true;
+ };
+
+ template < typename T1, typename T2 >
+ auto
+ add(T1 a1, T2 a2) -> decltype(a1 + a2)
+ {
+ return a1 + a2;
+ }
+
+ int
+ test(const int c, volatile int v)
+ {
+ static_assert(is_same<int, decltype(0)>::value == true, "");
+ static_assert(is_same<int, decltype(c)>::value == false, "");
+ static_assert(is_same<int, decltype(v)>::value == false, "");
+ auto ac = c;
+ auto av = v;
+ auto sumi = ac + av + 'x';
+ auto sumf = ac + av + 1.0;
+ static_assert(is_same<int, decltype(ac)>::value == true, "");
+ static_assert(is_same<int, decltype(av)>::value == true, "");
+ static_assert(is_same<int, decltype(sumi)>::value == true, "");
+ static_assert(is_same<int, decltype(sumf)>::value == false, "");
+ static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
+ return (sumf > 0.0) ? sumi : add(c, v);
+ }
+
+ }
+
+ namespace test_noexcept
+ {
+
+ int f() { return 0; }
+ int g() noexcept { return 0; }
+
+ static_assert(noexcept(f()) == false, "");
+ static_assert(noexcept(g()) == true, "");
+
+ }
+
+ namespace test_constexpr
+ {
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
+ {
+ return *s ? strlen_c_r(s + 1, acc + 1) : acc;
+ }
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c(const CharT *const s) noexcept
+ {
+ return strlen_c_r(s, 0UL);
+ }
+
+ static_assert(strlen_c("") == 0UL, "");
+ static_assert(strlen_c("1") == 1UL, "");
+ static_assert(strlen_c("example") == 7UL, "");
+ static_assert(strlen_c("another\0example") == 7UL, "");
+
+ }
+
+ namespace test_rvalue_references
+ {
+
+ template < int N >
+ struct answer
+ {
+ static constexpr int value = N;
+ };
+
+ answer<1> f(int&) { return answer<1>(); }
+ answer<2> f(const int&) { return answer<2>(); }
+ answer<3> f(int&&) { return answer<3>(); }
+
+ void
+ test()
+ {
+ int i = 0;
+ const int c = 0;
+ static_assert(decltype(f(i))::value == 1, "");
+ static_assert(decltype(f(c))::value == 2, "");
+ static_assert(decltype(f(0))::value == 3, "");
+ }
+
+ }
+
+ namespace test_uniform_initialization
+ {
+
+ struct test
+ {
+ static const int zero {};
+ static const int one {1};
+ };
+
+ static_assert(test::zero == 0, "");
+ static_assert(test::one == 1, "");
+
+ }
+
+ namespace test_lambdas
+ {
+
+ void
+ test1()
+ {
+ auto lambda1 = [](){};
+ auto lambda2 = lambda1;
+ lambda1();
+ lambda2();
+ }
+
+ int
+ test2()
+ {
+ auto a = [](int i, int j){ return i + j; }(1, 2);
+ auto b = []() -> int { return '0'; }();
+ auto c = [=](){ return a + b; }();
+ auto d = [&](){ return c; }();
+ auto e = [a, &b](int x) mutable {
+ const auto identity = [](int y){ return y; };
+ for (auto i = 0; i < a; ++i)
+ a += b--;
+ return x + identity(a + b);
+ }(0);
+ return a + b + c + d + e;
+ }
+
+ int
+ test3()
+ {
+ const auto nullary = [](){ return 0; };
+ const auto unary = [](int x){ return x; };
+ using nullary_t = decltype(nullary);
+ using unary_t = decltype(unary);
+ const auto higher1st = [](nullary_t f){ return f(); };
+ const auto higher2nd = [unary](nullary_t f1){
+ return [unary, f1](unary_t f2){ return f2(unary(f1())); };
+ };
+ return higher1st(nullary) + higher2nd(nullary)(unary);
+ }
+
+ }
+
+ namespace test_variadic_templates
+ {
+
+ template <int...>
+ struct sum;
+
+ template <int N0, int... N1toN>
+ struct sum<N0, N1toN...>
+ {
+ static constexpr auto value = N0 + sum<N1toN...>::value;
+ };
+
+ template <>
+ struct sum<>
+ {
+ static constexpr auto value = 0;
+ };
+
+ static_assert(sum<>::value == 0, "");
+ static_assert(sum<1>::value == 1, "");
+ static_assert(sum<23>::value == 23, "");
+ static_assert(sum<1, 2>::value == 3, "");
+ static_assert(sum<5, 5, 11>::value == 21, "");
+ static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
+
+ }
+
+ // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
+ // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
+ // because of this.
+ namespace test_template_alias_sfinae
+ {
+
+ struct foo {};
+
+ template<typename T>
+ using member = typename T::member_type;
+
+ template<typename T>
+ void func(...) {}
+
+ template<typename T>
+ void func(member<T>*) {}
+
+ void test();
+
+ void test() { func<foo>(0); }
+
+ }
+
+} // namespace cxx11
+
+#endif // __cplusplus >= 201103L
+
+]])
+
+
+dnl Tests for new features in C++14
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
+
+// If the compiler admits that it is not ready for C++14, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201402L
+
+#error "This is not a C++14 compiler"
+
+#else
+
+namespace cxx14
+{
+
+ namespace test_polymorphic_lambdas
+ {
+
+ int
+ test()
+ {
+ const auto lambda = [](auto&&... args){
+ const auto istiny = [](auto x){
+ return (sizeof(x) == 1UL) ? 1 : 0;
+ };
+ const int aretiny[] = { istiny(args)... };
+ return aretiny[0];
+ };
+ return lambda(1, 1L, 1.0f, '1');
+ }
+
+ }
+
+ namespace test_binary_literals
+ {
+
+ constexpr auto ivii = 0b0000000000101010;
+ static_assert(ivii == 42, "wrong value");
+
+ }
+
+ namespace test_generalized_constexpr
+ {
+
+ template < typename CharT >
+ constexpr unsigned long
+ strlen_c(const CharT *const s) noexcept
+ {
+ auto length = 0UL;
+ for (auto p = s; *p; ++p)
+ ++length;
+ return length;
+ }
+
+ static_assert(strlen_c("") == 0UL, "");
+ static_assert(strlen_c("x") == 1UL, "");
+ static_assert(strlen_c("test") == 4UL, "");
+ static_assert(strlen_c("another\0test") == 7UL, "");
+
+ }
+
+ namespace test_lambda_init_capture
+ {
+
+ int
+ test()
+ {
+ auto x = 0;
+ const auto lambda1 = [a = x](int b){ return a + b; };
+ const auto lambda2 = [a = lambda1(x)](){ return a; };
+ return lambda2();
+ }
+
+ }
+
+ namespace test_digit_seperators
+ {
+
+ constexpr auto ten_million = 100'000'000;
+ static_assert(ten_million == 100000000, "");
+
+ }
+
+ namespace test_return_type_deduction
+ {
+
+ auto f(int& x) { return x; }
+ decltype(auto) g(int& x) { return x; }
+
+ template < typename T1, typename T2 >
+ struct is_same
+ {
+ static constexpr auto value = false;
+ };
+
+ template < typename T >
+ struct is_same<T, T>
+ {
+ static constexpr auto value = true;
+ };
+
+ int
+ test()
+ {
+ auto x = 0;
+ static_assert(is_same<int, decltype(f(x))>::value, "");
+ static_assert(is_same<int&, decltype(g(x))>::value, "");
+ return x;
+ }
+
+ }
+
+} // namespace cxx14
+
+#endif // __cplusplus >= 201402L
+
+]])
diff --git a/configure b/configure
index 25d6f19..920893f 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Stacks 1.42.
+# Generated by GNU Autoconf 2.69 for Stacks 1.44.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Stacks'
PACKAGE_TARNAME='stacks'
-PACKAGE_VERSION='1.42'
-PACKAGE_STRING='Stacks 1.42'
+PACKAGE_VERSION='1.44'
+PACKAGE_STRING='Stacks 1.44'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1281,7 +1281,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Stacks 1.42 to adapt to many kinds of systems.
+\`configure' configures Stacks 1.44 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1347,7 +1347,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Stacks 1.42:";;
+ short | recursive ) echo "Configuration of Stacks 1.44:";;
esac
cat <<\_ACEOF
@@ -1450,7 +1450,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Stacks configure 1.42
+Stacks configure 1.44
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1907,7 +1907,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Stacks $as_me 1.42, which was
+It was created by Stacks $as_me 1.44, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2770,7 +2770,7 @@ fi
# Define the identity of the package.
PACKAGE='stacks'
- VERSION='1.42'
+ VERSION='1.44'
cat >>confdefs.h <<_ACEOF
@@ -4345,7 +4345,7 @@ fi
- ax_cxx_compile_cxx11_required=truednl
+ ax_cxx_compile_cxx11_required=true
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -4360,22 +4360,290 @@ else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
- template <typename T>
+
+// If the compiler admits that it is not ready for C++11, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201103L
+
+#error "This is not a C++11 compiler"
+
+#else
+
+namespace cxx11
+{
+
+ namespace test_static_assert
+ {
+
+ template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
- typedef check<check<bool>> right_angle_brackets;
+ }
+
+ namespace test_final_override
+ {
+
+ struct Base
+ {
+ virtual void f() {}
+ };
+
+ struct Derived : public Base
+ {
+ virtual void f() override {}
+ };
+
+ }
+
+ namespace test_double_right_angle_brackets
+ {
+
+ template < typename T >
+ struct check {};
+
+ typedef check<void> single_type;
+ typedef check<check<void>> double_type;
+ typedef check<check<check<void>>> triple_type;
+ typedef check<check<check<check<void>>>> quadruple_type;
+
+ }
+
+ namespace test_decltype
+ {
+
+ int
+ f()
+ {
+ int a = 1;
+ decltype(a) b = 2;
+ return a + b;
+ }
+
+ }
+
+ namespace test_type_deduction
+ {
+
+ template < typename T1, typename T2 >
+ struct is_same
+ {
+ static const bool value = false;
+ };
+
+ template < typename T >
+ struct is_same<T, T>
+ {
+ static const bool value = true;
+ };
+
+ template < typename T1, typename T2 >
+ auto
+ add(T1 a1, T2 a2) -> decltype(a1 + a2)
+ {
+ return a1 + a2;
+ }
+
+ int
+ test(const int c, volatile int v)
+ {
+ static_assert(is_same<int, decltype(0)>::value == true, "");
+ static_assert(is_same<int, decltype(c)>::value == false, "");
+ static_assert(is_same<int, decltype(v)>::value == false, "");
+ auto ac = c;
+ auto av = v;
+ auto sumi = ac + av + 'x';
+ auto sumf = ac + av + 1.0;
+ static_assert(is_same<int, decltype(ac)>::value == true, "");
+ static_assert(is_same<int, decltype(av)>::value == true, "");
+ static_assert(is_same<int, decltype(sumi)>::value == true, "");
+ static_assert(is_same<int, decltype(sumf)>::value == false, "");
+ static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
+ return (sumf > 0.0) ? sumi : add(c, v);
+ }
+
+ }
+
+ namespace test_noexcept
+ {
+
+ int f() { return 0; }
+ int g() noexcept { return 0; }
+
+ static_assert(noexcept(f()) == false, "");
+ static_assert(noexcept(g()) == true, "");
+
+ }
+
+ namespace test_constexpr
+ {
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
+ {
+ return *s ? strlen_c_r(s + 1, acc + 1) : acc;
+ }
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c(const CharT *const s) noexcept
+ {
+ return strlen_c_r(s, 0UL);
+ }
+
+ static_assert(strlen_c("") == 0UL, "");
+ static_assert(strlen_c("1") == 1UL, "");
+ static_assert(strlen_c("example") == 7UL, "");
+ static_assert(strlen_c("another\0example") == 7UL, "");
+
+ }
+
+ namespace test_rvalue_references
+ {
+
+ template < int N >
+ struct answer
+ {
+ static constexpr int value = N;
+ };
+
+ answer<1> f(int&) { return answer<1>(); }
+ answer<2> f(const int&) { return answer<2>(); }
+ answer<3> f(int&&) { return answer<3>(); }
+
+ void
+ test()
+ {
+ int i = 0;
+ const int c = 0;
+ static_assert(decltype(f(i))::value == 1, "");
+ static_assert(decltype(f(c))::value == 2, "");
+ static_assert(decltype(f(0))::value == 3, "");
+ }
+
+ }
+
+ namespace test_uniform_initialization
+ {
+
+ struct test
+ {
+ static const int zero {};
+ static const int one {1};
+ };
+
+ static_assert(test::zero == 0, "");
+ static_assert(test::one == 1, "");
+
+ }
+
+ namespace test_lambdas
+ {
+
+ void
+ test1()
+ {
+ auto lambda1 = [](){};
+ auto lambda2 = lambda1;
+ lambda1();
+ lambda2();
+ }
+
+ int
+ test2()
+ {
+ auto a = [](int i, int j){ return i + j; }(1, 2);
+ auto b = []() -> int { return '0'; }();
+ auto c = [=](){ return a + b; }();
+ auto d = [&](){ return c; }();
+ auto e = [a, &b](int x) mutable {
+ const auto identity = [](int y){ return y; };
+ for (auto i = 0; i < a; ++i)
+ a += b--;
+ return x + identity(a + b);
+ }(0);
+ return a + b + c + d + e;
+ }
+
+ int
+ test3()
+ {
+ const auto nullary = [](){ return 0; };
+ const auto unary = [](int x){ return x; };
+ using nullary_t = decltype(nullary);
+ using unary_t = decltype(unary);
+ const auto higher1st = [](nullary_t f){ return f(); };
+ const auto higher2nd = [unary](nullary_t f1){
+ return [unary, f1](unary_t f2){ return f2(unary(f1())); };
+ };
+ return higher1st(nullary) + higher2nd(nullary)(unary);
+ }
+
+ }
+
+ namespace test_variadic_templates
+ {
+
+ template <int...>
+ struct sum;
+
+ template <int N0, int... N1toN>
+ struct sum<N0, N1toN...>
+ {
+ static constexpr auto value = N0 + sum<N1toN...>::value;
+ };
+
+ template <>
+ struct sum<>
+ {
+ static constexpr auto value = 0;
+ };
+
+ static_assert(sum<>::value == 0, "");
+ static_assert(sum<1>::value == 1, "");
+ static_assert(sum<23>::value == 23, "");
+ static_assert(sum<1, 2>::value == 3, "");
+ static_assert(sum<5, 5, 11>::value == 21, "");
+ static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
+
+ }
+
+ // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
+ // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
+ // because of this.
+ namespace test_template_alias_sfinae
+ {
+
+ struct foo {};
+
+ template<typename T>
+ using member = typename T::member_type;
+
+ template<typename T>
+ void func(...) {}
+
+ template<typename T>
+ void func(member<T>*) {}
- int a;
- decltype(a) b;
+ void test();
+
+ void test() { func<foo>(0); }
+
+ }
+
+} // namespace cxx11
+
+#endif // __cplusplus >= 201103L
- typedef check<int> check_type;
- check_type c;
- check_type&& cr = static_cast<check_type&&>(c);
- auto d = a;
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
@@ -4399,27 +4667,295 @@ $as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6;
if eval \${$cachevar+:} false; then :
$as_echo_n "(cached) " >&6
else
- ac_save_CXXFLAGS="$CXXFLAGS"
- CXXFLAGS="$CXXFLAGS $switch"
+ ac_save_CXX="$CXX"
+ CXX="$CXX $switch"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
- template <typename T>
+
+// If the compiler admits that it is not ready for C++11, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201103L
+
+#error "This is not a C++11 compiler"
+
+#else
+
+namespace cxx11
+{
+
+ namespace test_static_assert
+ {
+
+ template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
- typedef check<check<bool>> right_angle_brackets;
+ }
+
+ namespace test_final_override
+ {
+
+ struct Base
+ {
+ virtual void f() {}
+ };
+
+ struct Derived : public Base
+ {
+ virtual void f() override {}
+ };
+
+ }
+
+ namespace test_double_right_angle_brackets
+ {
+
+ template < typename T >
+ struct check {};
+
+ typedef check<void> single_type;
+ typedef check<check<void>> double_type;
+ typedef check<check<check<void>>> triple_type;
+ typedef check<check<check<check<void>>>> quadruple_type;
+
+ }
+
+ namespace test_decltype
+ {
+
+ int
+ f()
+ {
+ int a = 1;
+ decltype(a) b = 2;
+ return a + b;
+ }
+
+ }
+
+ namespace test_type_deduction
+ {
+
+ template < typename T1, typename T2 >
+ struct is_same
+ {
+ static const bool value = false;
+ };
+
+ template < typename T >
+ struct is_same<T, T>
+ {
+ static const bool value = true;
+ };
+
+ template < typename T1, typename T2 >
+ auto
+ add(T1 a1, T2 a2) -> decltype(a1 + a2)
+ {
+ return a1 + a2;
+ }
+
+ int
+ test(const int c, volatile int v)
+ {
+ static_assert(is_same<int, decltype(0)>::value == true, "");
+ static_assert(is_same<int, decltype(c)>::value == false, "");
+ static_assert(is_same<int, decltype(v)>::value == false, "");
+ auto ac = c;
+ auto av = v;
+ auto sumi = ac + av + 'x';
+ auto sumf = ac + av + 1.0;
+ static_assert(is_same<int, decltype(ac)>::value == true, "");
+ static_assert(is_same<int, decltype(av)>::value == true, "");
+ static_assert(is_same<int, decltype(sumi)>::value == true, "");
+ static_assert(is_same<int, decltype(sumf)>::value == false, "");
+ static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
+ return (sumf > 0.0) ? sumi : add(c, v);
+ }
+
+ }
+
+ namespace test_noexcept
+ {
+
+ int f() { return 0; }
+ int g() noexcept { return 0; }
+
+ static_assert(noexcept(f()) == false, "");
+ static_assert(noexcept(g()) == true, "");
+
+ }
+
+ namespace test_constexpr
+ {
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
+ {
+ return *s ? strlen_c_r(s + 1, acc + 1) : acc;
+ }
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c(const CharT *const s) noexcept
+ {
+ return strlen_c_r(s, 0UL);
+ }
+
+ static_assert(strlen_c("") == 0UL, "");
+ static_assert(strlen_c("1") == 1UL, "");
+ static_assert(strlen_c("example") == 7UL, "");
+ static_assert(strlen_c("another\0example") == 7UL, "");
+
+ }
+
+ namespace test_rvalue_references
+ {
+
+ template < int N >
+ struct answer
+ {
+ static constexpr int value = N;
+ };
+
+ answer<1> f(int&) { return answer<1>(); }
+ answer<2> f(const int&) { return answer<2>(); }
+ answer<3> f(int&&) { return answer<3>(); }
+
+ void
+ test()
+ {
+ int i = 0;
+ const int c = 0;
+ static_assert(decltype(f(i))::value == 1, "");
+ static_assert(decltype(f(c))::value == 2, "");
+ static_assert(decltype(f(0))::value == 3, "");
+ }
+
+ }
+
+ namespace test_uniform_initialization
+ {
+
+ struct test
+ {
+ static const int zero {};
+ static const int one {1};
+ };
+
+ static_assert(test::zero == 0, "");
+ static_assert(test::one == 1, "");
+
+ }
+
+ namespace test_lambdas
+ {
+
+ void
+ test1()
+ {
+ auto lambda1 = [](){};
+ auto lambda2 = lambda1;
+ lambda1();
+ lambda2();
+ }
+
+ int
+ test2()
+ {
+ auto a = [](int i, int j){ return i + j; }(1, 2);
+ auto b = []() -> int { return '0'; }();
+ auto c = [=](){ return a + b; }();
+ auto d = [&](){ return c; }();
+ auto e = [a, &b](int x) mutable {
+ const auto identity = [](int y){ return y; };
+ for (auto i = 0; i < a; ++i)
+ a += b--;
+ return x + identity(a + b);
+ }(0);
+ return a + b + c + d + e;
+ }
+
+ int
+ test3()
+ {
+ const auto nullary = [](){ return 0; };
+ const auto unary = [](int x){ return x; };
+ using nullary_t = decltype(nullary);
+ using unary_t = decltype(unary);
+ const auto higher1st = [](nullary_t f){ return f(); };
+ const auto higher2nd = [unary](nullary_t f1){
+ return [unary, f1](unary_t f2){ return f2(unary(f1())); };
+ };
+ return higher1st(nullary) + higher2nd(nullary)(unary);
+ }
+
+ }
+
+ namespace test_variadic_templates
+ {
+
+ template <int...>
+ struct sum;
+
+ template <int N0, int... N1toN>
+ struct sum<N0, N1toN...>
+ {
+ static constexpr auto value = N0 + sum<N1toN...>::value;
+ };
+
+ template <>
+ struct sum<>
+ {
+ static constexpr auto value = 0;
+ };
+
+ static_assert(sum<>::value == 0, "");
+ static_assert(sum<1>::value == 1, "");
+ static_assert(sum<23>::value == 23, "");
+ static_assert(sum<1, 2>::value == 3, "");
+ static_assert(sum<5, 5, 11>::value == 21, "");
+ static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
+
+ }
+
+ // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
+ // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
+ // because of this.
+ namespace test_template_alias_sfinae
+ {
+
+ struct foo {};
+
+ template<typename T>
+ using member = typename T::member_type;
+
+ template<typename T>
+ void func(...) {}
+
+ template<typename T>
+ void func(member<T>*) {}
+
+ void test();
- int a;
- decltype(a) b;
+ void test() { func<foo>(0); }
+
+ }
+
+} // namespace cxx11
+
+#endif // __cplusplus >= 201103L
- typedef check<int> check_type;
- check_type c;
- check_type&& cr = static_cast<check_type&&>(c);
- auto d = a;
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
@@ -4428,13 +4964,16 @@ else
eval $cachevar=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
- CXXFLAGS="$ac_save_CXXFLAGS"
+ CXX="$ac_save_CXX"
fi
eval ac_res=\$$cachevar
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
$as_echo "$ac_res" >&6; }
if eval test x\$$cachevar = xyes; then
- CXXFLAGS="$CXXFLAGS $switch"
+ CXX="$CXX $switch"
+ if test -n "$CXXCPP" ; then
+ CXXCPP="$CXXCPP $switch"
+ fi
ac_success=yes
break
fi
@@ -4442,34 +4981,302 @@ $as_echo "$ac_res" >&6; }
fi
if test x$ac_success = xno; then
- for switch in -std=c++11 -std=c++0x; do
+ for switch in -std=c++11 -std=c++0x +std=c++11 "-h std=c++11"; do
cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
if eval \${$cachevar+:} false; then :
$as_echo_n "(cached) " >&6
else
- ac_save_CXXFLAGS="$CXXFLAGS"
- CXXFLAGS="$CXXFLAGS $switch"
+ ac_save_CXX="$CXX"
+ CXX="$CXX $switch"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
- template <typename T>
+
+// If the compiler admits that it is not ready for C++11, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201103L
+
+#error "This is not a C++11 compiler"
+
+#else
+
+namespace cxx11
+{
+
+ namespace test_static_assert
+ {
+
+ template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
- typedef check<check<bool>> right_angle_brackets;
+ }
+
+ namespace test_final_override
+ {
+
+ struct Base
+ {
+ virtual void f() {}
+ };
+
+ struct Derived : public Base
+ {
+ virtual void f() override {}
+ };
+
+ }
+
+ namespace test_double_right_angle_brackets
+ {
+
+ template < typename T >
+ struct check {};
+
+ typedef check<void> single_type;
+ typedef check<check<void>> double_type;
+ typedef check<check<check<void>>> triple_type;
+ typedef check<check<check<check<void>>>> quadruple_type;
+
+ }
+
+ namespace test_decltype
+ {
+
+ int
+ f()
+ {
+ int a = 1;
+ decltype(a) b = 2;
+ return a + b;
+ }
+
+ }
+
+ namespace test_type_deduction
+ {
+
+ template < typename T1, typename T2 >
+ struct is_same
+ {
+ static const bool value = false;
+ };
+
+ template < typename T >
+ struct is_same<T, T>
+ {
+ static const bool value = true;
+ };
+
+ template < typename T1, typename T2 >
+ auto
+ add(T1 a1, T2 a2) -> decltype(a1 + a2)
+ {
+ return a1 + a2;
+ }
+
+ int
+ test(const int c, volatile int v)
+ {
+ static_assert(is_same<int, decltype(0)>::value == true, "");
+ static_assert(is_same<int, decltype(c)>::value == false, "");
+ static_assert(is_same<int, decltype(v)>::value == false, "");
+ auto ac = c;
+ auto av = v;
+ auto sumi = ac + av + 'x';
+ auto sumf = ac + av + 1.0;
+ static_assert(is_same<int, decltype(ac)>::value == true, "");
+ static_assert(is_same<int, decltype(av)>::value == true, "");
+ static_assert(is_same<int, decltype(sumi)>::value == true, "");
+ static_assert(is_same<int, decltype(sumf)>::value == false, "");
+ static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
+ return (sumf > 0.0) ? sumi : add(c, v);
+ }
+
+ }
+
+ namespace test_noexcept
+ {
+
+ int f() { return 0; }
+ int g() noexcept { return 0; }
+
+ static_assert(noexcept(f()) == false, "");
+ static_assert(noexcept(g()) == true, "");
+
+ }
+
+ namespace test_constexpr
+ {
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
+ {
+ return *s ? strlen_c_r(s + 1, acc + 1) : acc;
+ }
+
+ template < typename CharT >
+ unsigned long constexpr
+ strlen_c(const CharT *const s) noexcept
+ {
+ return strlen_c_r(s, 0UL);
+ }
+
+ static_assert(strlen_c("") == 0UL, "");
+ static_assert(strlen_c("1") == 1UL, "");
+ static_assert(strlen_c("example") == 7UL, "");
+ static_assert(strlen_c("another\0example") == 7UL, "");
+
+ }
+
+ namespace test_rvalue_references
+ {
+
+ template < int N >
+ struct answer
+ {
+ static constexpr int value = N;
+ };
+
+ answer<1> f(int&) { return answer<1>(); }
+ answer<2> f(const int&) { return answer<2>(); }
+ answer<3> f(int&&) { return answer<3>(); }
+
+ void
+ test()
+ {
+ int i = 0;
+ const int c = 0;
+ static_assert(decltype(f(i))::value == 1, "");
+ static_assert(decltype(f(c))::value == 2, "");
+ static_assert(decltype(f(0))::value == 3, "");
+ }
+
+ }
+
+ namespace test_uniform_initialization
+ {
+
+ struct test
+ {
+ static const int zero {};
+ static const int one {1};
+ };
+
+ static_assert(test::zero == 0, "");
+ static_assert(test::one == 1, "");
+
+ }
+
+ namespace test_lambdas
+ {
+
+ void
+ test1()
+ {
+ auto lambda1 = [](){};
+ auto lambda2 = lambda1;
+ lambda1();
+ lambda2();
+ }
+
+ int
+ test2()
+ {
+ auto a = [](int i, int j){ return i + j; }(1, 2);
+ auto b = []() -> int { return '0'; }();
+ auto c = [=](){ return a + b; }();
+ auto d = [&](){ return c; }();
+ auto e = [a, &b](int x) mutable {
+ const auto identity = [](int y){ return y; };
+ for (auto i = 0; i < a; ++i)
+ a += b--;
+ return x + identity(a + b);
+ }(0);
+ return a + b + c + d + e;
+ }
+
+ int
+ test3()
+ {
+ const auto nullary = [](){ return 0; };
+ const auto unary = [](int x){ return x; };
+ using nullary_t = decltype(nullary);
+ using unary_t = decltype(unary);
+ const auto higher1st = [](nullary_t f){ return f(); };
+ const auto higher2nd = [unary](nullary_t f1){
+ return [unary, f1](unary_t f2){ return f2(unary(f1())); };
+ };
+ return higher1st(nullary) + higher2nd(nullary)(unary);
+ }
+
+ }
+
+ namespace test_variadic_templates
+ {
+
+ template <int...>
+ struct sum;
+
+ template <int N0, int... N1toN>
+ struct sum<N0, N1toN...>
+ {
+ static constexpr auto value = N0 + sum<N1toN...>::value;
+ };
+
+ template <>
+ struct sum<>
+ {
+ static constexpr auto value = 0;
+ };
+
+ static_assert(sum<>::value == 0, "");
+ static_assert(sum<1>::value == 1, "");
+ static_assert(sum<23>::value == 23, "");
+ static_assert(sum<1, 2>::value == 3, "");
+ static_assert(sum<5, 5, 11>::value == 21, "");
+ static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
+
+ }
+
+ // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
+ // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
+ // because of this.
+ namespace test_template_alias_sfinae
+ {
+
+ struct foo {};
+
+ template<typename T>
+ using member = typename T::member_type;
+
+ template<typename T>
+ void func(...) {}
+
+ template<typename T>
+ void func(member<T>*) {}
+
+ void test();
+
+ void test() { func<foo>(0); }
- int a;
- decltype(a) b;
+ }
+
+} // namespace cxx11
+
+#endif // __cplusplus >= 201103L
- typedef check<int> check_type;
- check_type c;
- check_type&& cr = static_cast<check_type&&>(c);
- auto d = a;
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
@@ -4478,13 +5285,16 @@ else
eval $cachevar=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
- CXXFLAGS="$ac_save_CXXFLAGS"
+ CXX="$ac_save_CXX"
fi
eval ac_res=\$$cachevar
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
$as_echo "$ac_res" >&6; }
if eval test x\$$cachevar = xyes; then
- CXXFLAGS="$CXXFLAGS $switch"
+ CXX="$CXX $switch"
+ if test -n "$CXXCPP" ; then
+ CXXCPP="$CXXCPP $switch"
+ fi
ac_success=yes
break
fi
@@ -4500,22 +5310,20 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
if test x$ac_success = xno; then
as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5
fi
- else
- if test x$ac_success = xno; then
- HAVE_CXX11=0
- { $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
+ fi
+ if test x$ac_success = xno; then
+ HAVE_CXX11=0
+ { $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
$as_echo "$as_me: No compiler with C++11 support was found" >&6;}
- else
- HAVE_CXX11=1
+ else
+ HAVE_CXX11=1
$as_echo "#define HAVE_CXX11 1" >>confdefs.h
- fi
-
-
fi
+
# Checks for libraries.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for omp_set_num_threads in -lgomp" >&5
@@ -6209,7 +7017,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Stacks $as_me 1.42, which was
+This file was extended by Stacks $as_me 1.44, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6275,7 +7083,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-Stacks config.status 1.42
+Stacks config.status 1.44
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index fb4e057..ca61f7a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([Stacks], [1.42])
+AC_INIT([Stacks], [1.44])
AC_CONFIG_AUX_DIR([config])
AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
AC_CONFIG_SRCDIR([src/ustacks.cc])
@@ -45,7 +45,7 @@ AC_SUBST([SPARSEHASH_CFLAGS])
# Checks for programs.
AC_PROG_CXX
AM_PROG_CC_C_O
-AX_CXX_COMPILE_STDCXX_11(, [mandatory])
+AX_CXX_COMPILE_STDCXX(11,, [mandatory])
# Checks for libraries.
AC_CHECK_LIB([gomp], [omp_set_num_threads],, [AC_MSG_WARN([Unable to locate OpenMP library, you should probably specify '--disable-openmp'.])])
diff --git a/scripts/load_sequences.pl b/scripts/load_sequences.pl
index c3adf24..dbc6cd2 100755
--- a/scripts/load_sequences.pl
+++ b/scripts/load_sequences.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
#
# Copyright 2011, Julian Catchen <jcatchen at uoregon.edu>
#
diff --git a/scripts/ref_map.pl b/scripts/ref_map.pl
index 83dd103..acc2aab 100755
--- a/scripts/ref_map.pl
+++ b/scripts/ref_map.pl
@@ -776,9 +776,6 @@ sub parse_command_line {
usage();
}
- } elsif ($_ =~ /^-t$/) {
- push(@_pstacks, "-d ");
-
} elsif ($_ =~ /^-T$/) {
$arg = shift @ARGV;
push(@_pstacks, "-p " . $arg);
@@ -872,7 +869,7 @@ sub usage {
version();
print STDERR <<EOQ;
-ref_map.pl -p path -r path [-s path] -o path [-t] [-m min_cov] [-M mismatches] [-T num_threads] [-A type] [-O popmap] [-B db -b batch_id -D "desc"] [-S -i num] [-e path] [-d] [-h]
+ref_map.pl -p path -r path [-s path] -o path [-m min_cov] [-T num_threads] [-A type] [-O popmap] [-B db -b batch_id -D "desc"] [-S -i num] [-e path] [-d] [-h]
b: batch ID representing this dataset (an integer, e.g. 1, 2, 3).
o: path to write pipeline output files.
O: if analyzing one or more populations, specify a pOpulation map.
diff --git a/src/BamI.h b/src/BamI.h
index 5e2a9cf..92f4ff4 100644
--- a/src/BamI.h
+++ b/src/BamI.h
@@ -47,19 +47,19 @@ class Bam: public Input {
public:
Bam(const char *path) : Input() {
- this->path = string(path);
- this->bam_fh = hts_open(path, "r");
- this->aln = bam_init1();
+ this->path = string(path);
+ this->bam_fh = hts_open(path, "r");
+ this->aln = bam_init1();
- this->parse_header();
+ this->parse_header();
};
~Bam() {
- hts_close(this->bam_fh);
+ hts_close(this->bam_fh);
bam_hdr_destroy(this->bamh);
- bam_destroy1(this->aln);
+ bam_destroy1(this->aln);
};
Seq *next_seq();
- int next_seq(Seq &) { return 0; };
+ int next_seq(Seq&);
};
int
@@ -69,39 +69,51 @@ Bam::parse_header()
this->bamh = sam_hdr_read(this->bam_fh);
for (uint j = 0; j < (uint) this->bamh->n_targets; j++) {
- //
- // Record the mapping from integer ID to chromosome name that we will see in BAM records.
- //
- this->chrs[j] = string(this->bamh->target_name[j]);
+ //
+ // Record the mapping from integer ID to chromosome name that we will see in BAM records.
+ //
+ this->chrs[j] = string(this->bamh->target_name[j]);
}
return 0;
}
Seq *
-Bam::next_seq()
+Bam::next_seq()
+{
+ Seq* s = new Seq();
+ if(next_seq(*s) != 1) {
+ delete s;
+ s = NULL;
+ }
+ return s;
+}
+
+int
+Bam::next_seq(Seq& s)
{
int bytes_read = 0;
+ int sflag = 0;
int flag = 0;
//
// Read a record from the file, skipping unmapped reads, and place it in a Seq object.
//
do {
- bytes_read = sam_read1(this->bam_fh, this->bamh, this->aln);
+ bytes_read = sam_read1(this->bam_fh, this->bamh, this->aln);
if (bytes_read <= 0)
- return NULL;
+ return 0;
- flag = ((this->aln->core.flag & BAM_FUNMAP) != 0);
+ flag = ((this->aln->core.flag & BAM_FUNMAP) != 0);
} while (flag == 1);
//
- // Check which strand this is aligned to:
+ // Check which strand this is aligned to:
// SAM reference: FLAG bit 0x10 - sequence is reverse complemented
//
- flag = ((this->aln->core.flag & BAM_FREVERSE) != 0);
+ sflag = ((this->aln->core.flag & BAM_FREVERSE) != 0);
//
// If the read was aligned on the reverse strand (and is therefore reverse complemented)
@@ -111,11 +123,25 @@ Bam::next_seq()
// To accomplish this, we must parse the alignment CIGAR string
//
vector<pair<char, uint> > cigar;
- this->parse_bam_cigar(cigar, flag);
+ this->parse_bam_cigar(cigar, sflag);
- uint bp = flag ?
- this->find_start_bp_neg(this->aln->core.pos, cigar) :
- this->find_start_bp_pos(this->aln->core.pos, cigar);
+ uint bp = sflag ?
+ this->find_start_bp_neg(this->aln->core.pos, cigar) :
+ this->find_start_bp_pos(this->aln->core.pos, cigar);
+
+ //
+ // Check if this is the primary or secondary alignment.
+ //
+ alnt aln_type = pri_aln;
+ flag = ((this->aln->core.flag & BAM_FSECONDARY) != 0);
+ if (flag)
+ aln_type = sec_aln;
+ //
+ // Check if this is a supplementary (chimeric) alignment (not yet defined in Bam.h).
+ //
+ flag = ((this->aln->core.flag & 2048) != 0);
+ if (flag)
+ aln_type = sup_aln;
//
// Fetch the sequence.
@@ -124,26 +150,26 @@ Bam::next_seq()
uint8_t j;
seq.reserve(this->aln->core.l_qseq);
-
+
for (int i = 0; i < this->aln->core.l_qseq; i++) {
- j = bam_seqi(bam_get_seq(this->aln), i);
- switch(j) {
- case 1:
- seq += 'A';
- break;
- case 2:
- seq += 'C';
- break;
- case 4:
- seq += 'G';
- break;
- case 8:
- seq += 'T';
- break;
- case 15:
- seq += 'N';
- break;
- }
+ j = bam_seqi(bam_get_seq(this->aln), i);
+ switch(j) {
+ case 1:
+ seq += 'A';
+ break;
+ case 2:
+ seq += 'C';
+ break;
+ case 4:
+ seq += 'G';
+ break;
+ case 8:
+ seq += 'T';
+ break;
+ case 15:
+ seq += 'N';
+ break;
+ }
}
//
@@ -152,21 +178,45 @@ Bam::next_seq()
string qual;
uint8_t *q = bam_get_qual(this->aln);
for (int i = 0; i < this->aln->core.l_qseq; i++) {
- qual += char(int(q[i]) + 33);
+ qual += char(int(q[i]) + 33);
}
string chr = this->chrs[this->aln->core.tid];
- Seq *s = new Seq((const char *) bam_get_qname(this->aln), seq.c_str(), qual.c_str(),
- chr.c_str(), bp, flag ? strand_minus : strand_plus);
+ //
+ // Calculate the percentage of the sequence that was aligned to the reference.
+ //
+ double len = 0.0;
+ for (uint i = 0; i < cigar.size(); i++)
+ switch (cigar[i].first) {
+ case 'M':
+ case 'I':
+ case '=':
+ case 'X':
+ len += cigar[i].second;
+ break;
+ case 'D':
+ case 'S':
+ case 'H':
+ case 'N':
+ break;
+ default:
+ cerr << "Error parsing CIGAR string '" << cigar[i].second << cigar[i].first << "'.\n";
+ break;
+ }
+ double pct_aln = len / double(seq.length());
+
+ s = Seq((const char *) bam_get_qname(this->aln), seq.c_str(), qual.c_str(),
+ chr.c_str(), bp, sflag ? strand_minus : strand_plus,
+ aln_type, pct_aln);
if (cigar.size() > 0)
- this->edit_gaps(cigar, s->seq);
+ this->edit_gaps(cigar, s.seq);
- return s;
+ return 1;
}
-int
+int
Bam::parse_bam_cigar(vector<pair<char, uint> > &cigar, bool orientation)
{
int op, len;
@@ -174,47 +224,56 @@ Bam::parse_bam_cigar(vector<pair<char, uint> > &cigar, bool orientation)
uint32_t *cgr = bam_get_cigar(this->aln);
for (int k = 0; k < this->aln->core.n_cigar; k++) {
- op = cgr[k] & BAM_CIGAR_MASK;
- len = cgr[k] >> BAM_CIGAR_SHIFT;
-
- switch(op) {
- case BAM_CMATCH:
- c = 'M';
- break;
- case BAM_CINS:
- c = 'I';
- break;
- case BAM_CDEL:
- c = 'D';
- break;
- case BAM_CREF_SKIP:
- c = 'N';
- break;
- case BAM_CSOFT_CLIP:
- c = 'S';
- break;
- case BAM_CHARD_CLIP:
- c = 'H';
- break;
- case BAM_CPAD:
- c = 'P';
- break;
- }
-
- //
- // If aligned to the negative strand, sequence has been reverse complemented and
- // CIGAR string should be interpreted in reverse.
- //
- if (orientation == strand_plus)
- cigar.push_back(make_pair(c, len));
- else
- cigar.insert(cigar.begin(), make_pair(c, len));
+ op = cgr[k] & BAM_CIGAR_MASK;
+ len = cgr[k] >> BAM_CIGAR_SHIFT;
+
+ switch(op) {
+ case BAM_CMATCH:
+ c = 'M';
+ break;
+ case BAM_CEQUAL:
+ c = '=';
+ break;
+ case BAM_CDIFF:
+ c = 'X';
+ break;
+ case BAM_CINS:
+ c = 'I';
+ break;
+ case BAM_CDEL:
+ c = 'D';
+ break;
+ case BAM_CREF_SKIP:
+ c = 'N';
+ break;
+ case BAM_CSOFT_CLIP:
+ c = 'S';
+ break;
+ case BAM_CHARD_CLIP:
+ c = 'H';
+ break;
+ case BAM_CPAD:
+ c = 'P';
+ break;
+ default:
+ cerr << "Unknown operation present in CIGAR string.\n";
+ break;
+ }
+
+ //
+ // If aligned to the negative strand, sequence has been reverse complemented and
+ // CIGAR string should be interpreted in reverse.
+ //
+ if (orientation == strand_plus)
+ cigar.push_back(make_pair(c, len));
+ else
+ cigar.insert(cigar.begin(), make_pair(c, len));
}
return 0;
}
-int
+int
Bam::parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar, bool orientation)
{
char buf[id_len];
@@ -226,30 +285,30 @@ Bam::parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar, bool o
if (*p == '*') return 0;
while (*p != '\0') {
- q = p + 1;
-
- while (*q != '\0' && isdigit(*q))
- q++;
- strncpy(buf, p, q - p);
- buf[q-p] = '\0';
- dist = atoi(buf);
-
- //
- // If aligned to the negative strand, sequence has been reverse complemented and
- // CIGAR string should be interpreted in reverse.
- //
- if (orientation == strand_plus)
- cigar.push_back(make_pair(*q, dist));
- else
- cigar.insert(cigar.begin(), make_pair(*q, dist));
-
- p = q + 1;
+ q = p + 1;
+
+ while (*q != '\0' && isdigit(*q))
+ q++;
+ strncpy(buf, p, q - p);
+ buf[q-p] = '\0';
+ dist = atoi(buf);
+
+ //
+ // If aligned to the negative strand, sequence has been reverse complemented and
+ // CIGAR string should be interpreted in reverse.
+ //
+ if (orientation == strand_plus)
+ cigar.push_back(make_pair(*q, dist));
+ else
+ cigar.insert(cigar.begin(), make_pair(*q, dist));
+
+ p = q + 1;
}
return 0;
}
-int
+int
Bam::find_start_bp_neg(int aln_bp, vector<pair<char, uint> > &cigar)
{
uint size = cigar.size();
@@ -257,27 +316,33 @@ Bam::find_start_bp_neg(int aln_bp, vector<pair<char, uint> > &cigar)
uint dist;
for (uint i = 0; i < size; i++) {
- op = cigar[i].first;
- dist = cigar[i].second;
-
- switch(op) {
- case 'I':
- break;
- case 'S':
- if (i < size - 1)
- aln_bp += dist;
- break;
- case 'M':
- case 'D':
- aln_bp += dist;
- break;
- }
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'I':
+ case 'H':
+ break;
+ case 'S':
+ if (i < size - 1)
+ aln_bp += dist;
+ break;
+ case 'M':
+ case '=':
+ case 'X':
+ case 'D':
+ case 'N':
+ aln_bp += dist;
+ break;
+ default:
+ break;
+ }
}
return aln_bp - 1;
}
-int
+int
Bam::find_start_bp_pos(int aln_bp, vector<pair<char, uint> > &cigar)
{
char op;
@@ -287,12 +352,12 @@ Bam::find_start_bp_pos(int aln_bp, vector<pair<char, uint> > &cigar)
dist = cigar[0].second;
if (op == 'S')
- aln_bp -= dist;
+ aln_bp -= dist;
return aln_bp;
}
-int
+int
Bam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
{
char *buf;
@@ -307,78 +372,80 @@ Bam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
buf_size = len + 1;
for (uint i = 0; i < size; i++) {
- op = cigar[i].first;
- dist = cigar[i].second;
-
- switch(op) {
- case 'S':
- stop = bp + dist;
- stop = stop > len ? len : stop;
- while (bp < stop) {
- seq[bp] = 'N';
- bp++;
- }
- break;
- case 'D':
- //
- // A deletion has occured in the read relative to the reference genome.
- // Pad the read with sufficent Ns to match the deletion, shifting the existing
- // sequence down. Trim the final length to keep the read length consistent.
- //
- k = bp >= len ? len : bp;
-
- strncpy(buf, seq + k, buf_size - 1);
- buf[buf_size - 1] = '\0';
- buf_len = strlen(buf);
-
- stop = bp + dist;
- stop = stop > len ? len : stop;
- while (bp < stop) {
- seq[bp] = 'N';
- bp++;
- }
-
- j = bp;
- k = 0;
- while (j < len && k < buf_len) {
- seq[j] = buf[k];
- k++;
- j++;
- }
- break;
- case 'I':
- //
- // An insertion has occurred in the read relative to the reference genome. Delete the
- // inserted bases and pad the end of the read with Ns.
- //
- if (bp >= len) break;
-
- k = bp + dist > len ? len : bp + dist;
- strncpy(buf, seq + k, buf_size - 1);
- buf[buf_size - 1] = '\0';
- buf_len = strlen(buf);
-
- j = bp;
- k = 0;
- while (j < len && k < buf_len) {
- seq[j] = buf[k];
- k++;
- j++;
- }
-
- stop = j + dist;
- stop = stop > len ? len : stop;
- while (j < stop) {
- seq[j] = 'N';
- j++;
- }
- break;
- case 'M':
- bp += dist;
- break;
- default:
- break;
- }
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'S':
+ stop = bp + dist;
+ stop = stop > len ? len : stop;
+ while (bp < stop) {
+ seq[bp] = 'N';
+ bp++;
+ }
+ break;
+ case 'D':
+ //
+ // A deletion has occurred in the read relative to the reference genome.
+ // Pad the read with sufficient Ns to match the deletion, shifting the existing
+ // sequence down. Trim the final length to keep the read length consistent.
+ //
+ k = bp >= len ? len : bp;
+
+ strncpy(buf, seq + k, buf_size - 1);
+ buf[buf_size - 1] = '\0';
+ buf_len = strlen(buf);
+
+ stop = bp + dist;
+ stop = stop > len ? len : stop;
+ while (bp < stop) {
+ seq[bp] = 'N';
+ bp++;
+ }
+
+ j = bp;
+ k = 0;
+ while (j < len && k < buf_len) {
+ seq[j] = buf[k];
+ k++;
+ j++;
+ }
+ break;
+ case 'I':
+ //
+ // An insertion has occurred in the read relative to the reference genome. Delete the
+ // inserted bases and pad the end of the read with Ns.
+ //
+ if (bp >= len) break;
+
+ k = bp + dist > len ? len : bp + dist;
+ strncpy(buf, seq + k, buf_size - 1);
+ buf[buf_size - 1] = '\0';
+ buf_len = strlen(buf);
+
+ j = bp;
+ k = 0;
+ while (j < len && k < buf_len) {
+ seq[j] = buf[k];
+ k++;
+ j++;
+ }
+
+ stop = j + dist;
+ stop = stop > len ? len : stop;
+ while (j < stop) {
+ seq[j] = 'N';
+ j++;
+ }
+ break;
+ case 'M':
+ case '=':
+ case 'X':
+ bp += dist;
+ break;
+ default:
+ break;
+ }
}
delete [] buf;
diff --git a/src/BamUnalignedI.h b/src/BamUnalignedI.h
index 73f8496..a3b4021 100644
--- a/src/BamUnalignedI.h
+++ b/src/BamUnalignedI.h
@@ -42,26 +42,26 @@ class BamUnAln: public Input {
public:
BamUnAln(const char *path) : Input() {
- this->path = string(path);
- this->bam_fh = hts_open(path, "r");
- this->aln = bam_init1();
+ this->path = string(path);
+ this->bam_fh = hts_open(path, "r");
+ this->aln = bam_init1();
- this->parse_header();
+ this->parse_header();
};
BamUnAln(string path) : Input() {
- this->path = path;
- this->bam_fh = hts_open(path.c_str(), "r");
- this->aln = bam_init1();
+ this->path = path;
+ this->bam_fh = hts_open(path.c_str(), "r");
+ this->aln = bam_init1();
- this->parse_header();
+ this->parse_header();
};
~BamUnAln() {
- hts_close(this->bam_fh);
+ hts_close(this->bam_fh);
bam_hdr_destroy(this->bamh);
- bam_destroy1(this->aln);
+ bam_destroy1(this->aln);
};
Seq *next_seq();
- int next_seq(Seq &) { return 0; };
+ int next_seq(Seq &);
};
int
@@ -71,17 +71,28 @@ BamUnAln::parse_header()
this->bamh = sam_hdr_read(this->bam_fh);
for (uint j = 0; j < (uint) this->bamh->n_targets; j++) {
- //
- // Record the mapping from integer ID to chromosome name that we will see in BAM records.
- //
- this->chrs[j] = string(this->bamh->target_name[j]);
+ //
+ // Record the mapping from integer ID to chromosome name that we will see in BAM records.
+ //
+ this->chrs[j] = string(this->bamh->target_name[j]);
}
return 0;
}
Seq *
-BamUnAln::next_seq()
+BamUnAln::next_seq()
+{
+ Seq* s = new Seq();
+ if(next_seq(*s) != 1) {
+ delete s;
+ s = NULL;
+ }
+ return s;
+}
+
+int
+BamUnAln::next_seq(Seq& s)
{
int bytes_read = 0;
@@ -91,7 +102,7 @@ BamUnAln::next_seq()
bytes_read = sam_read1(this->bam_fh, this->bamh, this->aln);
if (bytes_read <= 0)
- return NULL;
+ return 0;
//
// Fetch the sequence.
@@ -100,26 +111,26 @@ BamUnAln::next_seq()
uint8_t j;
seq.reserve(this->aln->core.l_qseq);
-
+
for (int i = 0; i < this->aln->core.l_qseq; i++) {
- j = bam_seqi(bam_get_seq(this->aln), i);
- switch(j) {
- case 1:
- seq += 'A';
- break;
- case 2:
- seq += 'C';
- break;
- case 4:
- seq += 'G';
- break;
- case 8:
- seq += 'T';
- break;
- case 15:
- seq += 'N';
- break;
- }
+ j = bam_seqi(bam_get_seq(this->aln), i);
+ switch(j) {
+ case 1:
+ seq += 'A';
+ break;
+ case 2:
+ seq += 'C';
+ break;
+ case 4:
+ seq += 'G';
+ break;
+ case 8:
+ seq += 'T';
+ break;
+ case 15:
+ seq += 'N';
+ break;
+ }
}
//
@@ -128,7 +139,7 @@ BamUnAln::next_seq()
string qual;
uint8_t *q = bam_get_qual(this->aln);
for (int i = 0; i < this->aln->core.l_qseq; i++) {
- qual += char(int(q[i]) + 33);
+ qual += char(int(q[i]) + 33);
}
string chr = this->chrs[this->aln->core.tid];
@@ -137,9 +148,9 @@ BamUnAln::next_seq()
// Attempt to parse the query name for this read.
//
- Seq *s = new Seq((const char *) bam_get_qname(this->aln), seq.c_str(), qual.c_str());
+ s = Seq((const char *) bam_get_qname(this->aln), seq.c_str(), qual.c_str());
- return s;
+ return 1;
}
#else // If HAVE_BAM is undefined and BAM library is not present.
diff --git a/src/BowtieI.h b/src/BowtieI.h
index 2d30c18..6b4005a 100644
--- a/src/BowtieI.h
+++ b/src/BowtieI.h
@@ -38,10 +38,19 @@ class Bowtie: public Input {
Bowtie(const char *path) : Input(path) {};
~Bowtie() {};
Seq *next_seq();
- int next_seq(Seq &) { return 0; };
+ int next_seq(Seq &);
};
Seq *Bowtie::next_seq() {
+ Seq* s = new Seq();
+ if(next_seq(*s) != 1) {
+ delete s;
+ s = NULL;
+ }
+ return s;
+}
+
+int Bowtie::next_seq(Seq& s) {
vector<string> parts;
//
@@ -50,7 +59,7 @@ Seq *Bowtie::next_seq() {
this->fh.getline(this->line, max_len);
if (!this->fh.good()) {
- return NULL;
+ return 0;
}
parse_tsv(this->line, parts);
@@ -64,10 +73,10 @@ Seq *Bowtie::next_seq() {
//
int bp = strand == strand_plus ? atoi(parts[3].c_str()) : atoi(parts[3].c_str()) + parts[4].length();
- Seq *s = new Seq(parts[0].c_str(), parts[4].c_str(), parts[5].c_str(),
- parts[2].c_str(), bp, strand);
+ s = Seq(parts[0].c_str(), parts[4].c_str(), parts[5].c_str(),
+ parts[2].c_str(), bp, strand);
- return s;
+ return 1;
}
#endif // __BOWTIEI_H__
diff --git a/src/BustardI.h b/src/BustardI.h
index 3385984..0ccf835 100644
--- a/src/BustardI.h
+++ b/src/BustardI.h
@@ -51,15 +51,15 @@ Seq *Bustard::next_seq() {
this->fh.getline(this->line, max_len);
if (!this->fh.good()) {
- return NULL;
+ return NULL;
}
parse_tsv(this->line, parts);
if (parts.size() != num_bustd_fields) {
- cerr << "Error parsing '" << this->path.c_str() << "' found " << parts.size() << " fields, but expecting " << num_bustd_fields << "). "
- << "Perhaps you should specify the input file type (-i)?\n";
- return NULL;
+ cerr << "Error parsing '" << this->path.c_str() << "' found " << parts.size() << " fields, but expecting " << num_bustd_fields << "). "
+ << "Perhaps you should specify the input file type (-i)?\n";
+ return NULL;
}
Seq *s = new Seq;
@@ -70,13 +70,13 @@ Seq *Bustard::next_seq() {
s->id = new char[id_len];
sprintf(s->id, "@%s:%s:%s:%s:%s#%s/%s",
- parts[0].c_str(),
- parts[2].c_str(),
- parts[3].c_str(),
- parts[4].c_str(),
- parts[5].c_str(),
- parts[6].c_str(),
- parts[7].c_str());
+ parts[0].c_str(),
+ parts[2].c_str(),
+ parts[3].c_str(),
+ parts[4].c_str(),
+ parts[5].c_str(),
+ parts[6].c_str(),
+ parts[7].c_str());
return s;
}
@@ -90,7 +90,7 @@ int Bustard::next_seq(Seq &s) {
this->fh.getline(this->line, max_len);
if (!this->fh.good()) {
- return 0;
+ return 0;
}
parse_tsv(this->line, parts);
@@ -99,13 +99,13 @@ int Bustard::next_seq(Seq &s) {
strcpy(s.qual, parts[3].c_str());
sprintf(s.id, "@%s:%s:%s:%s:%s#%s/%s",
- parts[0].c_str(),
- parts[1].c_str(),
- parts[2].c_str(),
- parts[3].c_str(),
- parts[4].c_str(),
- parts[5].c_str(),
- parts[6].c_str());
+ parts[0].c_str(),
+ parts[1].c_str(),
+ parts[2].c_str(),
+ parts[3].c_str(),
+ parts[4].c_str(),
+ parts[5].c_str(),
+ parts[6].c_str());
return 1;
}
diff --git a/src/DNANSeq.cc b/src/DNANSeq.cc
index 6b9a2da..fd4ef17 100644
--- a/src/DNANSeq.cc
+++ b/src/DNANSeq.cc
@@ -50,7 +50,7 @@ DNANSeq::DNANSeq(int size, unsigned char *seq) {
this->s = new unsigned char[bytes];
for (unsigned int i = 0; i < bytes; i++)
- this->s[i] = seq[i];
+ this->s[i] = seq[i];
}
DNANSeq::DNANSeq(int size, const char *seq) {
@@ -64,47 +64,64 @@ DNANSeq::DNANSeq(int size, const char *seq) {
int bit = 0;
for (int i = 0; i < size; i++) {
- switch (seq[i]) {
- case 'A':
- case 'a':
- // A == 000
- bit += 3;
- break;
- case 'C':
- case 'c':
- // C == 001
- bit += 2;
- BITSET(this->s, bit);
- bit++;
- break;
- case 'G':
- case 'g':
- // G == 010
- bit++;
- BITSET(this->s, bit);
- bit++;
- bit++;
- break;
- case 'T':
- case 't':
- // T == 011
- bit++;
- BITSET(this->s, bit);
- bit++;
- BITSET(this->s, bit);
- bit++;
- break;
- case 'N':
- case 'n':
- case '.':
- // N == 100
- BITSET(this->s, bit);
- bit += 3;
- break;
- }
+ switch (seq[i]) {
+ case 'A':
+ case 'a':
+ // A == 000
+ bit += 3;
+ break;
+ case 'C':
+ case 'c':
+ // C == 001
+ bit += 2;
+ BITSET(this->s, bit);
+ bit++;
+ break;
+ case 'G':
+ case 'g':
+ // G == 010
+ bit++;
+ BITSET(this->s, bit);
+ bit++;
+ bit++;
+ break;
+ case 'T':
+ case 't':
+ // T == 011
+ bit++;
+ BITSET(this->s, bit);
+ bit++;
+ BITSET(this->s, bit);
+ bit++;
+ break;
+ case 'N':
+ case 'n':
+ case '.':
+ // N == 100
+ BITSET(this->s, bit);
+ bit += 3;
+ break;
+ }
}
}
+DNANSeq::DNANSeq(const DNANSeq& other) : bits(other.bits) {
+ const int n_bytes = BITNSLOTS(bits);
+ s = new unsigned char[n_bytes];
+ memcpy(s, other.s, n_bytes);
+}
+
+DNANSeq& DNANSeq::operator=(const DNANSeq& other) {
+ delete[] s;
+
+ bits = other.bits;
+ const int n_bytes = BITNSLOTS(bits);
+ s = new unsigned char[n_bytes];
+ memcpy(s, other.s, n_bytes);
+
+ return *this;
+}
+
DNANSeq::~DNANSeq() {
delete [] this->s;
}
@@ -121,30 +138,30 @@ char DNANSeq::operator[](int pos) {
base = 'X';
for (int i = bits_per_nuc - 1; i >= 0; i--) {
- if (BITTEST(this->s, bit))
- c |= 1 << i;
- bit++;
+ if (BITTEST(this->s, bit))
+ c |= 1 << i;
+ bit++;
}
switch (c) {
case 0:
- base = 'A';
- break;
+ base = 'A';
+ break;
case 1:
- base = 'C';
- break;
+ base = 'C';
+ break;
case 2:
- base = 'G';
- break;
+ base = 'G';
+ break;
case 3:
- base = 'T';
- break;
+ base = 'T';
+ break;
case 4:
- base = 'N';
- break;
+ base = 'N';
+ break;
default:
- cerr << "Unknown character " << (int) c << "\n";
- break;
+ cerr << "Unknown character " << (int) c << "\n";
+ break;
}
//cerr << " Decoding character " << pos << ", '" << base << "'\n";
@@ -159,7 +176,7 @@ char *DNANSeq::subseq(char *seq, int start, int end) {
int i;
for (i = start; i <= end; i++)
- seq[i - start] = this->operator[](i);
+ seq[i - start] = this->operator[](i);
seq[i - start] = '\0';
@@ -171,7 +188,7 @@ char *DNANSeq::seq(char *seq) {
int end = this->bits / bits_per_nuc;
for (i = 0; i < end; i++)
- seq[i] = this->operator[](i);
+ seq[i] = this->operator[](i);
seq[i] = '\0';
@@ -184,7 +201,7 @@ char *DNANSeq::seq() {
char *seq = new char[size + 1];
for (i = 0; i < size; i++)
- seq[i] = this->operator[](i);
+ seq[i] = this->operator[](i);
seq[i] = '\0';
diff --git a/src/DNANSeq.h b/src/DNANSeq.h
index b7f09e0..62b5d91 100644
--- a/src/DNANSeq.h
+++ b/src/DNANSeq.h
@@ -24,6 +24,11 @@
#include <string.h>
#include <limits.h>
+#include <functional> //std::hash
+#ifdef HAVE_SPARSEHASH
+#include <tr1/functional>
+#endif
+
#define BITMASK(b) (1 << ((b) % CHAR_BIT))
#define BITSLOT(b) ((b) / CHAR_BIT)
#define BITSET(a, b) ((a)[BITSLOT(b)] |= BITMASK(b))
@@ -58,9 +63,14 @@ public:
//
unsigned char *s;
+#ifdef HAVE_SPARSEHASH
+ DNANSeq() : bits(3), s(new unsigned char[1]) { *s = (1<<2); } // "N"
+#endif
DNANSeq(int);
DNANSeq(int, const char *);
DNANSeq(int, unsigned char *);
+ DNANSeq(const DNANSeq& other);
+ DNANSeq& operator=(const DNANSeq& other);
~DNANSeq();
char operator[](int);
@@ -68,6 +78,17 @@ public:
char *seq(char *);
char *seq();
char *subseq(char *, int, int);
+
+ bool operator== (const DNANSeq& other) const {
+ if (bits != other.bits)
+ return false;
+
+ unsigned int bytes = BITNSLOTS(bits);
+ for (unsigned int i = 0; i < bytes; i++)
+ if (s[i] != other.s[i])
+ return false;
+ return true;
+ }
};
#include <iostream>
@@ -78,42 +99,55 @@ using std::cin;
using std::cout;
using std::cerr;
+// Specialization for std::hash
+// Based on GCC
+namespace std {
+template<>
+struct hash<DNANSeq> {
+ size_t operator()(const DNANSeq& seq) const {
+ size_t __result = static_cast<size_t>(14695981039346656037ULL);
+ unsigned short int __bytes = BITNSLOTS(seq.bits);
+ for (unsigned short int i = 0; i < __bytes; i++) {
+ __result ^= static_cast<size_t>(seq.s[i]);
+ __result *= static_cast<size_t>(1099511628211ULL);
+ }
+ return __result;
+ }
+};
+
+#ifdef HAVE_SPARSEHASH
+namespace tr1 {
+template<>
+struct hash<DNANSeq> {
+ size_t operator()(const DNANSeq& seq) const { return std::hash<DNANSeq>()(seq); }
+};
+}
+#endif
+}
+
// namespace __gnu_cxx {
// template<>
// struct hash<DNANSeq *>
// {
-// size_t
-// operator()(DNANSeq *__s) const {
-// unsigned long __h = 0;
-// unsigned int bytes = BITNSLOTS(__s->bits);
-// for (unsigned int i = 0; i < bytes; i++)
-// __h = 5 * __h + __s->s[i];
-// return size_t(__h);
-// }
+// size_t
+// operator()(DNANSeq *__s) const {
+// unsigned long __h = 0;
+// unsigned int bytes = BITNSLOTS(__s->bits);
+// for (unsigned int i = 0; i < bytes; i++)
+// __h = 5 * __h + __s->s[i];
+// return size_t(__h);
+// }
// };
// }
-struct hash_dnanseq {
- size_t operator()(DNANSeq *__s) const
- {
- size_t __result = static_cast<size_t>(14695981039346656037ULL);
- unsigned short int __bytes = BITNSLOTS(__s->bits);
- for (unsigned short int i = 0; i < __bytes; i++) {
- __result ^= static_cast<size_t>(__s->s[i]);
- __result *= static_cast<size_t>(1099511628211ULL);
- }
-
- return __result;
- }
+/* struct hash_dnanseq {
+ // Deprecated
+ size_t operator()(const DNANSeq* seq) const {return std::hash<DNANSeq>{}(*seq);}
};
struct dnanseq_eqstr {
- bool operator()(DNANSeq *s1, DNANSeq *s2) const {
- unsigned int bytes = BITNSLOTS(s1->bits);
- for (unsigned int i = 0; i < bytes; i++)
- if (s1->s[i] != s2->s[i]) return false;
- return true;
- }
-};
+ // Deprecated
+ bool operator()(const DNANSeq *s1, const DNANSeq *s2) const {return *s1 == *s2;}
+}; */
#endif // __DNANSeq_H__
diff --git a/src/DNASeq.cc b/src/DNASeq.cc
index 90b02e0..92d2e08 100644
--- a/src/DNASeq.cc
+++ b/src/DNASeq.cc
@@ -52,7 +52,7 @@ DNASeq::DNASeq(int size, unsigned char *seq) {
this->s = new unsigned char[bytes];
for (unsigned int i = 0; i < bytes; i++)
- this->s[i] = seq[i];
+ this->s[i] = seq[i];
}
DNASeq::DNASeq(int size, const char *seq) {
@@ -69,41 +69,41 @@ DNASeq::DNASeq(int size, const char *seq) {
int index = 0;
for (int i = 0; i < this->size; i++) {
- //cerr << "Encoding character " << i << ", '" << seq[i] << "'\n";
-
- if (i > 0 && i % bases_per_byte == 0) index++;
-
- //cerr << " encoding '" << seq[i] << "' into byte " << index << ".\n";
-
- this->s[index] <<= 2;
-
- switch (seq[i]) {
- case 'A':
- case 'a':
- // A == 00
- break;
- case 'C':
- case 'c':
- // C == 01
- this->s[index] |= 0x1;
- break;
- case 'G':
- case 'g':
- // G == 10
- this->s[index] |= 0x2;
- break;
- case 'T':
- case 't':
- // T == 11
- this->s[index] |= 0x3;
- break;
- }
-
- //cerr << " s[" << index << "," << i % bases_per_byte << "] == " << (int)this->s[index] << "\n";
+ //cerr << "Encoding character " << i << ", '" << seq[i] << "'\n";
+
+ if (i > 0 && i % bases_per_byte == 0) index++;
+
+ //cerr << " encoding '" << seq[i] << "' into byte " << index << ".\n";
+
+ this->s[index] <<= 2;
+
+ switch (seq[i]) {
+ case 'A':
+ case 'a':
+ // A == 00
+ break;
+ case 'C':
+ case 'c':
+ // C == 01
+ this->s[index] |= 0x1;
+ break;
+ case 'G':
+ case 'g':
+ // G == 10
+ this->s[index] |= 0x2;
+ break;
+ case 'T':
+ case 't':
+ // T == 11
+ this->s[index] |= 0x3;
+ break;
+ }
+
+ //cerr << " s[" << index << "," << i % bases_per_byte << "] == " << (int)this->s[index] << "\n";
}
if (rem > 0)
- this->s[index] <<= (bases_per_byte - rem) * 2;
+ this->s[index] <<= (bases_per_byte - rem) * 2;
}
DNASeq::~DNASeq() {
@@ -123,35 +123,35 @@ char DNASeq::operator[](int pos) {
switch (rem) {
case 0:
- c = this->s[index] & 0xC0; // 11000000
- c >>= 6;
- break;
+ c = this->s[index] & 0xC0; // 11000000
+ c >>= 6;
+ break;
case 1:
- c = this->s[index] & 0x30; // 00110000
- c >>= 4;
- break;
+ c = this->s[index] & 0x30; // 00110000
+ c >>= 4;
+ break;
case 2:
- c = this->s[index] & 0xC; // 00001100
- c >>= 2;
- break;
+ c = this->s[index] & 0xC; // 00001100
+ c >>= 2;
+ break;
case 3:
- c = this->s[index] & 0x3; // 00000011
- break;
+ c = this->s[index] & 0x3; // 00000011
+ break;
}
switch (c) {
case 0:
- base = 'A';
- break;
+ base = 'A';
+ break;
case 1:
- base = 'C';
- break;
+ base = 'C';
+ break;
case 2:
- base = 'G';
- break;
+ base = 'G';
+ break;
case 3:
- base = 'T';
- break;
+ base = 'T';
+ break;
}
//cerr << " Decoding character " << pos << ", '" << base << "'\n";
@@ -167,44 +167,44 @@ char *DNASeq::subseq(char *seq, int start, int end) {
rem = i % bases_per_byte;
for (; i <= end; i++) {
- rem = i % bases_per_byte;
-
- if (i > 0 && rem == 0) index++;
- //cerr << "s[" << index << "," << rem << "] == " << (int)this->s[index] << "\n";
-
- switch (rem) {
- case 0:
- c = this->s[index] & 0xC0; // 11000000
- c >>= 6;
- break;
- case 1:
- c = this->s[index] & 0x30; // 00110000
- c >>= 4;
- break;
- case 2:
- c = this->s[index] & 0xC; // 00001100
- c >>= 2;
- break;
- case 3:
- c = this->s[index] & 0x3; // 00000011
- break;
- }
-
- switch (c) {
- case 0:
- seq[i - start] = 'A';
- break;
- case 1:
- seq[i - start] = 'C';
- break;
- case 2:
- seq[i - start] = 'G';
- break;
- case 3:
- seq[i - start] = 'T';
- break;
- }
- //cerr << " Decoding character " << i << ", '" << seq[i - start] << "'\n";
+ rem = i % bases_per_byte;
+
+ if (i > 0 && rem == 0) index++;
+ //cerr << "s[" << index << "," << rem << "] == " << (int)this->s[index] << "\n";
+
+ switch (rem) {
+ case 0:
+ c = this->s[index] & 0xC0; // 11000000
+ c >>= 6;
+ break;
+ case 1:
+ c = this->s[index] & 0x30; // 00110000
+ c >>= 4;
+ break;
+ case 2:
+ c = this->s[index] & 0xC; // 00001100
+ c >>= 2;
+ break;
+ case 3:
+ c = this->s[index] & 0x3; // 00000011
+ break;
+ }
+
+ switch (c) {
+ case 0:
+ seq[i - start] = 'A';
+ break;
+ case 1:
+ seq[i - start] = 'C';
+ break;
+ case 2:
+ seq[i - start] = 'G';
+ break;
+ case 3:
+ seq[i - start] = 'T';
+ break;
+ }
+ //cerr << " Decoding character " << i << ", '" << seq[i - start] << "'\n";
}
seq[i - start] = '\0';
@@ -218,43 +218,43 @@ char *DNASeq::seq(char *seq) {
int index = 0;
for (i = 0; i < this->size; i++) {
- if (i > 0 && i % bases_per_byte == 0) index++;
-
- //cerr << "s[" << index << "," << i % bases_per_byte << "] == " << (int)this->s[index] << "\n";
-
- switch (i % bases_per_byte) {
- case 0:
- c = this->s[index] & 0xC0; // 11000000
- c >>= 6;
- break;
- case 1:
- c = this->s[index] & 0x30; // 00110000
- c >>= 4;
- break;
- case 2:
- c = this->s[index] & 0xC; // 00001100
- c >>= 2;
- break;
- case 3:
- c = this->s[index] & 0x3; // 00000011
- break;
- }
-
- switch (c) {
- case 0:
- seq[i] = 'A';
- break;
- case 1:
- seq[i] = 'C';
- break;
- case 2:
- seq[i] = 'G';
- break;
- case 3:
- seq[i] = 'T';
- break;
- }
- //cerr << " Decoding character " << i << ", '" << seq[i] << "'\n";
+ if (i > 0 && i % bases_per_byte == 0) index++;
+
+ //cerr << "s[" << index << "," << i % bases_per_byte << "] == " << (int)this->s[index] << "\n";
+
+ switch (i % bases_per_byte) {
+ case 0:
+ c = this->s[index] & 0xC0; // 11000000
+ c >>= 6;
+ break;
+ case 1:
+ c = this->s[index] & 0x30; // 00110000
+ c >>= 4;
+ break;
+ case 2:
+ c = this->s[index] & 0xC; // 00001100
+ c >>= 2;
+ break;
+ case 3:
+ c = this->s[index] & 0x3; // 00000011
+ break;
+ }
+
+ switch (c) {
+ case 0:
+ seq[i] = 'A';
+ break;
+ case 1:
+ seq[i] = 'C';
+ break;
+ case 2:
+ seq[i] = 'G';
+ break;
+ case 3:
+ seq[i] = 'T';
+ break;
+ }
+ //cerr << " Decoding character " << i << ", '" << seq[i] << "'\n";
}
seq[i] = '\0';
@@ -270,43 +270,43 @@ char *DNASeq::seq() {
char *seq = new char[this->size + 1];
for (i = 0; i < this->size; i++) {
- if (i > 0 && i % bases_per_byte == 0) index++;
-
- //cerr << "s[" << index << "," << i % bases_per_byte << "] == " << (int)this->s[index] << "\n";
-
- switch (i % bases_per_byte) {
- case 0:
- c = this->s[index] & 0xC0; // 11000000
- c >>= 6;
- break;
- case 1:
- c = this->s[index] & 0x30; // 00110000
- c >>= 4;
- break;
- case 2:
- c = this->s[index] & 0xC; // 00001100
- c >>= 2;
- break;
- case 3:
- c = this->s[index] & 0x3; // 00000011
- break;
- }
-
- switch (c) {
- case 0:
- seq[i] = 'A';
- break;
- case 1:
- seq[i] = 'C';
- break;
- case 2:
- seq[i] = 'G';
- break;
- case 3:
- seq[i] = 'T';
- break;
- }
- //cerr << " Decoding character " << i << ", '" << seq[i] << "'\n";
+ if (i > 0 && i % bases_per_byte == 0) index++;
+
+ //cerr << "s[" << index << "," << i % bases_per_byte << "] == " << (int)this->s[index] << "\n";
+
+ switch (i % bases_per_byte) {
+ case 0:
+ c = this->s[index] & 0xC0; // 11000000
+ c >>= 6;
+ break;
+ case 1:
+ c = this->s[index] & 0x30; // 00110000
+ c >>= 4;
+ break;
+ case 2:
+ c = this->s[index] & 0xC; // 00001100
+ c >>= 2;
+ break;
+ case 3:
+ c = this->s[index] & 0x3; // 00000011
+ break;
+ }
+
+ switch (c) {
+ case 0:
+ seq[i] = 'A';
+ break;
+ case 1:
+ seq[i] = 'C';
+ break;
+ case 2:
+ seq[i] = 'G';
+ break;
+ case 3:
+ seq[i] = 'T';
+ break;
+ }
+ //cerr << " Decoding character " << i << ", '" << seq[i] << "'\n";
}
seq[i] = '\0';
diff --git a/src/DNASeq.h b/src/DNASeq.h
index 9a91066..8981549 100644
--- a/src/DNASeq.h
+++ b/src/DNASeq.h
@@ -33,7 +33,7 @@
// #endif
//
-// We expect (and C++ defines) an unsigned char as 8 bits, so we
+// We expect (and C++ defines) an unsigned char as 8 bits, so we
// should be able to store 4 nucleotide bases per byte of memory.
//
const unsigned short int bases_per_byte = CHAR_BIT / 2;
@@ -81,23 +81,23 @@ using std::cerr;
struct hash_dnaseq {
size_t operator()(DNASeq *__s) const
{
- size_t __result = static_cast<size_t>(14695981039346656037ULL);
- unsigned short int __bytes = (__s->size / bases_per_byte) + (__s->size % bases_per_byte > 0 ? 1 : 0);
- for (unsigned short int i = 0; i < __bytes; i++) {
- __result ^= static_cast<size_t>(__s->s[i]);
- __result *= static_cast<size_t>(1099511628211ULL);
- }
+ size_t __result = static_cast<size_t>(14695981039346656037ULL);
+ unsigned short int __bytes = (__s->size / bases_per_byte) + (__s->size % bases_per_byte > 0 ? 1 : 0);
+ for (unsigned short int i = 0; i < __bytes; i++) {
+ __result ^= static_cast<size_t>(__s->s[i]);
+ __result *= static_cast<size_t>(1099511628211ULL);
+ }
- return __result;
+ return __result;
}
};
struct dnaseq_eqstr {
bool operator()(DNASeq *s1, DNASeq *s2) const {
- unsigned int bytes = (s1->size / bases_per_byte) + (s1->size % bases_per_byte > 0 ? 1 : 0);
- for (unsigned int i = 0; i < bytes; i++)
- if (s1->s[i] != s2->s[i]) return false;
- return true;
+ unsigned int bytes = (s1->size / bases_per_byte) + (s1->size % bases_per_byte > 0 ? 1 : 0);
+ for (unsigned int i = 0; i < bytes; i++)
+ if (s1->s[i] != s2->s[i]) return false;
+ return true;
}
};
diff --git a/src/FastaI.h b/src/FastaI.h
index e50a36c..997341b 100644
--- a/src/FastaI.h
+++ b/src/FastaI.h
@@ -41,11 +41,11 @@ Seq *Fasta::next_seq() {
// record.
//
while (this->line[0] != '>' && this->fh.good() ) {
- this->fh.getline(this->line, max_len);
+ this->fh.getline(this->line, max_len);
}
if (!this->fh.good()) {
- return NULL;
+ return NULL;
}
//
@@ -68,19 +68,19 @@ Seq *Fasta::next_seq() {
this->fh.getline(this->line, max_len);
while (this->line[0] != '>' && this->fh.good()) {
- len = strlen(this->line);
- if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+ len = strlen(this->line);
+ if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
- this->buf += this->line;
- this->line[0] = '\0';
- this->fh.getline(this->line, max_len);
+ this->buf += this->line;
+ this->line[0] = '\0';
+ this->fh.getline(this->line, max_len);
}
if (this->fh.eof()) {
- len = strlen(this->line);
- if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+ len = strlen(this->line);
+ if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
- this->buf += this->line;
+ this->buf += this->line;
}
s->seq = new char[this->buf.length() + 1];
@@ -97,11 +97,11 @@ int Fasta::next_seq(Seq &s) {
// record.
//
while (this->line[0] != '>' && this->fh.good() ) {
- this->fh.getline(this->line, max_len);
+ this->fh.getline(this->line, max_len);
}
if (!this->fh.good()) {
- return 0;
+ return 0;
}
//
@@ -122,19 +122,19 @@ int Fasta::next_seq(Seq &s) {
this->fh.getline(this->line, max_len);
while (this->line[0] != '>' && this->fh.good()) {
- len = strlen(this->line);
- if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
- this->buf += this->line;
- this->line[0] = '\0';
- this->fh.getline(this->line, max_len);
+ this->buf += this->line;
+ this->line[0] = '\0';
+ this->fh.getline(this->line, max_len);
}
if (this->fh.eof()) {
- len = strlen(this->line);
- if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
- this->buf += this->line;
+ this->buf += this->line;
}
strcpy(s.seq, this->buf.c_str());
diff --git a/src/FastqI.h b/src/FastqI.h
index 2e055e9..dbbfddd 100644
--- a/src/FastqI.h
+++ b/src/FastqI.h
@@ -40,11 +40,11 @@ Seq *Fastq::next_seq() {
// record.
//
while (this->line[0] != '@' && this->fh.good() ) {
- this->fh.getline(this->line, max_len);
+ this->fh.getline(this->line, max_len);
}
if (!this->fh.good()) {
- return NULL;
+ return NULL;
}
//
@@ -66,7 +66,7 @@ Seq *Fastq::next_seq() {
this->fh.getline(this->line, max_len);
if (!this->fh.good()) {
- return NULL;
+ return NULL;
}
len = strlen(this->line);
@@ -81,7 +81,7 @@ Seq *Fastq::next_seq() {
this->fh.getline(this->line, max_len);
if (this->line[0] != '+' || !this->fh.good()) {
- return NULL;
+ return NULL;
}
//
@@ -90,7 +90,7 @@ Seq *Fastq::next_seq() {
this->fh.getline(this->line, max_len);
if (!this->fh.good() && !this->fh.eof()) {
- return NULL;
+ return NULL;
}
len = strlen(this->line);
@@ -101,7 +101,7 @@ Seq *Fastq::next_seq() {
//
// Clear the line buffer so it is set up for the next record. If a '@'
- // appears in the quality scores read, it will break parsing next time
+ // appears in the quality scores read, it will break parsing next time
// it is called.
//
this->line[0] = '\0';
@@ -116,11 +116,11 @@ int Fastq::next_seq(Seq &s) {
// record.
//
while (this->line[0] != '@' && this->fh.good() ) {
- this->fh.getline(this->line, max_len);
+ this->fh.getline(this->line, max_len);
}
if (!this->fh.good()) {
- return 0;
+ return 0;
}
//
@@ -140,7 +140,7 @@ int Fastq::next_seq(Seq &s) {
this->fh.getline(this->line, max_len);
if (!this->fh.good()) {
- return 0;
+ return 0;
}
len = strlen(this->line);
@@ -154,7 +154,7 @@ int Fastq::next_seq(Seq &s) {
this->fh.getline(this->line, max_len);
if (this->line[0] != '+' || !this->fh.good()) {
- return 0;
+ return 0;
}
//
@@ -163,7 +163,7 @@ int Fastq::next_seq(Seq &s) {
this->fh.getline(this->line, max_len);
if (!this->fh.good() && !this->fh.eof()) {
- return 0;
+ return 0;
}
len = strlen(this->line);
@@ -173,7 +173,7 @@ int Fastq::next_seq(Seq &s) {
//
// Clear the line buffer so it is set up for the next record. If a '@'
- // appears in the quality scores read, it will break parsing next time
+ // appears in the quality scores read, it will break parsing next time
// it is called.
//
this->line[0] = '\0';
diff --git a/src/GappedAln.h b/src/GappedAln.h
index af42ba4..210c33a 100644
--- a/src/GappedAln.h
+++ b/src/GappedAln.h
@@ -30,15 +30,15 @@ public:
uint contiguity;
double pct_id;
AlignRes() {
- this->gap_cnt = 0;
+ this->gap_cnt = 0;
this->contiguity = 0;
- this->pct_id = 0.0;
+ this->pct_id = 0.0;
}
AlignRes(string cigar, uint gapcnt, uint contiguity, double pct_id) {
- this->cigar = cigar;
- this->gap_cnt = gapcnt;
+ this->cigar = cigar;
+ this->gap_cnt = gapcnt;
this->contiguity = contiguity;
- this->pct_id = pct_id;
+ this->pct_id = pct_id;
}
};
@@ -83,7 +83,7 @@ class GappedAln {
inline int swap(double *, dynprog *, int, int);
int trace_alignment(string, string);
- public:
+ public:
GappedAln();
GappedAln(int i) : GappedAln(i, i) {};
GappedAln(int, int);
@@ -125,7 +125,7 @@ GappedAln::GappedAln(int len_1, int len_2)
GappedAln::~GappedAln()
{
- for (uint i = 0; i < this->_m; i++) {
+ for (uint i = 0; i < this->_m_size; i++) {
delete [] this->matrix[i];
delete [] this->path[i];
}
@@ -139,33 +139,33 @@ GappedAln::init(int size_1, int size_2)
//
// Resize the underlying matrix and path arrays, if necessary.
//
- if ((size_1 + 1) > (int)_m_size || (size_2 + 1) > (int)_n_size) {
- for (uint i = 0; i < this->_m_size; i++) {
- delete [] this->matrix[i];
- delete [] this->path[i];
- }
+ if ((size_1 + 1) > (int)this->_m_size || (size_2 + 1) > (int)this->_n_size) {
+ for (uint i = 0; i < this->_m_size; i++) {
+ delete [] this->matrix[i];
+ delete [] this->path[i];
+ }
if (this->_m_size > 0) {
delete [] this->matrix;
delete [] this->path;
}
- this->_m_size = size_1 + 1;
- this->_n_size = size_2 + 1;
+ this->_m_size = size_1 + 1;
+ this->_n_size = size_2 + 1;
//
// Resize the arrays to be 25% larger than the requested size.
//
int new_size = this->_m_size > this->_n_size ? this->_m_size : this->_n_size;
new_size += int((double) new_size * 0.25);
this->_m_size = new_size;
- this->_n_size = new_size;
+ this->_n_size = new_size;
- this->matrix = new double * [this->_m_size];
- for (uint i = 0; i < this->_m_size; i++)
- this->matrix[i] = new double [this->_n_size];
+ this->matrix = new double * [this->_m_size];
+ for (uint i = 0; i < this->_m_size; i++)
+ this->matrix[i] = new double [this->_n_size];
- this->path = new AlignPath * [this->_m_size];
- for (uint i = 0; i < this->_m_size; i++)
- this->path[i] = new AlignPath [this->_n_size];
+ this->path = new AlignPath * [this->_m_size];
+ for (uint i = 0; i < this->_m_size; i++)
+ this->path[i] = new AlignPath [this->_n_size];
}
//
@@ -189,7 +189,7 @@ GappedAln::align(string tag_1, string tag_2)
// v [3] |
// ... |
// [m-1] |
- //
+ //
//
// Initialize the first column and row of the dynamic programming
@@ -215,7 +215,7 @@ GappedAln::align(string tag_1, string tag_2)
double score_down, score_diag, score_right;
double scores[3];
dynprog direction[3];
-
+
for (uint i = 1; i < this->_m; i++) {
for (uint j = 1; j < this->_n; j++) {
// Calculate the score:
@@ -268,7 +268,7 @@ GappedAln::align(string tag_1, string tag_2)
this->path[i][j].up = false;
this->path[i][j].left = true;
}
-
+
} else if (scores[0] == scores[1]) {
//
// Two of the paths are equivalent.
@@ -276,7 +276,7 @@ GappedAln::align(string tag_1, string tag_2)
switch (direction[0]) {
case dynp_diag:
this->path[i][j].diag = true;
-
+
switch (direction[1]) {
case dynp_down:
this->path[i][j].up = true;
@@ -291,7 +291,7 @@ GappedAln::align(string tag_1, string tag_2)
break;
case dynp_down:
this->path[i][j].up = true;
-
+
switch (direction[1]) {
case dynp_right:
this->path[i][j].diag = false;
@@ -307,7 +307,7 @@ GappedAln::align(string tag_1, string tag_2)
default:
case dynp_right:
this->path[i][j].left = true;
-
+
switch (direction[1]) {
case dynp_diag:
this->path[i][j].diag = true;
@@ -321,7 +321,7 @@ GappedAln::align(string tag_1, string tag_2)
}
break;
}
-
+
} else {
//
// All paths equivalent.
@@ -336,8 +336,8 @@ GappedAln::align(string tag_1, string tag_2)
// dump_alignment(tag_1, tag_2, matrix, path);
if (this->trace_alignment(tag_1, tag_2))
- return 1;
-
+ return 1;
+
return 0;
}
@@ -365,7 +365,7 @@ compare_alignres(AlignRes a, AlignRes b)
return (a.pct_id > b.pct_id);
} else {
- return (a.gap_cnt < b.gap_cnt);
+ return (a.gap_cnt < b.gap_cnt);
}
}
@@ -381,7 +381,7 @@ GappedAln::trace_alignment(string tag_1, string tag_2)
// v [3] |
// ... |
// [m-1] |
- //
+ //
int i, j, cnt, len, gaps, contiguity;
double ident;
string cigar;
@@ -390,7 +390,7 @@ GappedAln::trace_alignment(string tag_1, string tag_2)
vector<AlignRes> alns;
bool more_paths = true;
bool seq_break = false;
-
+
do {
more_paths = false;
@@ -434,13 +434,13 @@ GappedAln::trace_alignment(string tag_1, string tag_2)
gaps = 0;
contiguity = 0;
seq_break = false;
- ident = 0.0;
+ ident = 0.0;
i = 0;
while (i < len) {
if (aln_1[i] != '-' && aln_2[i] != '-') {
cnt = 0;
do {
- if (aln_1[i] == aln_2[i]) ident++;
+ if (aln_1[i] == aln_2[i]) ident++;
cnt++;
i++;
if (seq_break == false) contiguity++;
@@ -472,8 +472,8 @@ GappedAln::trace_alignment(string tag_1, string tag_2)
}
alns.push_back(AlignRes(cigar, gaps, contiguity, (ident / (double) len)));
-
- // cerr << aln_1 << " [" << cigar << ", contiguity: " << contiguity << ", gaps: " << gaps << "]\n"
+
+ // cerr << aln_1 << " [" << cigar << ", contiguity: " << contiguity << ", gaps: " << gaps << "]\n"
// << aln_2 << "\n";
} while (more_paths);
@@ -492,7 +492,7 @@ GappedAln::result()
return this->_aln;
}
-int
+int
GappedAln::parse_cigar(vector<pair<char, uint> > &cigar)
{
char buf[id_len];
@@ -532,7 +532,7 @@ GappedAln::dump_alignment(string tag_1, string tag_2)
// v [3] |
// ... |
// [m-1] |
- //
+ //
//
// Output the score matrix.
@@ -587,7 +587,7 @@ GappedAln::dump_alignment(string tag_1, string tag_2)
}
cout << "\n";
-
+
return 0;
}
diff --git a/src/PopMap.h b/src/PopMap.h
index 5172f2c..1b699f2 100644
--- a/src/PopMap.h
+++ b/src/PopMap.h
@@ -108,7 +108,7 @@ class PopMap {
int num_loci;
set<pair<int, int> > blacklist;
Datum ***data;
- map<int, int> locus_order; // LocusID => ArrayIndex; map catalog IDs to their first dimension
+ map<int, int> locus_order; // LocusID => ArrayIndex; map catalog IDs to their first dimension
// position in the Datum array.
map<int, int> rev_locus_order;
@@ -177,7 +177,7 @@ template<class LocusT>
int PopMap<LocusT>::populate(map<int, LocusT*> &catalog,
const vector<vector<CatMatch *> > &matches) {
//
- // Create an index showing what position each catalog locus is stored at in the datum
+ // Create an index showing what position each catalog locus is stored at in the datum
// array. Create a second index allowing ordering of Loci by genomic position.
//
typename std::map<int, LocusT*>::const_iterator it;
@@ -237,7 +237,7 @@ int PopMap<LocusT>::populate(map<int, LocusT*> &catalog,
} else {
// cerr << " Adding haplotype to existing datum: sample: " << matches[i][j]->sample_id << ". tag: " << matches[i][j]->tag_id << "\n";
//
- // Check that the IDs of the two matches are the same. If not, then two tags
+ // Check that the IDs of the two matches are the same. If not, then two tags
// match this locus and the locus is invalid, set back to NULL.
//
if (matches[i][j]->tag_id == this->data[locus][sample]->id) {
@@ -248,7 +248,7 @@ int PopMap<LocusT>::populate(map<int, LocusT*> &catalog,
cerr << "Warning: disparate CIGAR strings, catalog locus " << matches[i][j]->cat_id
<< "; sample ID: " << matches[i][j]->sample_id << "; sample locus ID: " << matches[i][j]->tag_id
<< "; datum cigar: " << this->data[locus][sample]->cigar << "; matches cigar: " << matches[i][j]->cigar << "\n";
-
+
this->data[locus][sample]->obshap.push_back(h);
this->data[locus][sample]->depth.push_back(matches[i][j]->depth);
this->data[locus][sample]->tot_depth += matches[i][j]->depth;
diff --git a/src/PopSum.h b/src/PopSum.h
index d8d491f..12af4fd 100644
--- a/src/PopSum.h
+++ b/src/PopSum.h
@@ -98,10 +98,10 @@ class LocStat: public PopStat {
// PopStat[0]: gene diversity
// PopStat[1]: haplotype diversity (Pi)
public:
- uint hap_cnt; // Number of unique haplotypes at this locus.
+ uint hap_cnt; // Number of unique haplotypes at this locus.
string hap_str; // Human-readable string of haplotype counts.
- LocStat(): PopStat() {
+ LocStat(): PopStat() {
this->hap_cnt = 0;
}
~LocStat() {};
@@ -123,7 +123,7 @@ public:
double amova_fst; // AMOVA Fst method, from Weir, Genetic Data Analysis II .
double *comp;
- PopPair() {
+ PopPair() {
col = 0;
pi = 0.0;
fst = 0.0;
@@ -175,10 +175,10 @@ public:
class LocSum {
public:
- SumStat *nucs; // Array containing summary statistics for
+ SumStat *nucs; // Array containing summary statistics for
// each nucleotide position at this locus.
LocSum(int len) {
- this->nucs = new SumStat[len];
+ this->nucs = new SumStat[len];
}
~LocSum() {
delete [] this->nucs;
@@ -200,7 +200,7 @@ public:
bool fixed;
int priv_allele;
- NucTally() {
+ NucTally() {
loc_id = 0;
bp = 0;
col = 0;
@@ -220,8 +220,8 @@ class LocTally {
public:
NucTally *nucs;
- LocTally(int len) {
- this->nucs = new NucTally[len];
+ LocTally(int len) {
+ this->nucs = new NucTally[len];
}
~LocTally() {
delete [] this->nucs;
@@ -229,7 +229,7 @@ public:
};
//
-// Population Summary class contains a two dimensional array storing the
+// Population Summary class contains a two dimensional array storing the
// summary statistics for each locus in each of a set of populations:
//
// Pop1 Pop2 Pop3
@@ -281,7 +281,7 @@ public:
private:
int tally_heterozygous_pos(LocusT *, Datum **, LocSum *, int, int, uint, uint);
int tally_fixed_pos(LocusT *, Datum **, LocSum *, int, uint, uint);
- int tally_ref_alleles(LocSum **, int, short unsigned int &, char &, char &, short unsigned int &, short unsigned int &);
+ int tally_ref_alleles(LocSum **, int, short unsigned int &, char &, char &, short unsigned int &, short unsigned int &);
int tally_observed_haplotypes(vector<char *> &, int);
double pi(double, double, double);
double binomial_coeff(double, double);
@@ -386,7 +386,7 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
<< loc->id << "\t"
<< loc->loc.chr << "\t"
<< loc->sort_bp(loc->snps[k]->col) +1 << "\t"
- << loc->snps[k]->col << "\t"
+ << loc->snps[k]->col << "\t"
<< pop.name << "\n";
}
@@ -411,7 +411,7 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
}
template<class LocusT>
-int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
+int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
{
LocusT *loc;
LocSum **s;
@@ -436,9 +436,9 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
ltally->nucs[col].bp = loc->sort_bp(col);
ltally->nucs[col].loc_id = locus_id;
- this->tally_ref_alleles(s, col,
- ltally->nucs[col].allele_cnt,
- ltally->nucs[col].p_allele,
+ this->tally_ref_alleles(s, col,
+ ltally->nucs[col].allele_cnt,
+ ltally->nucs[col].p_allele,
ltally->nucs[col].q_allele,
p_cnt, q_cnt);
@@ -447,7 +447,7 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
//
if (ltally->nucs[col].allele_cnt > 1)
ltally->nucs[col].fixed = false;
-
+
for (int j = 0; j < pop_cnt(); j++) {
//
// Sum the number of individuals examined at this locus across populations.
@@ -461,20 +461,20 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
// Sum the most frequent allele across populations.
//
if (s[j]->nucs[col].p_nuc == ltally->nucs[col].p_allele)
- ltally->nucs[col].p_freq +=
+ ltally->nucs[col].p_freq +=
s[j]->nucs[col].p * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
- else
- ltally->nucs[col].p_freq +=
+ else
+ ltally->nucs[col].p_freq +=
(1 - s[j]->nucs[col].p) * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
//
// Sum observed heterozygosity across populations.
//
- ltally->nucs[col].obs_het +=
+ ltally->nucs[col].obs_het +=
s[j]->nucs[col].obs_het * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
}
//
- // We want to report the most frequent allele as the P allele. Reorder the alleles
+ // We want to report the most frequent allele as the P allele. Reorder the alleles
// if necessary.
// XXX Possibly unstable for p_freq ~ 0.5. @Nick (July 2016)
//
@@ -514,10 +514,10 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
}
template<class LocusT>
-int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
- short unsigned int &allele_cnt,
- char &p_allele, char &q_allele,
- short unsigned int &p_cnt, short unsigned int &q_cnt)
+int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
+ short unsigned int &allele_cnt,
+ char &p_allele, char &q_allele,
+ short unsigned int &p_cnt, short unsigned int &q_cnt)
{
int nucs[4] = {0};
char nuc[2];
@@ -532,7 +532,7 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
nuc[0] = s[j]->nucs[snp_index].p_nuc;
nuc[1] = s[j]->nucs[snp_index].q_nuc;
- for (uint k = 0; k < 2; k++)
+ for (uint k = 0; k < 2; k++)
switch(nuc[k]) {
case 'A':
case 'a':
@@ -610,7 +610,7 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
//
// Tabulate the number of populations the p_allele and the q_allele occur in.
- //
+ //
p_cnt = 0;
q_cnt = 0;
@@ -620,7 +620,7 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
nuc[0] = s[j]->nucs[snp_index].p_nuc;
nuc[1] = s[j]->nucs[snp_index].q_nuc;
- for (uint k = 0; k < 2; k++)
+ for (uint k = 0; k < 2; k++)
if (nuc[k] != 0 && nuc[k] == p_allele)
p_cnt++;
else if (nuc[k] != 0 && nuc[k] == q_allele)
@@ -640,7 +640,7 @@ PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
//
// If this locus only appears in one population do not calculate Fst.
//
- if (s_1->nucs[pos].num_indv == 0 || s_2->nucs[pos].num_indv == 0)
+ if (s_1->nucs[pos].num_indv == 0 || s_2->nucs[pos].num_indv == 0)
return pair;
//
@@ -670,7 +670,7 @@ PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
nucs[2] = s_2->nucs[pos].p_nuc;
nucs[3] = s_2->nucs[pos].q_nuc;
- for (int i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++)
switch(nucs[i]) {
case 'A':
ncnt[0]++;
@@ -696,8 +696,8 @@ PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
double tot_alleles = n_1 + n_2;
double p_1 = round(n_1 * s_1->nucs[pos].p);
double q_1 = n_1 - p_1;
- double p_2 =
- s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
+ double p_2 =
+ s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
s_2->nucs[pos].p : (1 - s_2->nucs[pos].p);
p_2 = round(n_2 * p_2);
double q_2 = n_2 - p_2;
@@ -719,9 +719,9 @@ PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
this->fishers_exact_test(pair, p_1, q_1, p_2, q_2);
// cerr << "Locus: " << locus << ", pos: " << pos << "\n"
- // << " p_1.nuc: " << s_1->nucs[pos].p_nuc << "; q_1.nuc: " << s_1->nucs[pos].q_nuc
+ // << " p_1.nuc: " << s_1->nucs[pos].p_nuc << "; q_1.nuc: " << s_1->nucs[pos].q_nuc
// << "; p_2.nuc: " << s_2->nucs[pos].p_nuc << "; q_2.nuc: " << s_2->nucs[pos].q_nuc << "\n"
- // << " Total alleles: " << tot_alleles << "; " << " s_1.p: " << s_1->nucs[pos].p
+ // << " Total alleles: " << tot_alleles << "; " << " s_1.p: " << s_1->nucs[pos].p
// << "; s_2.p: " << s_2->nucs[pos].p << "\n"
// << " p_1: " << p_1 << "; q_1: " << q_1 << " p_2: " << p_2 << "; q_2: " << q_2 << "\n"
// << " Pi1: " << pi_1 << "; Pi2: " << pi_2 << "; PiAll: " << pi_all << "\n"
@@ -736,24 +736,24 @@ PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
//
double p_1_freq = s_1->nucs[pos].p;
double q_1_freq = 1 - p_1_freq;
- double p_2_freq =
- s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
+ double p_2_freq =
+ s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
s_2->nucs[pos].p : (1 - s_2->nucs[pos].p);
double q_2_freq = 1 - p_2_freq;
- double p_avg_cor =
- ( (s_1->nucs[pos].num_indv * p_1_freq) + (s_2->nucs[pos].num_indv * p_2_freq) ) /
+ double p_avg_cor =
+ ( (s_1->nucs[pos].num_indv * p_1_freq) + (s_2->nucs[pos].num_indv * p_2_freq) ) /
( s_1->nucs[pos].num_indv + s_2->nucs[pos].num_indv );
double n_avg_cor = 2 * ((s_1->nucs[pos].num_indv / 2) + (s_2->nucs[pos].num_indv / 2));
pair->amova_fst =
(
- (s_1->nucs[pos].num_indv * pow((p_1_freq - p_avg_cor), 2) +
+ (s_1->nucs[pos].num_indv * pow((p_1_freq - p_avg_cor), 2) +
s_2->nucs[pos].num_indv * pow((p_2_freq - p_avg_cor), 2))
- /
- n_avg_cor
+ /
+ n_avg_cor
)
- /
+ /
(p_avg_cor * (1 - p_avg_cor));
if (log_fst_comp) {
@@ -779,8 +779,8 @@ PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
}
// //
- // // Calculate Fst using a pure parametric method (assumes allele counts are real, not
- // // samples). Jakobsson, Edge, and Rosenberg. "The Relationship Between Fst and the
+ // // Calculate Fst using a pure parametric method (assumes allele counts are real, not
+ // // samples). Jakobsson, Edge, and Rosenberg. "The Relationship Between Fst and the
// // Frequency of the Most Frequent Allele." Genetics 193:515-528. Equation 4.
// //
// double sigma_1 = p_1_freq + q_1_freq;
@@ -794,7 +794,7 @@ PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
}
template<class LocusT>
-int PopSum<LocusT>::tally_fixed_pos(LocusT *locus, Datum **d, LocSum *s, int pos, uint start, uint end)
+int PopSum<LocusT>::tally_fixed_pos(LocusT *locus, Datum **d, LocSum *s, int pos, uint start, uint end)
{
double num_indv = 0.0;
char p_nuc = 0;
@@ -802,12 +802,12 @@ int PopSum<LocusT>::tally_fixed_pos(LocusT *locus, Datum **d, LocSum *s, int pos
for (uint i = start; i <= end; i++) {
if (d[i] == NULL || pos >= d[i]->len) continue;
//
- // Before counting this individual, make sure the model definitively called this
+ // Before counting this individual, make sure the model definitively called this
// position as hEterozygous or hOmozygous.
//
if (d[i]->model[pos] == 'E') {
cerr << "Model: " << d[i]->model << "\n";
- cerr << "Warning: heterozygous model call at fixed nucleotide position: "
+ cerr << "Warning: heterozygous model call at fixed nucleotide position: "
<< "locus " << locus->id << " individual " << d[i]->id << "; position: " << pos << "\n";
}
num_indv++;
@@ -837,8 +837,8 @@ int PopSum<LocusT>::tally_fixed_pos(LocusT *locus, Datum **d, LocSum *s, int pos
}
template<class LocusT>
-int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
- int pos, int snp_index, uint start, uint end)
+int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
+ int pos, int snp_index, uint start, uint end)
{
//
// Tally up the genotype frequencies.
@@ -949,7 +949,7 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
for (i = start; i <= end; i++) {
if (d[i] == NULL || pos >= d[i]->len) continue;
//
- // Before counting this individual, make sure the model definitively called this
+ // Before counting this individual, make sure the model definitively called this
// position as hEterozygous or hOmozygous.
//
if (d[i]->model[pos] == 'E' || d[i]->model[pos] == 'O')
@@ -1055,7 +1055,7 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
}
template<class LocusT>
-int PopSum<LocusT>::tally_observed_haplotypes(vector<char *> &obshap, int snp_index)
+int PopSum<LocusT>::tally_observed_haplotypes(vector<char *> &obshap, int snp_index)
{
int nucs[4] = {0};
char nuc;
@@ -1096,7 +1096,7 @@ int PopSum<LocusT>::tally_observed_haplotypes(vector<char *> &obshap, int snp_in
template<class LocusT>
int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, double p_2, double q_2)
{
- // | Allele1 | Allele2 |
+ // | Allele1 | Allele2 |
// Fisher's Exact Test: -----+---------+---------+
// Pop1 | p_1 | q_1 |
// Pop2 | p_2 | q_2 |
@@ -1129,7 +1129,7 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
// sprintf(p2_str, "% 3.0f", p_2);
// sprintf(q2_str, "% 3.0f", q_2);
//
- // cerr
+ // cerr
// << " | Allele1 | Allele2 | " << "\n"
// << "-----+---------+---------+" << "\n"
// << "Pop1 | " << p1_str << " | " << q1_str << " |" << "\n"
@@ -1146,7 +1146,7 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
double den = this->binomial_coeff(n, c_1);
//
- // If (p_1*q_2 - p_2*q_1) < 0 decrease cells p_1 and q_2 by one and add one to p_2 and q_1.
+ // If (p_1*q_2 - p_2*q_1) < 0 decrease cells p_1 and q_2 by one and add one to p_2 and q_1.
// Compute p and repeat until one or more cells equal 0.
//
if (d_1 - d_2 < 0) {
@@ -1162,7 +1162,7 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
} else {
//
- // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
+ // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
// Compute p and repeat until one or more cells equal 0.
//
do {
@@ -1184,7 +1184,7 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
p = 0;
//
- // If (p_1*q_2 - p_2*q_1) < 0, set to zero the smaller of the two frequencies, adjusting the other values
+ // If (p_1*q_2 - p_2*q_1) < 0, set to zero the smaller of the two frequencies, adjusting the other values
// to keep the marginals the same.
//
if (d_1 - d_2 < 0) {
@@ -1214,7 +1214,7 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
}
//
- // If (p_1*q_2 - p_2*q_1) < 0 decrease cells p_1 and q_2 by one and add one to p_2 and q_1.
+ // If (p_1*q_2 - p_2*q_1) < 0 decrease cells p_1 and q_2 by one and add one to p_2 and q_1.
// Compute p and repeat until tail_2 > tail_1.
//
if (d_1 - d_2 < 0) {
@@ -1233,7 +1233,7 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
} else {
//
- // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
+ // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
// Compute p and repeat until one or more cells equal 0.
//
do {
@@ -1296,8 +1296,8 @@ double PopSum<LocusT>::pi(double tot_alleles, double p, double q)
// Calculate Pi, equivalent to expected heterozygosity:
// pi = 1 - Sum_i( (n_i choose 2) ) / (n choose 2)
//
- double pi =
- this->binomial_coeff(p, 2) +
+ double pi =
+ this->binomial_coeff(p, 2) +
this->binomial_coeff(q, 2);
pi = pi / binomial_coeff(tot_alleles, 2);
pi = 1 - pi;
@@ -1311,7 +1311,7 @@ double PopSum<LocusT>::binomial_coeff(double n, double k)
if (n < k) return 0.0;
//
// Compute the binomial coefficient using the method of:
- // Y. Manolopoulos, "Binomial coefficient computation: recursion or iteration?",
+ // Y. Manolopoulos, "Binomial coefficient computation: recursion or iteration?",
// ACM SIGCSE Bulletin, 34(4):65-67, 2002.
//
double r = 1.0;
diff --git a/src/SamI.h b/src/SamI.h
index 74ab51c..b347cfc 100644
--- a/src/SamI.h
+++ b/src/SamI.h
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -39,51 +39,77 @@ class Sam: public Input {
int edit_gaps(vector<pair<char, uint> > &, char *);
public:
- Sam(const char *path) : Input(path) {};
- ~Sam() {};
+ Sam(const char *path) : Input(path) {}
+ ~Sam() {}
Seq *next_seq();
- int next_seq(Seq &) { return 0; };
+ int next_seq(Seq& s);
};
-Seq *
-Sam::next_seq()
-{
+Seq* Sam::next_seq() {
+ Seq* s = new Seq();
+ if(next_seq(*s) != 1) {
+ delete s;
+ s = NULL;
+ }
+ return s;
+}
+
+int Sam::next_seq(Seq& s) {
vector<string> parts;
- int flag;
+ int flag = 0;
+ int sflag = 0;
+ int aflag = 0;
uint len;
//
- // Read a record from the file and place it in a Seq object, skipping header
+ // Read a record from the file and place it in a Seq object, skipping header
// definitions and unaligned sequences.
//
do {
this->fh.getline(this->line, max_len);
if (!this->fh.good())
- return NULL;
+ return 0;
- len = strlen(this->line);
- if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+ len = strlen(this->line);
+ if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
parse_tsv(this->line, parts);
- //
- // According to SAM spec FLAGs are the second field,
- // if FLAG bit 0x4 is set, sequence is not mapped.
- //
- flag = atoi(parts[1].c_str());
- flag = flag & 4;
- flag = flag >> 2;
+ //
+ // According to SAM spec FLAGs are the second field,
+ // if FLAG bit 0x4 is set, sequence is not mapped.
+ //
+ flag = atoi(parts[1].c_str());
+ flag = flag & 4;
+ flag = flag >> 2;
} while (parts[0][0] == '@' || flag == 1);
//
- // Check which strand this is aligned to:
+ // Check which strand this is aligned to:
// SAM reference: FLAG bit 0x10 - sequence is reverse complemented
//
- flag = atoi(parts[1].c_str());
- flag = flag & 16;
- flag = flag >> 4;
+ flag = atoi(parts[1].c_str());
+ sflag = flag & 16;
+ sflag = sflag >> 4;
+
+ //
+ // Check if this is the primary or secondary alignment.
+ //
+ alnt aln_type = pri_aln;
+ aflag = flag & 256;
+ aflag = aflag >> 8;
+ if (aflag)
+ aln_type = sec_aln;
+
+ //
+ // Check if this is a supplemenatry (chimeric) alignment (not yet defined in Bam.h).
+ //
+ aflag = flag & 2048;
+ aflag = aflag >> 11;
+ if (aflag)
+ aln_type = sup_aln;
//
// If the read was aligned on the reverse strand (and is therefore reverse complemented)
@@ -93,11 +119,11 @@ Sam::next_seq()
// To accomplish this, we must parse the alignment CIGAR string
//
vector<pair<char, uint> > cigar;
- this->parse_cigar(parts[5].c_str(), cigar, flag);
+ this->parse_cigar(parts[5].c_str(), cigar, sflag);
- int bp = flag ?
- this->find_start_bp_neg(atoi(parts[3].c_str()), cigar) :
- this->find_start_bp_pos(atoi(parts[3].c_str()), cigar);
+ int bp = sflag ?
+ this->find_start_bp_neg(atoi(parts[3].c_str()), cigar) :
+ this->find_start_bp_pos(atoi(parts[3].c_str()), cigar);
//
// Sam format has a 1-based offset for chrmosome/basepair positions, adjust it to match
@@ -105,16 +131,40 @@ Sam::next_seq()
//
bp--;
- Seq *s = new Seq(parts[0].c_str(), parts[9].c_str(), parts[10].c_str(), // Read ID, Sequence, Quality
- parts[2].c_str(), bp, flag ? strand_minus : strand_plus); // Chr, BasePair, Strand
+ //
+ // Calculate the percentage of the sequence that was aligned to the reference.
+ //
+ len = 0;
+ for (uint i = 0; i < cigar.size(); i++)
+ switch (cigar[i].first) {
+ case 'M':
+ case 'I':
+ case '=':
+ case 'X':
+ len += cigar[i].second;
+ break;
+ case 'D':
+ case 'S':
+ case 'H':
+ case 'N':
+ break;
+ default:
+ cerr << "Error parsing CIGAR string '" << cigar[i].second << cigar[i].first << "'.\n";
+ break;
+ }
+ double pct_aln = double(len) / double(parts[9].length());
+
+ s = Seq(parts[0].c_str(), parts[9].c_str(), parts[10].c_str(), // Read ID, Sequence, Quality
+ parts[2].c_str(), bp, sflag ? strand_minus : strand_plus, // Chr, BasePair, Strand
+ aln_type, pct_aln); // Alignment type (primary or secondary), percent of read aligned
if (cigar.size() > 0)
- this->edit_gaps(cigar, s->seq);
+ this->edit_gaps(cigar, s.seq);
- return s;
+ return 1;
}
-int
+int
Sam::parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar, bool orientation)
{
char buf[id_len];
@@ -126,30 +176,30 @@ Sam::parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar, bool o
if (*p == '*') return 0;
while (*p != '\0') {
- q = p + 1;
-
- while (*q != '\0' && isdigit(*q))
- q++;
- strncpy(buf, p, q - p);
- buf[q-p] = '\0';
- dist = atoi(buf);
-
- //
- // If aligned to the negative strand, sequence has been reverse complemented and
- // CIGAR string should be interpreted in reverse.
- //
- if (orientation == strand_plus)
- cigar.push_back(make_pair(*q, dist));
- else
- cigar.insert(cigar.begin(), make_pair(*q, dist));
-
- p = q + 1;
+ q = p + 1;
+
+ while (*q != '\0' && isdigit(*q))
+ q++;
+ strncpy(buf, p, q - p);
+ buf[q-p] = '\0';
+ dist = atoi(buf);
+
+ //
+ // If aligned to the negative strand, sequence has been reverse complemented and
+ // CIGAR string should be interpreted in reverse.
+ //
+ if (orientation == strand_plus)
+ cigar.push_back(make_pair(*q, dist));
+ else
+ cigar.insert(cigar.begin(), make_pair(*q, dist));
+
+ p = q + 1;
}
return 0;
}
-int
+int
Sam::find_start_bp_neg(int aln_bp, vector<pair<char, uint> > &cigar)
{
uint size = cigar.size();
@@ -157,27 +207,33 @@ Sam::find_start_bp_neg(int aln_bp, vector<pair<char, uint> > &cigar)
uint dist;
for (uint i = 0; i < size; i++) {
- op = cigar[i].first;
- dist = cigar[i].second;
-
- switch(op) {
- case 'I':
- break;
- case 'S':
- if (i < size - 1)
- aln_bp += dist;
- break;
- case 'M':
- case 'D':
- aln_bp += dist;
- break;
- }
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'I':
+ case 'H':
+ break;
+ case 'S':
+ if (i < size - 1)
+ aln_bp += dist;
+ break;
+ case 'M':
+ case '=':
+ case 'X':
+ case 'D':
+ case 'N':
+ aln_bp += dist;
+ break;
+ default:
+ break;
+ }
}
return aln_bp - 1;
}
-int
+int
Sam::find_start_bp_pos(int aln_bp, vector<pair<char, uint> > &cigar)
{
char op;
@@ -187,12 +243,12 @@ Sam::find_start_bp_pos(int aln_bp, vector<pair<char, uint> > &cigar)
dist = cigar[0].second;
if (op == 'S')
- aln_bp -= dist;
+ aln_bp -= dist;
return aln_bp;
}
-int
+int
Sam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
{
char *buf;
@@ -207,78 +263,80 @@ Sam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
buf_size = len + 1;
for (uint i = 0; i < size; i++) {
- op = cigar[i].first;
- dist = cigar[i].second;
-
- switch(op) {
- case 'S':
- stop = bp + dist;
- stop = stop > len ? len : stop;
- while (bp < stop) {
- seq[bp] = 'N';
- bp++;
- }
- break;
- case 'D':
- //
- // A deletion has occured in the read relative to the reference genome.
- // Pad the read with sufficent Ns to match the deletion, shifting the existing
- // sequence down. Trim the final length to keep the read length consistent.
- //
- k = bp >= len ? len : bp;
-
- strncpy(buf, seq + k, buf_size - 1);
- buf[buf_size - 1] = '\0';
- buf_len = strlen(buf);
-
- stop = bp + dist;
- stop = stop > len ? len : stop;
- while (bp < stop) {
- seq[bp] = 'N';
- bp++;
- }
-
- j = bp;
- k = 0;
- while (j < len && k < buf_len) {
- seq[j] = buf[k];
- k++;
- j++;
- }
- break;
- case 'I':
- //
- // An insertion has occurred in the read relative to the reference genome. Delete the
- // inserted bases and pad the end of the read with Ns.
- //
- if (bp >= len) break;
-
- k = bp + dist > len ? len : bp + dist;
- strncpy(buf, seq + k, buf_size - 1);
- buf[buf_size - 1] = '\0';
- buf_len = strlen(buf);
-
- j = bp;
- k = 0;
- while (j < len && k < buf_len) {
- seq[j] = buf[k];
- k++;
- j++;
- }
-
- stop = j + dist;
- stop = stop > len ? len : stop;
- while (j < stop) {
- seq[j] = 'N';
- j++;
- }
- break;
- case 'M':
- bp += dist;
- break;
- default:
- break;
- }
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'S':
+ stop = bp + dist;
+ stop = stop > len ? len : stop;
+ while (bp < stop) {
+ seq[bp] = 'N';
+ bp++;
+ }
+ break;
+ case 'D':
+ //
+ // A deletion has occured in the read relative to the reference genome.
+ // Pad the read with sufficent Ns to match the deletion, shifting the existing
+ // sequence down. Trim the final length to keep the read length consistent.
+ //
+ k = bp >= len ? len : bp;
+
+ strncpy(buf, seq + k, buf_size - 1);
+ buf[buf_size - 1] = '\0';
+ buf_len = strlen(buf);
+
+ stop = bp + dist;
+ stop = stop > len ? len : stop;
+ while (bp < stop) {
+ seq[bp] = 'N';
+ bp++;
+ }
+
+ j = bp;
+ k = 0;
+ while (j < len && k < buf_len) {
+ seq[j] = buf[k];
+ k++;
+ j++;
+ }
+ break;
+ case 'I':
+ //
+ // An insertion has occurred in the read relative to the reference genome. Delete the
+ // inserted bases and pad the end of the read with Ns.
+ //
+ if (bp >= len) break;
+
+ k = bp + dist > len ? len : bp + dist;
+ strncpy(buf, seq + k, buf_size - 1);
+ buf[buf_size - 1] = '\0';
+ buf_len = strlen(buf);
+
+ j = bp;
+ k = 0;
+ while (j < len && k < buf_len) {
+ seq[j] = buf[k];
+ k++;
+ j++;
+ }
+
+ stop = j + dist;
+ stop = stop > len ? len : stop;
+ while (j < stop) {
+ seq[j] = 'N';
+ j++;
+ }
+ break;
+ case 'M':
+ case '=':
+ case 'X':
+ bp += dist;
+ break;
+ default:
+ break;
+ }
}
delete [] buf;
diff --git a/src/Tsv.h b/src/Tsv.h
index f94d94d..ec96d78 100644
--- a/src/Tsv.h
+++ b/src/Tsv.h
@@ -37,28 +37,34 @@ class Tsv: public Input {
Tsv(const char *path) : Input(path) {};
~Tsv() {};
Seq *next_seq();
- int next_seq(Seq &) { return 0; }
+ int next_seq(Seq &);
};
Seq *Tsv::next_seq() {
+ Seq* s = new Seq();
+ if(next_seq(*s) != 1) {
+ delete s;
+ s = NULL;
+ }
+ return s;
+}
+
+int Tsv::next_seq(Seq& s) {
vector<string> parts;
- //
- // Read a record from the file and place it in a Seq object
- //
this->fh.getline(this->line, max_len);
if (!this->fh.good()) {
- return NULL;
+ return 0;
}
parse_tsv(this->line, parts);
string id = parts[0] + "_" + parts[1];
- Seq *s = new Seq(id.c_str(), parts[2].c_str(), parts[3].c_str(), parts[0].c_str(), atoi(parts[1].c_str()), strand_plus);
+ s = Seq(id.c_str(), parts[2].c_str(), parts[3].c_str(), parts[0].c_str(), atoi(parts[1].c_str()), strand_plus);
- return s;
+ return 1;
}
#endif // __TSV_H__
diff --git a/src/Vcf.cc b/src/Vcf.cc
index 32e63c3..44a3025 100644
--- a/src/Vcf.cc
+++ b/src/Vcf.cc
@@ -181,7 +181,7 @@ Vcf::adaptive_open(const string& path)
}
#endif
} else {
- cerr << "Error: File '" << path << "' : expected '.vcf(.gz)' suffix.";
+ cerr << "Error: File '" << path << "' : expected '.vcf(.gz)' suffix.\n";
throw exception();
}
diff --git a/src/aln_utils.cc b/src/aln_utils.cc
index 24107a5..9e3d19b 100644
--- a/src/aln_utils.cc
+++ b/src/aln_utils.cc
@@ -37,7 +37,7 @@ invert_cigar(string cigar)
return cigar;
}
-int
+int
parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar)
{
char buf[id_len];
@@ -101,10 +101,10 @@ apply_cigar_to_seq(const char *seq, vector<pair<char, uint> > &cigar)
}
break;
case 'D':
- edited_bp = 0;
+ edited_bp = 0;
while (edited_bp < dist) {
edited_seq.push_back('N');
- edited_bp++;
+ edited_bp++;
}
break;
case 'I':
@@ -155,10 +155,10 @@ apply_cigar_to_model_seq(const char *seq, vector<pair<char, uint> > &cigar)
}
break;
case 'D':
- edited_bp = 0;
+ edited_bp = 0;
while (edited_bp < dist) {
edited_seq.push_back('U');
- edited_bp++;
+ edited_bp++;
}
break;
case 'I':
@@ -206,7 +206,7 @@ apply_cigar_to_seq(char *seq, uint seq_len, const char *old_seq, vector<pair<cha
stop = stop > seq_len ? seq_len : stop;
while (seq_bp < stop) {
seq[seq_bp] = 'N';
- seq_bp++;
+ seq_bp++;
}
break;
case 'I':
@@ -258,7 +258,7 @@ apply_cigar_to_model_seq(char *seq, uint seq_len, const char *model, vector<pair
stop = stop > seq_len ? seq_len : stop;
while (seq_bp < stop) {
seq[seq_bp] = 'U';
- seq_bp++;
+ seq_bp++;
}
break;
case 'I':
@@ -294,7 +294,7 @@ remove_cigar_from_seq(const char *seq, vector<pair<char, uint> > &cigar)
//
uint seqlen = 0;
for (uint i = 0; i < size; i++)
- seqlen += cigar[i].first != 'D' ? cigar[i].second : 0;
+ seqlen += cigar[i].first != 'D' ? cigar[i].second : 0;
bp = 0;
@@ -306,10 +306,10 @@ remove_cigar_from_seq(const char *seq, vector<pair<char, uint> > &cigar)
switch(op) {
case 'D':
- edited_bp = 0;
+ edited_bp = 0;
while (edited_bp < dist) {
edited_bp++;
- bp++;
+ bp++;
}
break;
case 'I':
@@ -338,7 +338,7 @@ adjust_snps_for_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
bp = 0;
offset = 0;
snp_index = 0;
-
+
for (uint i = 0; i < size; i++) {
op = cigar[i].first;
dist = cigar[i].second;
@@ -362,8 +362,8 @@ adjust_snps_for_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
default:
break;
}
- }
-
+ }
+
return 0;
}
@@ -378,7 +378,7 @@ adjust_and_add_snps_for_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
bp = 0;
new_bp = 0;
snp_cnt = loc->snps.size();
-
+
vector<SNP *> snps;
for (uint i = 0; i < size; i++) {
@@ -411,7 +411,7 @@ adjust_and_add_snps_for_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
default:
break;
}
- }
+ }
loc->snps.clear();
@@ -431,7 +431,7 @@ remove_snps_from_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
bp = 0;
new_bp = 0;
snp_cnt = loc->snps.size();
-
+
vector<SNP *> snps;
for (uint i = 0; i < size; i++) {
@@ -460,7 +460,7 @@ remove_snps_from_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
default:
break;
}
- }
+ }
loc->snps.clear();
diff --git a/src/bootstrap.h b/src/bootstrap.h
index 072d2f8..333e769 100644
--- a/src/bootstrap.h
+++ b/src/bootstrap.h
@@ -43,11 +43,11 @@ public:
double stat[PopStatSize];
BSample() {
- this->bp = 0;
- this->alleles = 0;
- this->fixed = false;
- for (uint i = 0; i < PopStatSize; i++)
- this->stat[i] = 0.0;
+ this->bp = 0;
+ this->alleles = 0;
+ this->fixed = false;
+ for (uint i = 0; i < PopStatSize; i++)
+ this->stat[i] = 0.0;
}
};
@@ -59,13 +59,13 @@ class Bootstrap {
uint num_stats;
public:
- Bootstrap(uint size) {
- this->num_stats = size;
- this->weights = calc_weights();
- this->stats.resize(size, vector<double>());
+ Bootstrap(uint size) {
+ this->num_stats = size;
+ this->weights = calc_weights();
+ this->stats.resize(size, vector<double>());
}
- ~Bootstrap() {
- delete [] this->weights;
+ ~Bootstrap() {
+ delete [] this->weights;
}
int add_data(vector<StatT *> &);
@@ -79,9 +79,9 @@ int
Bootstrap<StatT>::add_data(vector<StatT *> &sites)
{
for (uint i = 0; i < sites.size(); i++) {
- if (sites[i] != NULL && sites[i]->fixed == false)
- for (uint j = 0; j < this->num_stats; j++)
- this->stats[j].push_back(sites[i]->stat[j]);
+ if (sites[i] != NULL && sites[i]->fixed == false)
+ for (uint j = 0; j < this->num_stats; j++)
+ this->stats[j].push_back(sites[i]->stat[j]);
}
return 0;
@@ -92,94 +92,94 @@ int
Bootstrap<StatT>::execute(vector<StatT *> &sites)
{
#pragma omp parallel
- {
- PopStat *c;
- double final_weight, sum, weighted_stat[PopStatSize];
- int dist, index;
- uint pos_l = 0;
- uint pos_u = 0;
-
- //#pragma omp for schedule(dynamic, 1)
- for (uint pos_c = 0; pos_c < sites.size(); pos_c++) {
- c = sites[pos_c];
-
- if (c == NULL)
- continue;
-
- if (bootstrap_wl && bootstraplist.count(c->loc_id) == 0)
- continue;
-
- // cerr << "Bootstrapping " << c->loc_id << "; pos_c: " << pos_c << "; bp: " << c->bp << "\n";
-
- determine_window_limits(sites, c->bp, pos_l, pos_u);
-
- int size = 0;
- for (uint i = pos_l; i < pos_u; i++)
- if (sites[i] != NULL) size++;
-
- //
- // Allocate an array of bootstrap resampling objects.
- //
- BSample *bs = new BSample[size];
-
- //
- // Populate the BSample objects.
- //
- int j = 0;
- for (uint i = pos_l; i < pos_u; i++) {
- if (sites[i] == NULL) continue;
- bs[j].bp = sites[i]->bp;
- bs[j].alleles = sites[i]->alleles;
- j++;
- }
-
- vector<vector<double> > resampled_stats(this->num_stats, vector<double>());
- for (uint i = 0; i < this->num_stats; i++)
- resampled_stats[i].reserve(bootstrap_reps);
-
- //
- // Bootstrap this bitch.
- //
- for (int i = 0; i < bootstrap_reps; i++) {
- // if (i % 100 == 0) cerr << " Bootsrap rep " << i << "\n";
-
- for (uint k = 0; k < this->num_stats; k++)
- weighted_stat[k] = 0.0;
- sum = 0.0;
-
- for (j = 0; j < size; j++) {
- //
- // Distance from center of window.
- //
- dist = bs[j].bp > c->bp ? bs[j].bp - c->bp : c->bp - bs[j].bp;
- //
- // Resample for this round of bootstrapping.
- //
- index = (int) (this->stats[0].size() * (random() / (RAND_MAX + 1.0)));
- for (uint k = 0; k < this->num_stats; k++)
- bs[j].stat[k] = this->stats[k][index];
-
- final_weight = (bs[j].alleles - 1) * this->weights[dist];
- for (uint k = 0; k < this->num_stats; k++)
- weighted_stat[k] += bs[j].stat[k] * final_weight;
- sum += final_weight;
- }
-
- // cerr << " New weighted Fst value: " << weighted_fst / sum << "\n";
- for (uint k = 0; k < this->num_stats; k++)
- resampled_stats[k].push_back(weighted_stat[k] / sum);
- }
-
- //
- // Cacluate the p-value for this window based on the empirical Fst distribution.
- //
- for (uint k = 0; k < this->num_stats; k++) {
- sort(resampled_stats[k].begin(), resampled_stats[k].end());
- c->bs[k] = this->pval(c->smoothed[k], resampled_stats[k]);
- }
-
- delete [] bs;
- }
+ {
+ PopStat *c;
+ double final_weight, sum, weighted_stat[PopStatSize];
+ int dist, index;
+ uint pos_l = 0;
+ uint pos_u = 0;
+
+ //#pragma omp for schedule(dynamic, 1)
+ for (uint pos_c = 0; pos_c < sites.size(); pos_c++) {
+ c = sites[pos_c];
+
+ if (c == NULL)
+ continue;
+
+ if (bootstrap_wl && bootstraplist.count(c->loc_id) == 0)
+ continue;
+
+ // cerr << "Bootstrapping " << c->loc_id << "; pos_c: " << pos_c << "; bp: " << c->bp << "\n";
+
+ determine_window_limits(sites, c->bp, pos_l, pos_u);
+
+ int size = 0;
+ for (uint i = pos_l; i < pos_u; i++)
+ if (sites[i] != NULL) size++;
+
+ //
+ // Allocate an array of bootstrap resampling objects.
+ //
+ BSample *bs = new BSample[size];
+
+ //
+ // Populate the BSample objects.
+ //
+ int j = 0;
+ for (uint i = pos_l; i < pos_u; i++) {
+ if (sites[i] == NULL) continue;
+ bs[j].bp = sites[i]->bp;
+ bs[j].alleles = sites[i]->alleles;
+ j++;
+ }
+
+ vector<vector<double> > resampled_stats(this->num_stats, vector<double>());
+ for (uint i = 0; i < this->num_stats; i++)
+ resampled_stats[i].reserve(bootstrap_reps);
+
+ //
+ // Bootstrap this bitch.
+ //
+ for (int i = 0; i < bootstrap_reps; i++) {
+ // if (i % 100 == 0) cerr << " Bootsrap rep " << i << "\n";
+
+ for (uint k = 0; k < this->num_stats; k++)
+ weighted_stat[k] = 0.0;
+ sum = 0.0;
+
+ for (j = 0; j < size; j++) {
+ //
+ // Distance from center of window.
+ //
+ dist = bs[j].bp > c->bp ? bs[j].bp - c->bp : c->bp - bs[j].bp;
+ //
+ // Resample for this round of bootstrapping.
+ //
+ index = (int) (this->stats[0].size() * (random() / (RAND_MAX + 1.0)));
+ for (uint k = 0; k < this->num_stats; k++)
+ bs[j].stat[k] = this->stats[k][index];
+
+ final_weight = (bs[j].alleles - 1) * this->weights[dist];
+ for (uint k = 0; k < this->num_stats; k++)
+ weighted_stat[k] += bs[j].stat[k] * final_weight;
+ sum += final_weight;
+ }
+
+ // cerr << " New weighted Fst value: " << weighted_fst / sum << "\n";
+ for (uint k = 0; k < this->num_stats; k++)
+ resampled_stats[k].push_back(weighted_stat[k] / sum);
+ }
+
+ //
+ // Cacluate the p-value for this window based on the empirical Fst distribution.
+ //
+ for (uint k = 0; k < this->num_stats; k++) {
+ sort(resampled_stats[k].begin(), resampled_stats[k].end());
+ c->bs[k] = this->pval(c->smoothed[k], resampled_stats[k]);
+ }
+
+ delete [] bs;
+ }
}
return 0;
@@ -190,119 +190,119 @@ int
Bootstrap<StatT>::execute_mixed(vector<StatT *> &sites)
{
#pragma omp parallel
- {
- PopStat *c;
- double final_weight, sum, weighted_stat[PopStatSize];
- int dist, index;
- uint pos_l = 0;
- uint pos_u = 0;
-
- //#pragma omp for schedule(dynamic, 1)
- for (uint pos_c = 0; pos_c < sites.size(); pos_c++) {
- c = sites[pos_c];
-
- if (c == NULL || c->fixed == true)
- continue;
-
- if (bootstrap_wl && bootstraplist.count(c->loc_id) == 0)
- continue;
-
- // cerr << "Bootstrapping " << c->loc_id << "; pos_c: " << pos_c << "; bp: " << c->bp << "\n";
-
- determine_window_limits(sites, c->bp, pos_l, pos_u);
-
- int size = 0;
- for (uint i = pos_l; i < pos_u; i++)
- if (sites[i] != NULL) size++;
-
- //
- // Allocate an array of bootstrap resampling objects.
- //
- BSample *bs = new BSample[size];
-
- //
- // Populate the BSample objects.
- //
- int j = 0;
- for (uint i = pos_l; i < pos_u; i++) {
- if (sites[i] == NULL)
- continue;
- bs[j].bp = sites[i]->bp;
- bs[j].alleles = sites[i]->alleles;
- bs[j].fixed = sites[i]->fixed;
- for (uint k = 0; k < this->num_stats; k++)
- bs[j].stat[k] = sites[i]->stat[k];
- j++;
- }
-
- //
- // Precompute the fraction of the window that will not change during resampling.
- //
- double partial_weighted_stat[this->num_stats];
- double partial_sum = 0.0;
- memset(partial_weighted_stat, 0, this->num_stats);
-
- for (j = 0; j < size; j++) {
- if (bs[j].fixed == false) continue;
-
- dist = bs[j].bp > c->bp ? bs[j].bp - c->bp : c->bp - bs[j].bp;
-
- final_weight = (bs[j].alleles - 1.0) * this->weights[dist];
- partial_sum += final_weight;
- for (uint k = 0; k < this->num_stats; k++)
- partial_weighted_stat[k] += bs[j].stat[k] * final_weight;
- }
-
- vector<vector<double> > resampled_stats(this->num_stats, vector<double>());
- for (uint i = 0; i < this->num_stats; i++)
- resampled_stats[i].reserve(bootstrap_reps);
-
- // cerr << "Window starts at " << bs[0].bp << "; centered on " << c->bp << "\n";
-
- //
- // Bootstrap this bitch.
- //
- for (int i = 0; i < bootstrap_reps; i++) {
- // if (i % 100 == 0) cerr << " Bootsrap rep " << i << "\n";
-
- for (uint k = 0; k < this->num_stats; k++)
- weighted_stat[k] = partial_weighted_stat[k];
- sum = partial_sum;
-
- for (j = 0; j < size; j++) {
- if (bs[j].fixed == true) continue;
-
- dist = bs[j].bp > c->bp ? bs[j].bp - c->bp : c->bp - bs[j].bp;
-
- //
- // Resample for this round of bootstrapping.
- //
- index = (int) (this->stats[0].size() * (random() / (RAND_MAX + 1.0)));
- for (uint k = 0; k < this->num_stats; k++)
- bs[j].stat[k] = this->stats[k][index];
-
-
- final_weight = (bs[j].alleles - 1) * this->weights[dist];
- for (uint k = 0; k < this->num_stats; k++)
- weighted_stat[k] += bs[j].stat[k] * final_weight;
- sum += final_weight;
- }
-
- // cerr << " New weighted value: " << (weighted_stat[0] / sum) << "\n";
- for (uint k = 0; k < this->num_stats; k++)
- resampled_stats[k].push_back(weighted_stat[k] / sum);
- }
-
- //
- // Cacluate the p-value for this window based on the empirical Fst distribution.
- //
- for (uint k = 0; k < this->num_stats; k++) {
- sort(resampled_stats[k].begin(), resampled_stats[k].end());
- c->bs[k] = this->pval(c->smoothed[k], resampled_stats[k]);
- }
-
- delete [] bs;
- }
+ {
+ PopStat *c;
+ double final_weight, sum, weighted_stat[PopStatSize];
+ int dist, index;
+ uint pos_l = 0;
+ uint pos_u = 0;
+
+ //#pragma omp for schedule(dynamic, 1)
+ for (uint pos_c = 0; pos_c < sites.size(); pos_c++) {
+ c = sites[pos_c];
+
+ if (c == NULL || c->fixed == true)
+ continue;
+
+ if (bootstrap_wl && bootstraplist.count(c->loc_id) == 0)
+ continue;
+
+ // cerr << "Bootstrapping " << c->loc_id << "; pos_c: " << pos_c << "; bp: " << c->bp << "\n";
+
+ determine_window_limits(sites, c->bp, pos_l, pos_u);
+
+ int size = 0;
+ for (uint i = pos_l; i < pos_u; i++)
+ if (sites[i] != NULL) size++;
+
+ //
+ // Allocate an array of bootstrap resampling objects.
+ //
+ BSample *bs = new BSample[size];
+
+ //
+ // Populate the BSample objects.
+ //
+ int j = 0;
+ for (uint i = pos_l; i < pos_u; i++) {
+ if (sites[i] == NULL)
+ continue;
+ bs[j].bp = sites[i]->bp;
+ bs[j].alleles = sites[i]->alleles;
+ bs[j].fixed = sites[i]->fixed;
+ for (uint k = 0; k < this->num_stats; k++)
+ bs[j].stat[k] = sites[i]->stat[k];
+ j++;
+ }
+
+ //
+ // Precompute the fraction of the window that will not change during resampling.
+ //
+ double partial_weighted_stat[this->num_stats];
+ double partial_sum = 0.0;
+ memset(partial_weighted_stat, 0, this->num_stats);
+
+ for (j = 0; j < size; j++) {
+ if (bs[j].fixed == false) continue;
+
+ dist = bs[j].bp > c->bp ? bs[j].bp - c->bp : c->bp - bs[j].bp;
+
+ final_weight = (bs[j].alleles - 1.0) * this->weights[dist];
+ partial_sum += final_weight;
+ for (uint k = 0; k < this->num_stats; k++)
+ partial_weighted_stat[k] += bs[j].stat[k] * final_weight;
+ }
+
+ vector<vector<double> > resampled_stats(this->num_stats, vector<double>());
+ for (uint i = 0; i < this->num_stats; i++)
+ resampled_stats[i].reserve(bootstrap_reps);
+
+ // cerr << "Window starts at " << bs[0].bp << "; centered on " << c->bp << "\n";
+
+ //
+ // Bootstrap this bitch.
+ //
+ for (int i = 0; i < bootstrap_reps; i++) {
+ // if (i % 100 == 0) cerr << " Bootsrap rep " << i << "\n";
+
+ for (uint k = 0; k < this->num_stats; k++)
+ weighted_stat[k] = partial_weighted_stat[k];
+ sum = partial_sum;
+
+ for (j = 0; j < size; j++) {
+ if (bs[j].fixed == true) continue;
+
+ dist = bs[j].bp > c->bp ? bs[j].bp - c->bp : c->bp - bs[j].bp;
+
+ //
+ // Resample for this round of bootstrapping.
+ //
+ index = (int) (this->stats[0].size() * (random() / (RAND_MAX + 1.0)));
+ for (uint k = 0; k < this->num_stats; k++)
+ bs[j].stat[k] = this->stats[k][index];
+
+
+ final_weight = (bs[j].alleles - 1) * this->weights[dist];
+ for (uint k = 0; k < this->num_stats; k++)
+ weighted_stat[k] += bs[j].stat[k] * final_weight;
+ sum += final_weight;
+ }
+
+ // cerr << " New weighted value: " << (weighted_stat[0] / sum) << "\n";
+ for (uint k = 0; k < this->num_stats; k++)
+ resampled_stats[k].push_back(weighted_stat[k] / sum);
+ }
+
+ //
+ // Cacluate the p-value for this window based on the empirical Fst distribution.
+ //
+ for (uint k = 0; k < this->num_stats; k++) {
+ sort(resampled_stats[k].begin(), resampled_stats[k].end());
+ c->bs[k] = this->pval(c->smoothed[k], resampled_stats[k]);
+ }
+
+ delete [] bs;
+ }
}
return 0;
@@ -318,21 +318,21 @@ Bootstrap<StatT>::pval(double stat, vector<double> &dist)
up = upper_bound(dist.begin(), dist.end(), stat);
if (up == dist.begin())
- pos = 1;
+ pos = 1;
else if (up == dist.end())
- pos = dist.size();
- else
- pos = up - dist.begin() + 1;
+ pos = dist.size();
+ else
+ pos = up - dist.begin() + 1;
double res = 1.0 - (pos / (double) dist.size());
// cerr << "Generated Smoothed Fst Distribution:\n";
// for (uint n = 0; n < dist.size(); n++)
- // cerr << " n: " << n << "; Fst: " << dist[n] << "\n";
+ // cerr << " n: " << n << "; Fst: " << dist[n] << "\n";
- // cerr << "Comparing Fst value: " << stat
- // << " at position " << (up - dist.begin()) << " out of "
- // << dist.size() << " positions (converted position: " << pos << "); pvalue: " << res << ".\n";
+ // cerr << "Comparing Fst value: " << stat
+ // << " at position " << (up - dist.begin()) << " out of "
+ // << dist.size() << " positions (converted position: " << pos << "); pvalue: " << res << ".\n";
return res;
}
diff --git a/src/catalog_utils.cc b/src/catalog_utils.cc
index 09374cc..1d99926 100644
--- a/src/catalog_utils.cc
+++ b/src/catalog_utils.cc
@@ -27,16 +27,16 @@
//
#include "catalog_utils.h"
-int
+int
reduce_catalog(map<int, CSLocus *> &catalog, set<int> &whitelist, set<int> &blacklist)
{
map<int, CSLocus *> list;
map<int, CSLocus *>::iterator it;
CSLocus *loc;
- if (whitelist.size() == 0 && blacklist.size() == 0)
+ if (whitelist.size() == 0 && blacklist.size() == 0)
return 0;
-
+
int i = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
loc = it->second;
@@ -53,8 +53,8 @@ reduce_catalog(map<int, CSLocus *> &catalog, set<int> &whitelist, set<int> &blac
return i;
}
-int
-implement_single_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *psum, map<int, set<int> > &whitelist)
+int
+implement_single_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *psum, map<int, set<int> > &whitelist)
{
map<int, set<int> > new_wl;
CSLocus *loc;
@@ -66,7 +66,7 @@ implement_single_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *ps
for (it = whitelist.begin(); it != whitelist.end(); it++) {
loc = catalog[it->first];
t = psum->locus_tally(loc->id);
-
+
//
// If no specific SNPs are specified in the whitelist all SNPs are included, choose the first variant.
//
@@ -83,7 +83,7 @@ implement_single_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *ps
for (uint i = 0; i < loc->snps.size(); i++) {
if (it->second.count(loc->snps[i]->col) == 0 ||
t->nucs[loc->snps[i]->col].fixed == true)
- continue;
+ continue;
new_wl[loc->id].insert(loc->snps[i]->col);
break;
}
@@ -109,8 +109,8 @@ implement_single_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *ps
return 0;
}
-int
-implement_random_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *psum, map<int, set<int> > &whitelist)
+int
+implement_random_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *psum, map<int, set<int> > &whitelist)
{
map<int, set<int> > new_wl;
CSLocus *loc;
@@ -123,7 +123,7 @@ implement_random_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *ps
loc = catalog[it->first];
if (loc->snps.size() == 0) continue;
-
+
if (it->second.size() == 0) {
index = rand() % loc->snps.size();
new_wl[loc->id].insert(loc->snps[index]->col);
@@ -203,16 +203,16 @@ check_whitelist_integrity(map<int, CSLocus *> &catalog, map<int, set<int> > &whi
return 0;
}
-int
+int
reduce_catalog(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist, set<int> &blacklist)
{
map<int, CSLocus *> list;
map<int, CSLocus *>::iterator it;
CSLocus *loc;
- if (whitelist.size() == 0 && blacklist.size() == 0)
+ if (whitelist.size() == 0 && blacklist.size() == 0)
return 0;
-
+
int i = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
loc = it->second;
@@ -229,16 +229,16 @@ reduce_catalog(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist, set
return i;
}
-int
-reduce_catalog_snps(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist, PopMap<CSLocus> *pmap)
+int
+reduce_catalog_snps(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist, PopMap<CSLocus> *pmap)
{
map<int, CSLocus *>::iterator it;
CSLocus *loc;
Datum **d;
- if (whitelist.size() == 0)
+ if (whitelist.size() == 0)
return 0;
-
+
//
// We want to prune out SNP objects that are not in the whitelist.
//
@@ -268,7 +268,7 @@ reduce_catalog_snps(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist
//
pos = loc->snps[i]->col;
for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL || pos >= d[j]->len)
+ if (d[j] == NULL || pos >= d[j]->len)
continue;
if (d[j]->model != NULL) {
d[j]->model[pos] = 'U';
@@ -306,10 +306,10 @@ reduce_catalog_snps(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist
loc->populate_alleles();
//
- // Now we need to adjust the matched haplotypes to sync to
+ // Now we need to adjust the matched haplotypes to sync to
// the SNPs left in the catalog.
//
- // Reducing the lengths of the haplotypes may create
+ // Reducing the lengths of the haplotypes may create
// redundant (shorter) haplotypes, we need to remove these.
//
for (int i = 0; i < pmap->sample_cnt(); i++) {
diff --git a/src/clean.cc b/src/clean.cc
index 60f67fa..4bd0945 100644
--- a/src/clean.cc
+++ b/src/clean.cc
@@ -35,29 +35,29 @@ int parse_illumina_v1(const char *file) {
// Parse a file name that looks like: s_7_1_0001_qseq.txt ... s_7_1_0120_qseq.txt
// but exclude the paired-end files: s_7_2_0001_qseq.txt ... s_7_2_0120_qseq.txt
//
- if (file[0] != 's')
- return 0;
+ if (file[0] != 's')
+ return 0;
int underscore_cnt = 0;
for (p = file; *p != '\0'; p++) {
- if (*p == '_') {
- underscore_cnt++;
- q = p;
- }
+ if (*p == '_') {
+ underscore_cnt++;
+ q = p;
+ }
}
if (underscore_cnt != 4)
- return 0;
+ return 0;
// Check the file suffix.
if (strncmp(q, "_qseq.txt", 8) != 0)
- return 0;
+ return 0;
// Make sure it is not the paired-end file
p = file;
p += 3;
if (strncmp(p, "_1_", 3) != 0)
- return 0;
+ return 0;
//
// Return the position of the paired-end number, so the other file name can be generated.
@@ -81,31 +81,31 @@ int parse_illumina_v2(const char *file) {
for (q = file; *q != '\0'; q++);
for (p = q; *p != '.' && p > file; p--);
if (strncmp(p, ".gz", 3) == 0)
- for (p--; *p != '.' && p > file; p--);
+ for (p--; *p != '.' && p > file; p--);
if (strncmp(p, ".fastq", 6) != 0)
- return 0;
+ return 0;
//
// Find the part of the name marking the pair, "_R1_", make sure it is not the paired-end file.
//
p = file;
while (*p != '\0') {
- for (; *p != '_' && *p != '\0'; p++);
- if (*p == '\0') return 0;
- if (strncmp(p, "_R1_", 4) == 0) {
- //
- // Return the position of the paired-end number, so the other file name can be generated.
- //
- return (p + 2 - file);
- }
- p++;
+ for (; *p != '_' && *p != '\0'; p++);
+ if (*p == '\0') return 0;
+ if (strncmp(p, "_R1_", 4) == 0) {
+ //
+ // Return the position of the paired-end number, so the other file name can be generated.
+ //
+ return (p + 2 - file);
+ }
+ p++;
}
return 0;
}
-int
-parse_input_record(Seq *s, Read *r)
+int
+parse_input_record(Seq *s, Read *r)
{
char *p, *q, *z;
uint lim;
@@ -126,186 +126,186 @@ parse_input_record(Seq *s, Read *r)
int hash_cnt = 0;
for (p = s->id, q = p; q < stop; q++) {
- colon_cnt += *q == ':' ? 1 : 0;
- hash_cnt += *q == '#' ? 1 : 0;
+ colon_cnt += *q == ':' ? 1 : 0;
+ hash_cnt += *q == '#' ? 1 : 0;
}
if (colon_cnt == 9 && hash_cnt == 0) {
- r->fastq_type = illv2_fastq;
- //
- // According to Illumina manual, "CASAVA v1.8 User Guide" page 41:
- // @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>
- //
- for (p = s->id, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- strcpy(r->machine, p);
- *q = ':';
- }
-
- // Run number.
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- //*q = '\0';
-
- // Flowcell ID.
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- //*q = '\0';
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->lane = atoi(p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->tile = atoi(p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->x = atoi(p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != ' ' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->y = atoi(p);
- *q = ' ';
- }
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- // r->read = atoi(p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->filter = *p == 'Y' ? true : false;
- *q = ':';
- }
-
- // Control Number.
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- //*q = '\0';
-
- //
- // Index barcode
- //
- // The index barcode appears identically in both single-end and paired-end reads.
- // If the barcode type is index_index, the barcode will appear as NNNNNN+NNNNNN
- // in both reads. If the specified barcode type is null_index we want to read only
- // the second half of the index, if the type is index_null, we want to read
- // only the first half, or the full string if there is no '+' character.
- //
- if (q < stop)
- for (p = q+1, q = p; q < stop; q++);
- else
- p = q;
-
- if (*p != '\0') {
- //
- // Check if there is a '+' character.
- //
- for (z = p; *z != '+' && *z != '\0'; z++);
-
- if (r->read == 1) {
- lim = z - p;
-
- switch (barcode_type) {
- case index_null:
- case index_index:
- case index_inline:
- lim = lim < max_bc_size_1 ? lim : max_bc_size_1;
- strncpy(r->index_bc, p, lim);
- r->index_bc[lim] = '\0';
- break;
- case inline_index:
- lim = lim < max_bc_size_2 ? lim : max_bc_size_2;
- strncpy(r->index_bc, p, lim);
- r->index_bc[lim] = '\0';
- break;
- default:
- break;
- }
- } else if (r->read == 2) {
- if (*z == '+')
- p = z + 1;
-
- switch (barcode_type) {
- case null_index:
- case index_index:
- case inline_index:
- strncpy(r->index_bc, p, max_bc_size_2);
- r->index_bc[max_bc_size_2] = '\0';
- break;
- default:
- break;
- }
- }
- }
+ r->fastq_type = illv2_fastq;
+ //
+ // According to Illumina manual, "CASAVA v1.8 User Guide" page 41:
+ // @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>
+ //
+ for (p = s->id, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ strcpy(r->machine, p);
+ *q = ':';
+ }
+
+ // Run number.
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ //*q = '\0';
+
+ // Flowcell ID.
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ //*q = '\0';
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->lane = atoi(p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->tile = atoi(p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->x = atoi(p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != ' ' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->y = atoi(p);
+ *q = ' ';
+ }
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ // r->read = atoi(p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->filter = *p == 'Y' ? true : false;
+ *q = ':';
+ }
+
+ // Control Number.
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ //*q = '\0';
+
+ //
+ // Index barcode
+ //
+ // The index barcode appears identically in both single-end and paired-end reads.
+ // If the barcode type is index_index, the barcode will appear as NNNNNN+NNNNNN
+ // in both reads. If the specified barcode type is null_index we want to read only
+ // the second half of the index, if the type is index_null, we want to read
+ // only the first half, or the full string if there is no '+' character.
+ //
+ if (q < stop)
+ for (p = q+1, q = p; q < stop; q++);
+ else
+ p = q;
+
+ if (*p != '\0') {
+ //
+ // Check if there is a '+' character.
+ //
+ for (z = p; *z != '+' && *z != '\0'; z++);
+
+ if (r->read == 1) {
+ lim = z - p;
+
+ switch (barcode_type) {
+ case index_null:
+ case index_index:
+ case index_inline:
+ lim = lim < max_bc_size_1 ? lim : max_bc_size_1;
+ strncpy(r->index_bc, p, lim);
+ r->index_bc[lim] = '\0';
+ break;
+ case inline_index:
+ lim = lim < max_bc_size_2 ? lim : max_bc_size_2;
+ strncpy(r->index_bc, p, lim);
+ r->index_bc[lim] = '\0';
+ break;
+ default:
+ break;
+ }
+ } else if (r->read == 2) {
+ if (*z == '+')
+ p = z + 1;
+
+ switch (barcode_type) {
+ case null_index:
+ case index_index:
+ case inline_index:
+ strncpy(r->index_bc, p, max_bc_size_2);
+ r->index_bc[max_bc_size_2] = '\0';
+ break;
+ default:
+ break;
+ }
+ }
+ }
} else if (colon_cnt == 4 && hash_cnt == 1) {
- r->fastq_type = illv1_fastq;
-
- for (p = s->id, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- strcpy(r->machine, p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->lane = atoi(p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->tile = atoi(p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != ':' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->x = atoi(p);
- *q = ':';
- }
-
- for (p = q+1, q = p; *q != '#' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->y = atoi(p);
- *q = '#';
- }
-
- for (p = q+1, q = p; *q != '/' && q < stop; q++);
- if (q < stop) {
- *q = '\0';
- r->index = atoi(p);
- *q = '/';
- }
-
- for (p = q+1, q = p; *q != '\0' && q < stop; q++);
- // r->read = atoi(p);
+ r->fastq_type = illv1_fastq;
+
+ for (p = s->id, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ strcpy(r->machine, p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->lane = atoi(p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->tile = atoi(p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != ':' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->x = atoi(p);
+ *q = ':';
+ }
+
+ for (p = q+1, q = p; *q != '#' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->y = atoi(p);
+ *q = '#';
+ }
+
+ for (p = q+1, q = p; *q != '/' && q < stop; q++);
+ if (q < stop) {
+ *q = '\0';
+ r->index = atoi(p);
+ *q = '/';
+ }
+
+ for (p = q+1, q = p; *q != '\0' && q < stop; q++);
+ // r->read = atoi(p);
} else {
- r->fastq_type = generic_fastq;
+ r->fastq_type = generic_fastq;
- strncpy(r->machine, s->id, id_len);
- r->machine[id_len] = '\0';
+ strncpy(r->machine, s->id, id_len);
+ r->machine[id_len] = '\0';
}
uint len = strlen(s->seq);
@@ -314,34 +314,34 @@ parse_input_record(Seq *s, Read *r)
// Resize the sequence/phred buffers if necessary.
//
if (len > r->size - 1)
- r->resize(len + 1);
+ r->resize(len + 1);
- strncpy(r->seq, s->seq, r->size - 1);
+ strncpy(r->seq, s->seq, r->size - 1);
strncpy(r->phred, s->qual, r->size - 1);
r->seq[r->size - 1] = '\0';
r->phred[r->size - 1] = '\0';
r->len = len;
if (r->read == 1) {
- switch (barcode_type) {
- case inline_null:
- case inline_inline:
- case inline_index:
- strncpy(r->inline_bc, r->seq, max_bc_size_1);
- r->inline_bc[max_bc_size_1] = '\0';
- break;
- case index_inline:
- strncpy(r->inline_bc, r->seq, max_bc_size_2);
- r->inline_bc[max_bc_size_2] = '\0';
- break;
- default:
- break;
- }
+ switch (barcode_type) {
+ case inline_null:
+ case inline_inline:
+ case inline_index:
+ strncpy(r->inline_bc, r->seq, max_bc_size_1);
+ r->inline_bc[max_bc_size_1] = '\0';
+ break;
+ case index_inline:
+ strncpy(r->inline_bc, r->seq, max_bc_size_2);
+ r->inline_bc[max_bc_size_2] = '\0';
+ break;
+ default:
+ break;
+ }
} else if (r->read == 2 &&
- (barcode_type == inline_inline ||
- barcode_type == index_inline)) {
- strncpy(r->inline_bc, r->seq, max_bc_size_2);
- r->inline_bc[max_bc_size_2] = '\0';
+ (barcode_type == inline_inline ||
+ barcode_type == index_inline)) {
+ strncpy(r->inline_bc, r->seq, max_bc_size_2);
+ r->inline_bc[max_bc_size_2] = '\0';
}
r->retain = 1;
@@ -349,8 +349,8 @@ parse_input_record(Seq *s, Read *r)
return 0;
}
-int
-rev_complement(char *seq, int offset, bool overhang)
+int
+rev_complement(char *seq, int offset, bool overhang)
{
char *p, *q;
@@ -359,8 +359,8 @@ rev_complement(char *seq, int offset, bool overhang)
int len = strlen(q);
int j = 0;
- char *com = new char[len + 1];
-
+ char *com = new char[len + 1];
+
for (p = q + len - 1; p >= q; p--) {
switch (*p) {
case 'A':
@@ -385,15 +385,15 @@ rev_complement(char *seq, int offset, bool overhang)
com[len] = '\0';
for (j = 0; j < len; j++)
- q[j] = com[j];
+ q[j] = com[j];
delete [] com;
return 0;
}
-int
-reverse_qual(char *qual, int offset, bool overhang)
+int
+reverse_qual(char *qual, int offset, bool overhang)
{
char *p, *q;
@@ -402,16 +402,16 @@ reverse_qual(char *qual, int offset, bool overhang)
int len = strlen(q);
int j = 0;
- char *com = new char[len + 1];
-
+ char *com = new char[len + 1];
+
for (p = q + len - 1; p >= q; p--) {
- com[j] = *p;
+ com[j] = *p;
j++;
}
com[len] = '\0';
for (j = 0; j < len; j++)
- q[j] = com[j];
+ q[j] = com[j];
delete [] com;
@@ -421,8 +421,8 @@ reverse_qual(char *qual, int offset, bool overhang)
//
// Functions for quality filtering based on phred scores.
//
-int
-check_quality_scores(Read *href, int qual_offset, int score_limit, int len_limit, int offset)
+int
+check_quality_scores(Read *href, int qual_offset, int score_limit, int len_limit, int offset)
{
//
// Phred quality scores are discussed here:
@@ -445,7 +445,7 @@ check_quality_scores(Read *href, int qual_offset, int score_limit, int len_limit
// cerr << "Integer scores: ";
for (uint j = 0; j < href->len; j++) {
href->int_scores[j] = href->phred[j] - qual_offset;
- // cerr << href->int_scores[j] << " ";
+ // cerr << href->int_scores[j] << " ";
}
// cerr << "\n";
@@ -458,7 +458,7 @@ check_quality_scores(Read *href, int qual_offset, int score_limit, int len_limit
// Populate the sliding window.
//
for (j = offset; j < href->win_len + offset; j++)
- working_sum += href->int_scores[j];
+ working_sum += href->int_scores[j];
// cerr << "Populating the sliding window using position " << offset << " to " << href->win_len + offset - 1 << "; initial working sum: " << working_sum << "\n";
@@ -472,56 +472,56 @@ check_quality_scores(Read *href, int qual_offset, int score_limit, int len_limit
// cerr << "Setting pointers; P: " << (href->int_scores + offset) - href->int_scores << "; Q: " << p + (int) href->win_len - p << "; J: " << j << "\n";
do {
- mean = working_sum / href->win_len;
-
- // cerr << "J: " << j << "; Window contents: ";
- // for (int *r = p; r < q; r++)
- // cerr << *r << " ";
- // cerr << "\n";
- // cerr << " Mean: " << mean << "\n";
-
- if (mean < score_limit) {
-
- if (j < len_limit) {
- return 0;
- } else {
- href->len = j + 1;
- href->seq[j] = '\0';
- href->phred[j] = '\0';
- return -1;
- }
- }
-
- //
- // Advance the window:
- // Add the score from the front edge of the window, subtract the score
- // from the back edge of the window.
- //
- working_sum -= (double) *p;
- working_sum += (double) *q;
-
- // cerr << " Removing value of p: " << *p << " (position: " << p - (href->int_scores) << ")\n";
- // cerr << " Adding value of q: " << *q << " (position: " << q - (href->int_scores) << ")\n";
-
- p++;
- q++;
- j++;
+ mean = working_sum / href->win_len;
+
+ // cerr << "J: " << j << "; Window contents: ";
+ // for (int *r = p; r < q; r++)
+ // cerr << *r << " ";
+ // cerr << "\n";
+ // cerr << " Mean: " << mean << "\n";
+
+ if (mean < score_limit) {
+
+ if (j < len_limit) {
+ return 0;
+ } else {
+ href->len = j + 1;
+ href->seq[j] = '\0';
+ href->phred[j] = '\0';
+ return -1;
+ }
+ }
+
+ //
+ // Advance the window:
+ // Add the score from the front edge of the window, subtract the score
+ // from the back edge of the window.
+ //
+ working_sum -= (double) *p;
+ working_sum += (double) *q;
+
+ // cerr << " Removing value of p: " << *p << " (position: " << p - (href->int_scores) << ")\n";
+ // cerr << " Adding value of q: " << *q << " (position: " << q - (href->int_scores) << ")\n";
+
+ p++;
+ q++;
+ j++;
} while (j <= href->stop_pos);
return 1;
}
-bool
-correct_barcode(set<string> &bcs, Read *href, seqt type, int num_errs)
+bool
+correct_barcode(set<string> &bcs, Read *href, seqt type, int num_errs)
{
if (recover == false)
- return false;
+ return false;
//
// The barcode_dist variable specifies how far apart in sequence space barcodes are. If barcodes
// are off by two nucleotides in sequence space, than we can correct barcodes that have a single
- // sequencing error.
- //
+ // sequencing error.
+ //
// If the barcode sequence is off by no more than barcodes_dist-1 nucleotides, correct it. We will
// search the whole possible space of barcodes if more than one length of barcode was specified.
//
@@ -535,41 +535,41 @@ correct_barcode(set<string> &bcs, Read *href, seqt type, int num_errs)
for (it = bcs.begin(); it != bcs.end(); it++) {
- //
- // Copy the proper subset of the barcode to match the length of the barcode in the bcs set.
- //
- strncpy(bc, type == single_end ? href->se_bc : href->pe_bc, it->length());
- bc[it->length()] = '\0';
-
- d = 0;
- for (p = it->c_str(), q = bc; *p != '\0'; p++, q++)
- if (*p != *q) d++;
-
- if (d <= num_errs) {
- close++;
- b = *it;
- break;
- }
+ //
+ // Copy the proper subset of the barcode to match the length of the barcode in the bcs set.
+ //
+ strncpy(bc, type == single_end ? href->se_bc : href->pe_bc, it->length());
+ bc[it->length()] = '\0';
+
+ d = 0;
+ for (p = it->c_str(), q = bc; *p != '\0'; p++, q++)
+ if (*p != *q) d++;
+
+ if (d <= num_errs) {
+ close++;
+ b = *it;
+ break;
+ }
}
if (close == 1) {
- //
- // Correct the barcode.
- //
- if (type == single_end) {
- strcpy(href->se_bc, b.c_str());
- if (barcode_type == inline_null ||
- barcode_type == inline_index ||
- barcode_type == inline_inline)
- href->inline_bc_len = b.length();
- } else {
- strcpy(href->pe_bc, b.c_str());
- if (barcode_type == index_inline ||
- barcode_type == inline_inline)
- href->inline_bc_len = b.length();
- }
-
- return true;
+ //
+ // Correct the barcode.
+ //
+ if (type == single_end) {
+ strcpy(href->se_bc, b.c_str());
+ if (barcode_type == inline_null ||
+ barcode_type == inline_index ||
+ barcode_type == inline_inline)
+ href->inline_bc_len = b.length();
+ } else {
+ strcpy(href->pe_bc, b.c_str());
+ if (barcode_type == index_inline ||
+ barcode_type == inline_inline)
+ href->inline_bc_len = b.length();
+ }
+
+ return true;
}
return false;
@@ -587,17 +587,17 @@ init_adapter_seq(int kmer_size, char *adapter, int &adp_len, AdapterHash &kmers)
int num_kmers = adp_len - kmer_size + 1;
char *p = adapter;
for (int i = 0; i < num_kmers; i++) {
- kmer.assign(p, kmer_size);
- kmers[kmer].push_back(i);
- p++;
+ kmer.assign(p, kmer_size);
+ kmers[kmer].push_back(i);
+ p++;
}
return 0;
}
-int
+int
filter_adapter_seq(Read *href, char *adapter, int adp_len, AdapterHash &adp_kmers,
- int kmer_size, int distance, int len_limit)
+ int kmer_size, int distance, int len_limit)
{
vector<pair<int, int> > hits;
int num_kmers = href->len - kmer_size + 1;
@@ -608,65 +608,65 @@ filter_adapter_seq(Read *href, char *adapter, int adp_len, AdapterHash &adp_kmer
// Identify matching kmers and their locations of occurance.
//
for (int i = 0; i < num_kmers; i++) {
- kmer.assign(p, kmer_size);
-
- if (adp_kmers.count(kmer) > 0) {
- for (uint j = 0; j < adp_kmers[kmer].size(); j++) {
- // cerr << "Kmer hit " << kmer << " at query position " << i << " at hit position " << adp_kmers[kmer][j] << "\n";
- hits.push_back(make_pair(i, adp_kmers[kmer][j]));
- }
- }
- p++;
+ kmer.assign(p, kmer_size);
+
+ if (adp_kmers.count(kmer) > 0) {
+ for (uint j = 0; j < adp_kmers[kmer].size(); j++) {
+ // cerr << "Kmer hit " << kmer << " at query position " << i << " at hit position " << adp_kmers[kmer][j] << "\n";
+ hits.push_back(make_pair(i, adp_kmers[kmer][j]));
+ }
+ }
+ p++;
}
//
- // Scan backwards from the position of the k-mer and then scan forwards
+ // Scan backwards from the position of the k-mer and then scan forwards
// counting the number of mismatches.
//
int mismatches, i, j, start_pos;
for (uint k = 0; k < hits.size(); k++) {
- mismatches = 0;
- i = hits[k].first; // Position in query sequence
- j = hits[k].second; // Position in adapter hit
-
- // cerr << "Starting comparison at i: "<< i << "; j: " << j << "\n";
-
- while (i >= 0 && j >= 0) {
- if (href->seq[i] != adapter[j])
- mismatches++;
- i--;
- j--;
- }
-
- if (mismatches > distance)
- continue;
-
- start_pos = i + 1;
- i = hits[k].first;
- j = hits[k].second;
-
- while (i < (int) href->len && j < adp_len && mismatches <= distance) {
- if (href->seq[i] != adapter[j])
- mismatches++;
- i++;
- j++;
- }
-
- // cerr << "Starting position: " << start_pos << "; Query end (i): " << i << "; adapter end (j): " << j
- // << "; number of mismatches: " << mismatches << "; Seq Len: " << href->len << "; SeqSeq Len: " << strlen(href->seq) << "\n";
-
- if (mismatches <= distance && (i == (int) href->len || j == adp_len)) {
- // cerr << " Trimming or dropping.\n";
- if (start_pos < len_limit) {
- return 0;
- } else {
- href->len = start_pos + 1;
- href->seq[start_pos] = '\0';
- href->phred[start_pos] = '\0';
- return -1;
- }
- }
+ mismatches = 0;
+ i = hits[k].first; // Position in query sequence
+ j = hits[k].second; // Position in adapter hit
+
+ // cerr << "Starting comparison at i: "<< i << "; j: " << j << "\n";
+
+ while (i >= 0 && j >= 0) {
+ if (href->seq[i] != adapter[j])
+ mismatches++;
+ i--;
+ j--;
+ }
+
+ if (mismatches > distance)
+ continue;
+
+ start_pos = i + 1;
+ i = hits[k].first;
+ j = hits[k].second;
+
+ while (i < (int) href->len && j < adp_len && mismatches <= distance) {
+ if (href->seq[i] != adapter[j])
+ mismatches++;
+ i++;
+ j++;
+ }
+
+ // cerr << "Starting position: " << start_pos << "; Query end (i): " << i << "; adapter end (j): " << j
+ // << "; number of mismatches: " << mismatches << "; Seq Len: " << href->len << "; SeqSeq Len: " << strlen(href->seq) << "\n";
+
+ if (mismatches <= distance && (i == (int) href->len || j == adp_len)) {
+ // cerr << " Trimming or dropping.\n";
+ if (start_pos < len_limit) {
+ return 0;
+ } else {
+ href->len = start_pos + 1;
+ href->seq[start_pos] = '\0';
+ href->phred[start_pos] = '\0';
+ return -1;
+ }
+ }
}
return 1;
diff --git a/src/clean.h b/src/clean.h
index cf1e364..1774bd7 100644
--- a/src/clean.h
+++ b/src/clean.h
@@ -41,9 +41,9 @@ using std::unordered_map;
enum fastqt {generic_fastq, illv1_fastq, illv2_fastq};
enum barcodet {null_null, null_index,
- inline_null, index_null,
- inline_inline, index_index,
- inline_index, index_inline};
+ inline_null, index_null,
+ inline_inline, index_index,
+ inline_index, index_inline};
enum seqt {single_end, paired_end};
typedef unordered_map<string, vector<int>, std::hash<string> > AdapterHash;
@@ -64,87 +64,87 @@ public:
BarcodePair()
{
- this->se = "";
- this->pe = "";
+ this->se = "";
+ this->pe = "";
}
BarcodePair(char *p)
{
- this->se = string(p);
- this->pe = "";
+ this->se = string(p);
+ this->pe = "";
}
BarcodePair(char *p, char *q, char *n)
{
- if (p != NULL)
- this->se = string(p);
- if (q != NULL)
- this->pe = string(q);
- if (n != NULL)
- this->name = string(n);
+ if (p != NULL)
+ this->se = string(p);
+ if (q != NULL)
+ this->pe = string(q);
+ if (n != NULL)
+ this->name = string(n);
}
BarcodePair(string se, string pe, string name)
{
- this->se = se;
- this->pe = pe;
- this->name = name;
+ this->se = se;
+ this->pe = pe;
+ this->name = name;
}
BarcodePair(string se)
{
- this->se = se;
- this->pe = "";
+ this->se = se;
+ this->pe = "";
}
void set(char *p, char *q)
{
- this->se = string(p);
- this->pe = string(q);
+ this->se = string(p);
+ this->pe = string(q);
}
void set(char *p)
{
- this->se = string(p);
- this->pe = "";
+ this->se = string(p);
+ this->pe = "";
}
void set(string p, string q)
{
- this->se = p;
- this->pe = q;
+ this->se = p;
+ this->pe = q;
}
void set(string p)
{
- this->se = p;
- this->pe = "";
+ this->se = p;
+ this->pe = "";
}
- string str()
+ string str()
{
- if (this->pe.length() > 0)
- return string(this->se + "-" + this->pe);
- else
- return this->se;
+ if (this->pe.length() > 0)
+ return string(this->se + "-" + this->pe);
+ else
+ return this->se;
}
bool name_exists()
{
- if (this->name.length() > 0)
- return true;
- return false;
+ if (this->name.length() > 0)
+ return true;
+ return false;
}
friend bool operator<(const BarcodePair &lhs, const BarcodePair &rhs)
{
- if (lhs.se < rhs.se)
- return true;
- else if (lhs.se == rhs.se && lhs.pe < rhs.pe)
- return true;
- else
- return false;
+ if (lhs.se < rhs.se)
+ return true;
+ else if (lhs.se == rhs.se && lhs.pe < rhs.pe)
+ return true;
+ else
+ return false;
}
friend bool operator==(const BarcodePair &lhs, const BarcodePair &rhs)
{
- return (lhs.se == rhs.se && lhs.pe == rhs.pe);
+ return (lhs.se == rhs.se && lhs.pe == rhs.pe);
}
friend ofstream& operator<<(ofstream &out, const BarcodePair &bp)
{
- if (bp.pe.length() > 0)
- out << bp.se << "-" << bp.pe;
- else
- out << bp.se;
- return out;
+ if (bp.pe.length() > 0)
+ out << bp.se << "-" << bp.pe;
+ else
+ out << bp.se;
+ return out;
}
};
@@ -174,118 +174,118 @@ public:
double stop_pos;
Read(uint buf_len, int read, int barcode_size, double win_size) {
- this->inline_bc = new char[id_len + 1];
- this->index_bc = new char[id_len + 1];
- this->machine = new char[id_len + 1];
- this->seq = new char[buf_len + 1];
- this->phred = new char[buf_len + 1];
- this->int_scores = new int[buf_len];
- this->size = buf_len + 1;
- this->read = read;
-
- this->retain = 1;
- this->inline_bc_len = 0;
- this->tile = 0;
- this->lane = 0;
- this->x = 0;
- this->y = 0;
- this->index = 0;
- this->len = 0;
-
- this->inline_bc[0] = '\0';
- this->index_bc[0] = '\0';
- this->machine[0] = '\0';
- this->seq[0] = '\0';
- this->phred[0] = '\0';
-
- this->set_len(buf_len);
-
- this->se_bc = NULL;
- this->pe_bc = NULL;
- if (this->read == 1) {
- switch(barcode_type) {
- case index_inline:
- this->se_bc = this->index_bc;
- this->pe_bc = this->inline_bc;
- break;
- case inline_index:
- this->se_bc = this->inline_bc;
- this->pe_bc = this->index_bc;
- this->inline_bc_len = barcode_size;
- break;
- case inline_null:
- case inline_inline:
- this->se_bc = this->inline_bc;
- this->inline_bc_len = barcode_size;
- break;
- case index_null:
- case index_index:
- this->se_bc = this->index_bc;
- break;
- default:
- break;
- }
- } else if (this->read == 2) {
- switch(barcode_type) {
- case inline_inline:
- case index_inline:
- this->pe_bc = this->inline_bc;
- this->inline_bc_len = barcode_size;
- break;
- case index_index:
- case inline_index:
- this->pe_bc = this->index_bc;
- break;
- default:
- break;
- }
- }
+ this->inline_bc = new char[id_len + 1];
+ this->index_bc = new char[id_len + 1];
+ this->machine = new char[id_len + 1];
+ this->seq = new char[buf_len + 1];
+ this->phred = new char[buf_len + 1];
+ this->int_scores = new int[buf_len];
+ this->size = buf_len + 1;
+ this->read = read;
+
+ this->retain = 1;
+ this->inline_bc_len = 0;
+ this->tile = 0;
+ this->lane = 0;
+ this->x = 0;
+ this->y = 0;
+ this->index = 0;
+ this->len = 0;
+
+ this->inline_bc[0] = '\0';
+ this->index_bc[0] = '\0';
+ this->machine[0] = '\0';
+ this->seq[0] = '\0';
+ this->phred[0] = '\0';
+
+ this->set_len(buf_len);
+
+ this->se_bc = NULL;
+ this->pe_bc = NULL;
+ if (this->read == 1) {
+ switch(barcode_type) {
+ case index_inline:
+ this->se_bc = this->index_bc;
+ this->pe_bc = this->inline_bc;
+ break;
+ case inline_index:
+ this->se_bc = this->inline_bc;
+ this->pe_bc = this->index_bc;
+ this->inline_bc_len = barcode_size;
+ break;
+ case inline_null:
+ case inline_inline:
+ this->se_bc = this->inline_bc;
+ this->inline_bc_len = barcode_size;
+ break;
+ case index_null:
+ case index_index:
+ this->se_bc = this->index_bc;
+ break;
+ default:
+ break;
+ }
+ } else if (this->read == 2) {
+ switch(barcode_type) {
+ case inline_inline:
+ case index_inline:
+ this->pe_bc = this->inline_bc;
+ this->inline_bc_len = barcode_size;
+ break;
+ case index_index:
+ case inline_index:
+ this->pe_bc = this->index_bc;
+ break;
+ default:
+ break;
+ }
+ }
}
~Read() {
- delete [] this->inline_bc;
- delete [] this->index_bc;
- delete [] this->machine;
- delete [] this->seq;
- delete [] this->phred;
- delete [] this->int_scores;
+ delete [] this->inline_bc;
+ delete [] this->index_bc;
+ delete [] this->machine;
+ delete [] this->seq;
+ delete [] this->phred;
+ delete [] this->int_scores;
}
int resize(int size) {
- delete [] this->seq;
- delete [] this->phred;
- delete [] this->int_scores;
- this->size = size;
- this->seq = new char[this->size];
- this->phred = new char[this->size];
- this->int_scores = new int[this->size - 1];
+ delete [] this->seq;
+ delete [] this->phred;
+ delete [] this->int_scores;
+ this->size = size;
+ this->seq = new char[this->size];
+ this->phred = new char[this->size];
+ this->int_scores = new int[this->size - 1];
- this->set_len(size - 1);
+ this->set_len(size - 1);
- return 0;
+ return 0;
}
int set_len(uint buf_len) {
- if (buf_len == this->len)
- return 0;
+ if (buf_len == this->len)
+ return 0;
- if (buf_len > this->size - 1)
- buf_len = this->size - 1;
+ if (buf_len > this->size - 1)
+ buf_len = this->size - 1;
- this->seq[buf_len] = '\0';
- this->phred[buf_len] = '\0';
+ this->seq[buf_len] = '\0';
+ this->phred[buf_len] = '\0';
- //
- // Set the parameters for checking read quality later in processing.
- // Window length is 15% (rounded) of the sequence length.
- //
- this->len = buf_len - this->inline_bc_len;
- this->win_len = round((double) this->len * win_size);
+ //
+ // Set the parameters for checking read quality later in processing.
+ // Window length is 15% (rounded) of the sequence length.
+ //
+ this->len = buf_len - this->inline_bc_len;
+ this->win_len = round((double) this->len * win_size);
- if (this->win_len < 1)
- this->win_len = 1;
+ if (this->win_len < 1)
+ this->win_len = 1;
- this->len += this->inline_bc_len;
- this->stop_pos = this->len - this->win_len;
+ this->len += this->inline_bc_len;
+ this->stop_pos = this->len - this->win_len;
- return 0;
+ return 0;
}
};
@@ -306,14 +306,14 @@ int check_quality_scores(Read *, int, int, int, int);
// Templated function to process barcodes.
//
template<typename fhType>
-int
-process_barcode(Read *href_1, Read *href_2, BarcodePair &bc,
- map<BarcodePair, fhType *> &fhs,
- set<string> &se_bc, set<string> &pe_bc,
- map<BarcodePair, map<string, long> > &barcode_log, map<string, long> &counter)
+int
+process_barcode(Read *href_1, Read *href_2, BarcodePair &bc,
+ map<BarcodePair, fhType *> &fhs,
+ set<string> &se_bc, set<string> &pe_bc,
+ map<BarcodePair, map<string, long> > &barcode_log, map<string, long> &counter)
{
if (barcode_type == null_null)
- return 0;
+ return 0;
//
// Is this a legitimate barcode? The barcode passed into this function is the maximally long
@@ -331,39 +331,39 @@ process_barcode(Read *href_1, Read *href_2, BarcodePair &bc,
p = bc_1 + max_bc_size_1; // Point p at the end of string NULL.
for (uint i = max_bc_size_1; i >= min_bc_size_1; i--)
- if (se_bc.count(bc_1) > 0) {
- valid_se_bc = true;
- break;
- } else {
- p--;
- *p = '\0';
- }
+ if (se_bc.count(bc_1) > 0) {
+ valid_se_bc = true;
+ break;
+ } else {
+ p--;
+ *p = '\0';
+ }
if (pe_bc.size() > 0) {
- p = bc_2 + max_bc_size_2; // Point p at the end of string NULL.
- for (uint i = max_bc_size_2; i >= min_bc_size_2; i--)
- if (pe_bc.count(bc_2) > 0) {
- valid_pe_bc = true;
- break;
- } else {
- p--;
- *p = '\0';
- }
+ p = bc_2 + max_bc_size_2; // Point p at the end of string NULL.
+ for (uint i = max_bc_size_2; i >= min_bc_size_2; i--)
+ if (pe_bc.count(bc_2) > 0) {
+ valid_pe_bc = true;
+ break;
+ } else {
+ p--;
+ *p = '\0';
+ }
}
if (valid_se_bc == true && valid_pe_bc == true)
- bc.set(bc_1, bc_2);
+ bc.set(bc_1, bc_2);
else if (valid_se_bc == true)
- bc.se = bc_1;
+ bc.se = bc_1;
else if (valid_pe_bc == true)
- bc.pe = bc_2;
+ bc.pe = bc_2;
//
// Log the barcodes we receive.
//
if (barcode_log.count(bc) == 0) {
- barcode_log[bc]["noradtag"] = 0;
- barcode_log[bc]["total"] = 0;
- barcode_log[bc]["low_qual"] = 0;
- barcode_log[bc]["retained"] = 0;
+ barcode_log[bc]["noradtag"] = 0;
+ barcode_log[bc]["total"] = 0;
+ barcode_log[bc]["low_qual"] = 0;
+ barcode_log[bc]["retained"] = 0;
}
barcode_log[bc]["total"] += paired ? 2 : 1;
@@ -371,40 +371,40 @@ process_barcode(Read *href_1, Read *href_2, BarcodePair &bc,
// If we have a perfectly matching barcode, set the barcode and length in the right places.
//
if (pe_bc.size() > 0 && valid_se_bc == true && valid_pe_bc == true) {
- if (fhs.count(bc) > 0) {
- if (paired) {
- strcpy(href_1->se_bc, bc_1);
- strcpy(href_2->pe_bc, bc_2);
- } else {
- strcpy(href_1->se_bc, bc_1);
- strcpy(href_1->pe_bc, bc_2);
- }
-
- if (barcode_type == inline_index ||
- barcode_type == inline_inline)
- href_1->inline_bc_len = strlen(bc_1);
- if (barcode_type == index_inline ||
- barcode_type == inline_inline)
- href_2->inline_bc_len = strlen(bc_2);
- return 0;
- }
+ if (fhs.count(bc) > 0) {
+ if (paired) {
+ strcpy(href_1->se_bc, bc_1);
+ strcpy(href_2->pe_bc, bc_2);
+ } else {
+ strcpy(href_1->se_bc, bc_1);
+ strcpy(href_1->pe_bc, bc_2);
+ }
+
+ if (barcode_type == inline_index ||
+ barcode_type == inline_inline)
+ href_1->inline_bc_len = strlen(bc_1);
+ if (barcode_type == index_inline ||
+ barcode_type == inline_inline)
+ href_2->inline_bc_len = strlen(bc_2);
+ return 0;
+ }
} else if (valid_se_bc == true) {
- strcpy(href_1->se_bc, bc_1);
- if (barcode_type == inline_null ||
- barcode_type == inline_index ||
- barcode_type == inline_inline)
- href_1->inline_bc_len = strlen(bc_1);
+ strcpy(href_1->se_bc, bc_1);
+ if (barcode_type == inline_null ||
+ barcode_type == inline_index ||
+ barcode_type == inline_inline)
+ href_1->inline_bc_len = strlen(bc_1);
} else if (valid_pe_bc == true) {
- if (paired)
- strcpy(href_2->pe_bc, bc_2);
- else
- strcpy(href_1->pe_bc, bc_2);
-
- if (barcode_type == index_inline ||
- barcode_type == inline_inline)
- href_2->inline_bc_len = strlen(bc_2);
+ if (paired)
+ strcpy(href_2->pe_bc, bc_2);
+ else
+ strcpy(href_1->pe_bc, bc_2);
+
+ if (barcode_type == index_inline ||
+ barcode_type == inline_inline)
+ href_2->inline_bc_len = strlen(bc_2);
}
//
@@ -415,52 +415,52 @@ process_barcode(Read *href_1, Read *href_2, BarcodePair &bc,
bool pe_correct = false;
if (paired) {
- if (se_bc.count(bc.se) == 0)
- se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
- if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
- pe_correct = correct_barcode(pe_bc, href_2, paired_end, barcode_dist_2);
-
- if (se_correct)
- bc.se = string(href_1->se_bc);
- if (pe_bc.size() > 0 && pe_correct)
- bc.pe = string(href_2->pe_bc);
-
- //
- // After correcting the individual barcodes, check if the combination is valid.
- //
- if (fhs.count(bc) == 0) {
- counter["ambiguous"] += 2;
- href_1->retain = 0;
- href_2->retain = 0;
- }
+ if (se_bc.count(bc.se) == 0)
+ se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
+ if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
+ pe_correct = correct_barcode(pe_bc, href_2, paired_end, barcode_dist_2);
+
+ if (se_correct)
+ bc.se = string(href_1->se_bc);
+ if (pe_bc.size() > 0 && pe_correct)
+ bc.pe = string(href_2->pe_bc);
+
+ //
+ // After correcting the individual barcodes, check if the combination is valid.
+ //
+ if (fhs.count(bc) == 0) {
+ counter["ambiguous"] += 2;
+ href_1->retain = 0;
+ href_2->retain = 0;
+ }
} else {
- if (se_bc.count(bc.se) == 0)
- se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
- if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
- pe_correct = correct_barcode(pe_bc, href_1, paired_end, barcode_dist_2);
-
- if (se_correct)
- bc.se = string(href_1->se_bc);
- if (pe_bc.size() > 0 && pe_correct)
- bc.pe = string(href_1->pe_bc);
-
- if (fhs.count(bc) == 0) {
- counter["ambiguous"]++;
- href_1->retain = 0;
- }
+ if (se_bc.count(bc.se) == 0)
+ se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
+ if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
+ pe_correct = correct_barcode(pe_bc, href_1, paired_end, barcode_dist_2);
+
+ if (se_correct)
+ bc.se = string(href_1->se_bc);
+ if (pe_bc.size() > 0 && pe_correct)
+ bc.pe = string(href_1->pe_bc);
+
+ if (fhs.count(bc) == 0) {
+ counter["ambiguous"]++;
+ href_1->retain = 0;
+ }
}
if (href_1->retain && (se_correct || pe_correct)) {
- counter["recovered"] += paired ? 2 : 1;
- barcode_log[old_barcode]["total"] -= paired ? 2 : 1;
- if (barcode_log.count(bc) == 0) {
- barcode_log[bc]["total"] = 0;
- barcode_log[bc]["retained"] = 0;
- barcode_log[bc]["low_qual"] = 0;
- barcode_log[bc]["noradtag"] = 0;
- }
- barcode_log[bc]["total"] += paired ? 2 : 1;
+ counter["recovered"] += paired ? 2 : 1;
+ barcode_log[old_barcode]["total"] -= paired ? 2 : 1;
+ if (barcode_log.count(bc) == 0) {
+ barcode_log[bc]["total"] = 0;
+ barcode_log[bc]["retained"] = 0;
+ barcode_log[bc]["low_qual"] = 0;
+ barcode_log[bc]["noradtag"] = 0;
+ }
+ barcode_log[bc]["total"] += paired ? 2 : 1;
}
return 0;
diff --git a/src/clone_filter.cc b/src/clone_filter.cc
index fd3de72..05d4b76 100644
--- a/src/clone_filter.cc
+++ b/src/clone_filter.cc
@@ -20,7 +20,7 @@
//
// clone_filter -- find duplicate read pairs and reduce them to one representative
-// pair of sequences in the data set. These reads are assumed to be the product of
+// pair of sequences in the data set. These reads are assumed to be the product of
// PCR amplification.
//
@@ -113,7 +113,7 @@ int main (int argc, char* argv[]) {
cerr << "Searching for inline oligo on single-end read and index oligo (i5 or i7 Illumina read).\n";
break;
}
-
+
map<string, long> counters;
counters["total"] = 0;
counters["red_reads"] = 0;
@@ -187,19 +187,19 @@ int main (int argc, char* argv[]) {
return 0;
}
-int
+int
process_paired_reads_by_sequence(string prefix_1, string prefix_2, map<string, long> &counters,
CloneHash &clone_map, vector<char *> &clone_map_keys)
{
Input *fh_1, *fh_2;
int return_val = 1;
-
+
string path_1 = in_path_1 + prefix_1;
string path_2 = in_path_2 + prefix_2;
- cerr << "Reading data from:\n "
- << path_1 << " and\n "
+ cerr << "Reading data from:\n "
+ << path_1 << " and\n "
<< path_2 << "\n";
switch (in_file_type) {
@@ -273,7 +273,7 @@ process_paired_reads_by_sequence(string prefix_1, string prefix_2, map<string, l
delete s_2;
i++;
- } while ((s_1 = fh_1->next_seq()) != NULL &&
+ } while ((s_1 = fh_1->next_seq()) != NULL &&
(s_2 = fh_2->next_seq()) != NULL);
cerr << "\n";
@@ -291,7 +291,7 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
{
ofstream out_fh_1, out_fh_2, discard_fh_1, discard_fh_2;
gzFile out_gzfh_1, out_gzfh_2, discard_gzfh_1, discard_gzfh_2;
-
+
int return_val = 1;
//
@@ -304,7 +304,7 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
// Open the output files.
//
string suffix_1, suffix_2;
-
+
if (out_file_type == FileT::gzfastq) {
suffix_1 = ".1.fq.gz";
suffix_2 = ".2.fq.gz";
@@ -327,7 +327,7 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
cerr << "Error opening output file '" << path_1 << "'\n";
return -1;
}
- } else {
+ } else {
out_fh_1.open(path_1.c_str(), ifstream::out);
if (out_fh_1.fail()) {
cerr << "Error opening output file '" << path_1 << "'\n";
@@ -343,7 +343,7 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
cerr << "Error opening output file '" << path_2 << "'\n";
return -1;
}
- } else {
+ } else {
out_fh_2.open(path_2.c_str(), ifstream::out);
if (out_fh_2.fail()) {
cerr << "Error opening output file '" << path_2 << "'\n";
@@ -426,7 +426,7 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
out_fh_1 << sstr_1.str();
out_fh_2 << sstr_2.str();
}
-
+
counters["dis_reads"] += map_it->second.size() - 1;
clone_dist[map_it->second.size()]++;
@@ -489,17 +489,17 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
discard_fh_2.close();
}
}
-
+
return return_val;
}
-int
+int
process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counters, OligoHash &oligo_map)
{
Input *fh_1, *fh_2;
Read *r_1, *r_2;
ofstream out_fh_1, out_fh_2, discard_fh_1, discard_fh_2;
gzFile out_gzfh_1, out_gzfh_2, discard_gzfh_1, discard_gzfh_2;
-
+
int return_val = 1;
//
@@ -547,7 +547,7 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
// Open the output files.
//
string suffix_1, suffix_2;
-
+
if (out_file_type == FileT::gzfastq) {
suffix_1 = ".1.fq.gz";
suffix_2 = ".2.fq.gz";
@@ -570,7 +570,7 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
cerr << "Error opening output file '" << path_1 << "'\n";
return -1;
}
- } else {
+ } else {
out_fh_1.open(path_1.c_str(), ifstream::out);
if (out_fh_1.fail()) {
cerr << "Error opening output file '" << path_1 << "'\n";
@@ -586,7 +586,7 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
cerr << "Error opening output file '" << path_2 << "'\n";
return -1;
}
- } else {
+ } else {
out_fh_2.open(path_2.c_str(), ifstream::out);
if (out_fh_2.fail()) {
cerr << "Error opening output file '" << path_2 << "'\n";
@@ -606,7 +606,7 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
cerr << "Error opening discard file '" << path_1 << "'\n";
return -1;
}
- } else {
+ } else {
discard_fh_1.open(path_1.c_str(), ifstream::out);
if (discard_fh_1.fail()) {
cerr << "Error opening discard file '" << path_1 << "'\n";
@@ -622,7 +622,7 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
cerr << "Error opening discard file '" << path_2 << "'\n";
return -1;
}
- } else {
+ } else {
discard_fh_2.open(path_2.c_str(), ifstream::out);
if (discard_fh_2.fail()) {
cerr << "Error opening discard file '" << path_2 << "'\n";
@@ -658,15 +658,15 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
break;
}
-
+
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s_1 = fh_1->next_seq();
Seq *s_2 = fh_2->next_seq();
if (s_1 == NULL || s_2 == NULL) {
- cerr << "Attempting to read first pair of input records, unable to allocate "
+ cerr << "Attempting to read first pair of input records, unable to allocate "
<< "Seq object (Was the correct input type specified?).\n";
exit(1);
}
@@ -801,12 +801,12 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
break;
}
}
-
+
delete s_1;
delete s_2;
i++;
- } while ((s_1 = fh_1->next_seq()) != NULL &&
+ } while ((s_1 = fh_1->next_seq()) != NULL &&
(s_2 = fh_2->next_seq()) != NULL);
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
@@ -824,7 +824,7 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
discard_fh_2.close();
}
}
-
+
delete fh_1;
if (interleaved == false) delete fh_2;
@@ -834,14 +834,14 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
return return_val;
}
-int
+int
process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map)
{
Input *fh_1;
Read *r_1;
ofstream out_fh_1, discard_fh_1;
gzFile out_gzfh_1, discard_gzfh_1;
-
+
int return_val = 1;
//
@@ -859,9 +859,9 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
fh_1 = new GzFastq(path_1.c_str());
break;
case FileT::fasta:
- fh_1 = new Fasta(path_1);
+ fh_1 = new Fasta(path_1);
break;
- case FileT::gzfasta:
+ case FileT::gzfasta:
fh_1 = new GzFasta(path_1);
break;
case FileT::bam:
@@ -877,7 +877,7 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
// Open the output files.
//
string suffix_1;
-
+
if (out_file_type == FileT::gzfastq)
suffix_1 = ".fq.gz";
else if (out_file_type == FileT::fastq)
@@ -889,7 +889,7 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
string file_1 = prefix_1;
int pos = file_1.find_last_of(".");
- if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
+ if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
file_1.substr(pos) == ".gz") {
file_1 = file_1.substr(0, pos);
pos = file_1.find_last_of(".");
@@ -901,7 +901,7 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
cerr << "Error opening output file '" << path_1 << "'\n";
return -1;
}
- } else {
+ } else {
out_fh_1.open(path_1.c_str(), ifstream::out);
if (out_fh_1.fail()) {
cerr << "Error opening output file '" << path_1 << "'\n";
@@ -921,7 +921,7 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
cerr << "Error opening discard file '" << path_1 << "'\n";
return -1;
}
- } else {
+ } else {
discard_fh_1.open(path_1.c_str(), ifstream::out);
if (discard_fh_1.fail()) {
cerr << "Error opening discard file '" << path_1 << "'\n";
@@ -944,14 +944,14 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
offset_1 = 0;
break;
}
-
+
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s_1 = fh_1->next_seq();
if (s_1 == NULL) {
- cerr << "Attempting to read first pair of input records, unable to allocate "
+ cerr << "Attempting to read first pair of input records, unable to allocate "
<< "Seq object (Was the correct input type specified?).\n";
exit(1);
}
@@ -1014,7 +1014,7 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
default:
break;
}
-
+
if (!result_1) {
cerr << "Error writing to output file for '" << file_1 << "'\n";
return_val = -1;
@@ -1058,7 +1058,7 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
out_fh_1.close();
if (discards) discard_fh_1.close();
}
-
+
delete fh_1;
delete r_1;
@@ -1066,8 +1066,8 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
return return_val;
}
-int
-free_hash(vector<char *> &keys)
+int
+free_hash(vector<char *> &keys)
{
for (uint i = 0; i < keys.size(); i++) {
delete [] keys[i];
@@ -1106,7 +1106,7 @@ int parse_command_line(int argc, char* argv[]) {
{"retain_oligo", required_argument, NULL, 'R'},
{0, 0, 0, 0}
};
-
+
// getopt_long stores the option index here.
int option_index = 0;
@@ -1207,7 +1207,7 @@ int parse_command_line(int argc, char* argv[]) {
// getopt_long already printed an error message.
help();
break;
-
+
default:
cerr << "Unknown command line option '" << (char) c << "'\n";
help();
@@ -1235,16 +1235,16 @@ int parse_command_line(int argc, char* argv[]) {
help();
}
- if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
+ if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
in_path_1 += "/";
- if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
+ if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
in_path_2 += "/";
- if (out_path.length() == 0)
+ if (out_path.length() == 0)
out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
+ if (out_path.at(out_path.length() - 1) != '/')
out_path += "/";
if (in_file_type == FileT::unknown)
@@ -1259,7 +1259,7 @@ int parse_command_line(int argc, char* argv[]) {
cerr << "You must specify the length of the oligo sequences (--oligo_len_1 / --oligo_len_2).\n";
help();
}
-
+
return 0;
}
diff --git a/src/clone_filter.h b/src/clone_filter.h
index 5a93f44..07c775b 100644
--- a/src/clone_filter.h
+++ b/src/clone_filter.h
@@ -21,7 +21,7 @@
#ifndef __CLONE_FILTER_H__
#define __CLONE_FILTER_H__
-#include "constants.h"
+#include "constants.h"
#include <stdlib.h>
#include <getopt.h> // Process command-line options
@@ -67,7 +67,7 @@ using google::sparse_hash_map;
#include "file_io.h"
#include "write.h"
-class Pair {
+class Pair {
public:
string p1_id;
string p2_id;
@@ -75,14 +75,14 @@ public:
string p2_qual;
Pair(string p1_id, string p2_id, string p1_qual, string p2_qual) {
- this->p1_id = p1_id;
- this->p2_id = p2_id;
- this->p1_qual = p1_qual;
- this->p2_qual = p2_qual;
+ this->p1_id = p1_id;
+ this->p2_id = p2_id;
+ this->p1_qual = p1_qual;
+ this->p2_qual = p2_qual;
}
Pair(string p1_id, string p2_id) {
- this->p1_id = p1_id;
- this->p2_id = p2_id;
+ this->p1_id = p1_id;
+ this->p2_id = p2_id;
}
};
diff --git a/src/cmb.cc b/src/cmb.cc
index 47bea0e..6859d34 100644
--- a/src/cmb.cc
+++ b/src/cmb.cc
@@ -30,7 +30,7 @@
#include "cmb.h"
//
-// A cache to store combinations generated as (N choose k)
+// A cache to store combinations generated as (N choose k)
// for all set sizes encountered
//
map<int, map<int, int **> > _cmbs;
@@ -69,7 +69,7 @@ CombSet::CombSet(int n, int k, MinSpanTree *tree) {
//
// Add the initial combination: the empty set
//
- if (_cmbs.count(this->num_elements) == 0 &&
+ if (_cmbs.count(this->num_elements) == 0 &&
_cmbs[this->num_elements].count(0) == 0) {
//cerr << " N: " << this->num_elements << "; K: 0; Total elements: 0\n";
comb = new int * [2];
@@ -83,7 +83,7 @@ CombSet::CombSet(int n, int k, MinSpanTree *tree) {
//
// Check if this set of combinations is already cached.
//
- if (_cmbs.count(this->num_elements) > 0 &&
+ if (_cmbs.count(this->num_elements) > 0 &&
_cmbs[this->num_elements].count(set_size) > 0) {
set_size--;
continue;
@@ -302,7 +302,7 @@ int **CombSet::generate_combinations(int n, int k, int total) {
for (int i = 0; i < k; i++)
comb[comb_num][i] = i;
comb_num++;
-
+
//
// Generate each successive combination
//
@@ -319,7 +319,7 @@ int **CombSet::generate_combinations(int n, int k, int total) {
int CombSet::next_combination(int *comb, int n, int k) {
int i;
- //
+ //
// The zero'th position has been incremented to its maximal value,
// it's not possible to further increment values in the set.
//
@@ -333,7 +333,7 @@ int CombSet::next_combination(int *comb, int n, int k) {
comb[i]++;
//
- // Check if the last position has reached its maximal possible value,
+ // Check if the last position has reached its maximal possible value,
// if so, move back one position, and increment it.
//
while ((i > 0) && (comb[i] >= n - k + 1 + i)) {
@@ -353,7 +353,7 @@ int CombSet::next_combination(int *comb, int n, int k) {
long int CombSet::num_combinations(int n, int k) {
//
// Compute the binomial coefficient using the method of:
- // Y. Manolopoulos, "Binomial coefficient computation: recursion or iteration?",
+ // Y. Manolopoulos, "Binomial coefficient computation: recursion or iteration?",
// ACM SIGCSE Bulletin, 34(4):65-67, 2002.
//
long int r = 1;
@@ -383,7 +383,7 @@ Cmb **CombSet::next(int map[]) {
// index = e[i];
// // sets vector index number
// k = this->compound_set[index].first;
-// // combination number
+// // combination number
// n = this->compound_set[index].second;
// c[i] = new Cmb;
@@ -391,8 +391,8 @@ Cmb **CombSet::next(int map[]) {
// c[i]->elem = new int[this->size[k]];
// for (j = 0; j < this->size[k]; j++)
-// c[i]->elem[j] = (map == NULL) ?
-// this->sets[k][n][j] :
+// c[i]->elem[j] = (map == NULL) ?
+// this->sets[k][n][j] :
// map[this->sets[k][n][j]];
// }
diff --git a/src/cmb.h b/src/cmb.h
index 1551ff7..3742866 100644
--- a/src/cmb.h
+++ b/src/cmb.h
@@ -55,11 +55,11 @@ typedef struct cmb {
class CombSet {
//
- // Given these two variables, we will select N choose K combinations.
+ // Given these two variables, we will select N choose K combinations.
// This combination will be stored in sets, and we will then decrement K by 1
// and continue to generate sets.
//
- // Once we have generated all the combinations of a particular size, K, we
+ // Once we have generated all the combinations of a particular size, K, we
// will partition the minimum spanning tree by dropping combinations of edges
// from the graph. The selection of edges to drop is provided by the combinations
// generated first. Finally, each set of disconnected subgraphs makes for one
diff --git a/src/constants.h b/src/constants.h
index 1bd1f03..de1358a 100644
--- a/src/constants.h
+++ b/src/constants.h
@@ -53,11 +53,11 @@ const int libz_buffer_size = 1048576;
//
// Supported file types
//
-enum class FileT {unknown,
- sql, gzsql,
- fasta, gzfasta,
- fastq, gzfastq,
- bowtie, sam, bam, tsv,
- bustard, phase, fastphase, beagle};
+enum class FileT {unknown,
+ sql, gzsql,
+ fasta, gzfasta,
+ fastq, gzfastq,
+ bowtie, sam, bam, tsv,
+ bustard, phase, fastphase, beagle};
#endif
diff --git a/src/cstacks.cc b/src/cstacks.cc
index 010b7be..fd0fca1 100644
--- a/src/cstacks.cc
+++ b/src/cstacks.cc
@@ -49,11 +49,11 @@ int main (int argc, char* argv[]) {
uint sample_cnt = samples.size();
cerr << "cstacks paramters selected:\n"
- << " Loci matched based on " << (search_type == sequence ? "sequence identity" : "genomic location") << ".\n";
+ << " Loci matched based on " << (search_type == sequence ? "sequence identity" : "genomic location") << ".\n";
if (search_type == sequence)
cerr << " Number of mismatches allowed between stacks: " << ctag_dist << "\n";
cerr << " Gapped alignments: " << (gapped_alignments ? "enabled" : "disabled") << "\n"
- << "Constructing catalog from " << sample_cnt << " samples.\n";
+ << "Constructing catalog from " << sample_cnt << " samples.\n";
//
// Set the number of OpenMP parallel threads to execute.
@@ -70,23 +70,23 @@ int main (int argc, char* argv[]) {
int i;
if (catalog_path.length() > 0) {
- cerr << "Initializing existing catalog...\n";
- if (!initialize_existing_catalog(catalog_path, catalog)) {
- cerr << "Failed to initialize the catalog.\n";
- return 1;
- }
- i = 1;
+ cerr << "Initializing existing catalog...\n";
+ if (!initialize_existing_catalog(catalog_path, catalog)) {
+ cerr << "Failed to initialize the catalog.\n";
+ return 1;
+ }
+ i = 1;
} else {
- s = samples.front();
- samples.pop();
-
- cerr << "Initializing new catalog...\n";
- if (!initialize_new_catalog(s, catalog)) {
- cerr << "Failed to initialize the catalog.\n";
- return 1;
- }
- i = 2;
+ s = samples.front();
+ samples.pop();
+
+ cerr << "Initializing new catalog...\n";
+ if (!initialize_new_catalog(s, catalog)) {
+ cerr << "Failed to initialize the catalog.\n";
+ return 1;
+ }
+ i = 2;
}
//
@@ -94,49 +94,49 @@ int main (int argc, char* argv[]) {
//
map<string, int> cat_index;
if (search_type == genomic_loc) {
- cerr << "Building an index of the catalog.\n";
- update_catalog_index(catalog, cat_index);
+ cerr << "Building an index of the catalog.\n";
+ update_catalog_index(catalog, cat_index);
}
while (!samples.empty()) {
map<int, QLocus *> sample;
- cerr << "Processing sample " << s.second << " [" << i << " of " << sample_cnt << "]\n";
+ cerr << "Processing sample " << s.second << " [" << i << " of " << sample_cnt << "]\n";
- s = samples.front();
- samples.pop();
+ s = samples.front();
+ samples.pop();
- if (!load_loci(s.second, sample, false, false, compressed)) {
+ if (!load_loci(s.second, sample, false, false, compressed)) {
cerr << "Failed to load sample " << i << "\n";
continue;
}
- //
- // Assign the ID for this sample data.
- //
- s.first = sample.begin()->second->sample_id;
+ //
+ // Assign the ID for this sample data.
+ //
+ s.first = sample.begin()->second->sample_id;
- //dump_loci(sample);
+ //dump_loci(sample);
if (search_type == sequence) {
cerr << "Searching for sequence matches...\n";
find_kmer_matches_by_sequence(catalog, sample, ctag_dist);
- if (gapped_alignments) {
+ if (gapped_alignments) {
cerr << "Searching for gapped alignments...\n";
- search_for_gaps(catalog, sample, min_match_len, ctag_dist);
+ search_for_gaps(catalog, sample, min_match_len, ctag_dist);
}
- } else if (search_type == genomic_loc) {
+ } else if (search_type == genomic_loc) {
cerr << "Searching for matches by genomic location...\n";
find_matches_by_genomic_loc(cat_index, sample);
}
- cerr << "Merging matches into catalog...\n";
- uint mmatches = 0;
+ cerr << "Merging matches into catalog...\n";
+ uint mmatches = 0;
uint gmatches = 0;
uint umatches = 0;
uint nmatches = 0;
- merge_matches(catalog, sample, s, ctag_dist, nmatches, umatches, gmatches, mmatches);
+ merge_matches(catalog, sample, s, ctag_dist, nmatches, umatches, gmatches, mmatches);
cerr << " " << umatches << " loci were matched to a catalog locus.\n"
<< " " << gmatches << " loci were matched to a catalog locus using gapped alignments.\n"
<< " " << nmatches << " loci were newly added to the catalog.\n"
@@ -144,27 +144,34 @@ int main (int argc, char* argv[]) {
//
// Regenerate the alleles for the catalog tags after merging the new sample into the catalog.
- //
+ //
for (cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++) {
cat_it->second->populate_alleles();
cat_it->second->match_cnt = 0;
}
- if (search_type == genomic_loc) {
- cerr << " Updating catalog index...\n";
- update_catalog_index(catalog, cat_index);
- }
- i++;
+ if (search_type == genomic_loc) {
+ cerr << " Updating catalog index...\n";
+ update_catalog_index(catalog, cat_index);
+ }
+ i++;
- for (query_it = sample.begin(); query_it != sample.end(); query_it++)
- delete (*query_it).second;
- sample.clear();
+ for (query_it = sample.begin(); query_it != sample.end(); query_it++)
+ delete query_it->second;
+ sample.clear();
}
cerr << "Writing catalog to '" << out_path << "...";
write_catalog(catalog);
cerr << " done.\n";
+ //
+ // Free memory associated with the catalog.
+ //
+ for (cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++)
+ delete cat_it->second;
+ catalog.clear();
+
return 0;
}
@@ -173,17 +180,17 @@ int update_catalog_index(map<int, CLocus *> &catalog, map<string, int> &cat_inde
char id[id_len];
for (j = catalog.begin(); j != catalog.end(); j++) {
- snprintf(id, id_len - 1, "%s|%d|%c",
- j->second->loc.chr,
- j->second->loc.bp,
- j->second->loc.strand == strand_plus ? '+' : '-');
-
- if (cat_index.count(id) == 0) {
- cat_index[id] = j->first;
- } else {
- if (cat_index[id] != j->first)
- cerr << "Error: Catalog index mismatch, key: '" << id << "'.\n";
- }
+ snprintf(id, id_len - 1, "%s|%d|%c",
+ j->second->loc.chr,
+ j->second->loc.bp,
+ j->second->loc.strand == strand_plus ? '+' : '-');
+
+ if (cat_index.count(id) == 0) {
+ cat_index[id] = j->first;
+ } else {
+ if (cat_index[id] != j->first)
+ cerr << "Error: Catalog index mismatch, key: '" << id << "'.\n";
+ }
}
return 0;
@@ -195,9 +202,9 @@ characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
set<int> snp_cols;
uint i;
for (i = 0; i < catalog_tag->snps.size(); i++)
- snp_cols.insert(catalog_tag->snps[i]->col);
+ snp_cols.insert(catalog_tag->snps[i]->col);
for (i = 0; i < query_tag->snps.size(); i++)
- snp_cols.insert(query_tag->snps[i]->col);
+ snp_cols.insert(query_tag->snps[i]->col);
//
// For each mismatch found, create a SNP object
@@ -211,12 +218,12 @@ characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
i = 0;
while (c < c_end && q < q_end) {
- if (snp_cols.count(i) == 0 &&
- (*c != *q) && (*c != 'N' && *q != 'N')) {
+ if (snp_cols.count(i) == 0 &&
+ (*c != *q) && (*c != 'N' && *q != 'N')) {
// cerr << "Adding a new SNP at position " << c - c_beg << ", " << *c << "/" << *q << "\n";
SNP *s = new SNP;
- s->type = snp_type_het;
+ s->type = snp_type_het;
s->col = c - c_beg;
s->lratio = 0;
s->rank_1 = *c;
@@ -228,7 +235,7 @@ characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
catalog_tag->snps.push_back(s);
s = new SNP;
- s->type = snp_type_het;
+ s->type = snp_type_het;
s->col = q - q_beg;
s->lratio = 0;
s->rank_1 = *q;
@@ -236,17 +243,17 @@ characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
query_tag->snps.push_back(s);
}
- c++;
- q++;
- i++;
+ c++;
+ q++;
+ i++;
}
return 1;
}
-int
+int
merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int, string> &sample_file, int ctag_dist,
- uint &new_matches, uint &unique_matches, uint &gapped_matches, uint &multiple_matches)
+ uint &new_matches, uint &unique_matches, uint &gapped_matches, uint &multiple_matches)
{
map<int, QLocus *>::iterator i;
CLocus *ctag;
@@ -258,25 +265,25 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
GappedAln *aln = new GappedAln();
for (i = sample.begin(); i != sample.end(); i++) {
- qtag = i->second;
+ qtag = i->second;
//
- // If this stack didn't match an existing catalog stack, add this stack to the
+ // If this stack didn't match an existing catalog stack, add this stack to the
// catalog as a new stack.
//
- if (qtag->matches.size() == 0) {
- add_unique_tag(sample_file, catalog, qtag);
+ if (qtag->matches.size() == 0) {
+ add_unique_tag(sample_file, catalog, qtag);
new_matches++;
- continue;
- }
+ continue;
+ }
//
// Check for multiple matches. We will reduce the list of Match objects, which
// contain matches to multiple alleles for a single locus, to the smallest distance
// for a locus.
//
- map<int, uint> local_matches;
- for (uint k = 0; k < qtag->matches.size(); k++) {
+ map<int, uint> local_matches;
+ for (uint k = 0; k < qtag->matches.size(); k++) {
if (local_matches.count(qtag->matches[k]->cat_id) == 0)
local_matches[qtag->matches[k]->cat_id] = qtag->matches[k]->dist;
else if (qtag->matches[k]->dist < local_matches[qtag->matches[k]->cat_id])
@@ -299,29 +306,29 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
min_cat_id = j->first;
}
- //
- // Emit a warning if the query tag matches more than one tag in the catalog.
- //
- if (num_matches > 1) {
- multiple_matches++;
- if (report_mmatches) {
- cerr <<
- " Warning: sample " << sample_file.second << ", tag " << qtag->id <<
- ", matches more than one tag in the catalog and was excluded: ";
- for (map<int, uint>::iterator j = local_matches.begin(); j != local_matches.end(); j++)
- cerr << j->first << " ";
- cerr << "\n";
- }
- //
- // Don't record matches to multiple catalog entries unless instructed
- // to do so by the command line option.
- //
- if (!mult_matches) continue;
- }
+ //
+ // Emit a warning if the query tag matches more than one tag in the catalog.
+ //
+ if (num_matches > 1) {
+ multiple_matches++;
+ if (report_mmatches) {
+ cerr <<
+ " Warning: sample " << sample_file.second << ", tag " << qtag->id <<
+ ", matches more than one tag in the catalog and was excluded: ";
+ for (map<int, uint>::iterator j = local_matches.begin(); j != local_matches.end(); j++)
+ cerr << j->first << " ";
+ cerr << "\n";
+ }
+ //
+ // Don't record matches to multiple catalog entries unless instructed
+ // to do so by the command line option.
+ //
+ if (!mult_matches) continue;
+ }
ctag = catalog[min_cat_id];
- if (ctag == NULL)
+ if (ctag == NULL)
cerr << " Unable to locate catalog tag " << min_cat_id << "\n";
cigar_str = "";
@@ -370,6 +377,7 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
// Adjust the postition of any SNPs that were shifted down sequence due to a gap.
//
if (gapped_aln) {
+ cseq_len = parse_cigar(cigar_str.c_str(), cigar);
qseq = apply_cigar_to_seq(qtag->con, cigar);
adjust_snps_for_gaps(cigar, qtag);
@@ -378,6 +386,13 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
cseq = apply_cigar_to_seq(ctag->con, cigar);
adjust_snps_for_gaps(cigar, ctag);
+ if (qseq.length() != cseq.length())
+ cerr << "Sample ID: " << qtag->sample_id << "; Query ID: " << qtag->id << "; catalog ID: " << ctag->id << ";\n"
+ << "qloc: " << qtag->con << "\n"
+ << "qseq: " << qseq << "\n"
+ << "cloc: " << ctag->con << " [" << cigar_str << "]\n"
+ << "cseq: " << cseq << "\n";
+
//
// If the alignment modified the catalog locus, record it so we can re-align
// any other matching sequences from this sample.
@@ -399,38 +414,40 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
}
//
- // If mismatches are allowed between query and catalog tags, identify the
+ // If mismatches are allowed between query and catalog tags, identify the
// mismatches and convert them into SNP objects to be merged into the catalog tag.
//
if ((ctag_dist > 0 || search_type == genomic_loc) && !characterize_mismatch_snps(ctag, qtag))
- cerr
- << " Error characterizing mismatch SNPs "
- << sample_file.second << ", tag " << qtag->id
+ cerr
+ << " Error characterizing mismatch SNPs "
+ << sample_file.second << ", tag " << qtag->id
<< " with catalog tag " << ctag->id << "\n";
- //
- // Merge the SNPs and alleles from the sample into the catalog tag.
- //
- if (!ctag->merge_snps(qtag)) {
- cerr << "Error merging " << sample_file.second << ", tag " << qtag->id
- << " with catalog tag " << ctag->id << "\n";
- }
+ //
+ // Merge the SNPs and alleles from the sample into the catalog tag.
+ //
+ if (!ctag->merge_snps(qtag)) {
+ cerr << "Error merging " << sample_file.second << ", tag " << qtag->id
+ << " with catalog tag " << ctag->id << "\n";
+ }
//
// Add any new sequence information into the catalog consensus.
//
if (gapped_aln) {
- for (uint k = 0; k < ctag->len; k++)
+ for (uint k = 0; k < ctag->len && k < qtag->len; k++)
if (qtag->con[k] != 'N' && ctag->con[k] == 'N')
ctag->con[k] = qtag->con[k];
} else if (strlen(ctag->con) < strlen(qtag->con)) {
- ctag->add_consensus(qtag->con);
- }
+ ctag->add_consensus(qtag->con);
+ }
- ctag->sources.push_back(make_pair(sample_file.first, qtag->id));
+ ctag->sources.push_back(make_pair(sample_file.first, qtag->id));
}
+ delete aln;
+
return 0;
}
@@ -457,18 +474,18 @@ int add_unique_tag(pair<int, string> &sample_file, map<int, CLocus *> &catalog,
// cerr << "Adding sample: " << qloc->id << " to the catalog as ID: " << c->id << "\n";
for (i = qloc->snps.begin(); i != qloc->snps.end(); i++) {
- SNP *snp = new SNP;
- snp->col = (*i)->col;
- snp->type = (*i)->type;
- snp->lratio = (*i)->lratio;
- snp->rank_1 = (*i)->rank_1;
- snp->rank_2 = (*i)->rank_2;
-
- c->snps.push_back(snp);
+ SNP *snp = new SNP;
+ snp->col = (*i)->col;
+ snp->type = (*i)->type;
+ snp->lratio = (*i)->lratio;
+ snp->rank_1 = (*i)->rank_1;
+ snp->rank_2 = (*i)->rank_2;
+
+ c->snps.push_back(snp);
}
for (j = qloc->alleles.begin(); j != qloc->alleles.end(); j++) {
- c->alleles[j->first] = j->second;
+ c->alleles[j->first] = j->second;
}
c->populate_alleles();
@@ -492,8 +509,8 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
vector<int> keys;
- for (it = sample.begin(); it != sample.end(); it++)
- keys.push_back(it->first);
+ for (it = sample.begin(); it != sample.end(); it++)
+ keys.push_back(it->first);
//
// Calculate the number of k-mers we will generate. If kmer_len == 0,
@@ -509,8 +526,8 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
int min_hits = calc_min_kmer_matches(kmer_len, ctag_dist, con_len, set_kmer_len ? true : false);
cerr << " Distance allowed between stacks: " << ctag_dist
- << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
- << min_hits << " k-mer hits required.\n";
+ << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
+ << min_hits << " k-mer hits required.\n";
// clock_t time_1, time_2, time_3, time_4;
// double per_locus = 0.0;
@@ -518,22 +535,22 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
// time_1 = clock();
populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);
// time_2 = clock();
-
+
cerr << " " << catalog.size() << " loci in the catalog, " << kmer_map.size() << " kmers in the catalog hash.\n";
#pragma omp parallel private(tag_1, tag_2, allele)
{
- KmerHashMap::iterator h;
+ KmerHashMap::iterator h;
vector<char *> kmers;
set<string> uniq_kmers;
- vector<int> hits;
+ vector<int> hits;
vector<pair<int, int> > ordered_hits;
- uint hit_cnt, index, prev_id, allele_id, hits_size;
+ uint hit_cnt, index, prev_id, allele_id, hits_size;
int d;
- pair<allele_type, int> cat_hit;
-
- initialize_kmers(kmer_len, num_kmers, kmers);
-
+ pair<allele_type, int> cat_hit;
+
+ initialize_kmers(kmer_len, num_kmers, kmers);
+
#pragma omp for
for (uint i = 0; i < keys.size(); i++) {
tag_1 = sample[keys[i]];
@@ -541,77 +558,77 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
// time_3 = clock();
for (allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
-
+
generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);
//
- // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
- // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
- //
- uniq_kmers.clear();
+ // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
+ // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
+ //
+ uniq_kmers.clear();
for (int j = 0; j < num_kmers; j++)
- uniq_kmers.insert(kmers[j]);
+ uniq_kmers.insert(kmers[j]);
- hits.clear();
- ordered_hits.clear();
+ hits.clear();
+ ordered_hits.clear();
- //
- // Lookup the occurances of each k-mer in the kmer_map
- //
- for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
+ //
+ // Lookup the occurances of each k-mer in the kmer_map
+ //
+ for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
- h = kmer_map.find(j->c_str());
+ h = kmer_map.find(j->c_str());
- if (h != kmer_map.end())
- for (uint k = 0; k < h->second.size(); k++)
- hits.push_back(h->second[k]);
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits.push_back(h->second[k]);
}
- //
- // Sort the vector of indexes; provides the number of hits to each allele/locus
- // and orders them largest to smallest.
- //
- sort(hits.begin(), hits.end());
+ //
+ // Sort the vector of indexes; provides the number of hits to each allele/locus
+ // and orders them largest to smallest.
+ //
+ sort(hits.begin(), hits.end());
- //
- // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
- //
- hits_size = hits.size();
+ //
+ // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
+ //
+ hits_size = hits.size();
if (hits_size == 0)
continue;
- prev_id = hits[0];
- index = 0;
+ prev_id = hits[0];
+ index = 0;
- do {
- hit_cnt = 0;
- allele_id = prev_id;
+ do {
+ hit_cnt = 0;
+ allele_id = prev_id;
- while ((uint)hits[index] == prev_id) {
- hit_cnt++;
- index++;
- }
+ while (index < hits_size && (uint) hits[index] == prev_id) {
+ hit_cnt++;
+ index++;
+ }
- if (index < hits_size)
- prev_id = hits[index];
+ if (index < hits_size)
+ prev_id = hits[index];
- if (hit_cnt >= (uint)min_hits)
- ordered_hits.push_back(make_pair(allele_id, hit_cnt));
+ if (hit_cnt >= (uint) min_hits)
+ ordered_hits.push_back(make_pair(allele_id, hit_cnt));
- } while (index < hits_size);
+ } while (index < hits_size);
- for (uint j = 0; j < ordered_hits.size(); j++) {
- cat_hit = allele_map.at(ordered_hits[j].first);
- hit_cnt = ordered_hits[j].second;
+ for (uint j = 0; j < ordered_hits.size(); j++) {
+ cat_hit = allele_map.at(ordered_hits[j].first);
+ hit_cnt = ordered_hits[j].second;
tag_2 = catalog[cat_hit.second];
d = dist(allele->second.c_str(), tag_2, cat_hit.first);
if (d < 0)
- cerr <<
- "Unknown error calculating distance between " <<
+ cerr <<
+ "Unknown error calculating distance between " <<
tag_1->id << " and " << tag_2->id << "; query allele: " << allele->first << "\n";
//
@@ -639,7 +656,7 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
// cerr << "Time to kmerize catalog: " << time_2 - time_1 << "\n"
// << "Average time per locus: " << per_locus / (double) keys.size() << "\n";
-
+
free_kmer_hash(kmer_map, kmer_map_keys);
return 0;
@@ -663,8 +680,8 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
// our map to a vector of integer keys.
//
vector<int> keys;
- for (it = sample.begin(); it != sample.end(); it++)
- keys.push_back(it->first);
+ for (it = sample.begin(); it != sample.end(); it++)
+ keys.push_back(it->first);
//
// Calculate the number of k-mers we will generate. If kmer_len == 0,
@@ -687,26 +704,26 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
// time_1 = clock();
populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);
// time_2 = clock();
-
+
#pragma omp parallel private(tag_1, tag_2)
{
- KmerHashMap::iterator h;
- AlignRes aln_res;
+ KmerHashMap::iterator h;
+ AlignRes aln_res;
vector<char *> kmers;
- set<string> uniq_kmers;
- vector<int> hits;
- vector<pair<int, int> > ordered_hits;
- uint hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
+ set<string> uniq_kmers;
+ vector<int> hits;
+ vector<pair<int, int> > ordered_hits;
+ uint hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
int d;
vector<pair<char, uint> > cigar;
- pair<allele_type, int> cat_hit;
- string cat_seq;
+ pair<allele_type, int> cat_hit;
+ string cat_seq;
GappedAln *aln = new GappedAln();
initialize_kmers(kmer_len, num_kmers, kmers);
- #pragma omp for schedule(dynamic)
+ #pragma omp for schedule(dynamic)
for (uint i = 0; i < keys.size(); i++) {
tag_1 = sample[keys[i]];
@@ -720,102 +737,102 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
for (vector<pair<allele_type, string> >::iterator allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
- generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);
+ generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);
//
- // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
- // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
- //
- uniq_kmers.clear();
+ // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
+ // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
+ //
+ uniq_kmers.clear();
for (int j = 0; j < num_kmers; j++)
- uniq_kmers.insert(kmers[j]);
+ uniq_kmers.insert(kmers[j]);
hits.clear();
- ordered_hits.clear();
+ ordered_hits.clear();
//
- // Lookup the occurances of each k-mer in the kmer_map
- //
- for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
+ // Lookup the occurances of each k-mer in the kmer_map
+ //
+ for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
- h = kmer_map.find(j->c_str());
+ h = kmer_map.find(j->c_str());
- if (h != kmer_map.end())
- for (uint k = 0; k < h->second.size(); k++)
- hits.push_back(h->second[k]);
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits.push_back(h->second[k]);
}
- //
- // Sort the vector of indexes; provides the number of hits to each allele/locus
- // and orders them largest to smallest.
- //
- sort(hits.begin(), hits.end());
+ //
+ // Sort the vector of indexes; provides the number of hits to each allele/locus
+ // and orders them largest to smallest.
+ //
+ sort(hits.begin(), hits.end());
- //
- // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
- //
- hits_size = hits.size();
+ //
+ // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
+ //
+ hits_size = hits.size();
if (hits_size == 0)
continue;
- prev_id = hits[0];
- index = 0;
+ prev_id = hits[0];
+ index = 0;
- do {
- hit_cnt = 0;
- allele_id = prev_id;
+ do {
+ hit_cnt = 0;
+ allele_id = prev_id;
- while ((uint)hits[index] == prev_id) {
- hit_cnt++;
- index++;
- }
+ while (index < hits_size && (uint) hits[index] == prev_id) {
+ hit_cnt++;
+ index++;
+ }
- if (index < hits_size)
- prev_id = hits[index];
+ if (index < hits_size)
+ prev_id = hits[index];
- if (hit_cnt >= (uint)min_hits)
- ordered_hits.push_back(make_pair(allele_id, hit_cnt));
+ if (hit_cnt >= (uint) min_hits)
+ ordered_hits.push_back(make_pair(allele_id, hit_cnt));
- } while (index < hits_size);
+ } while (index < hits_size);
if (ordered_hits.size() == 0)
continue;
-
- //
- // Process the hits from most kmer hits to least kmer hits.
- //
- sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);
-
- //
- // Only try to align the sequences with the most kmers in common.
- //
- top_hit = ordered_hits[0].second;
+
+ //
+ // Process the hits from most kmer hits to least kmer hits.
+ //
+ sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);
+
+ //
+ // Only try to align the sequences with the most kmers in common.
+ //
+ top_hit = ordered_hits[0].second;
stop = 1;
- for (uint j = 1; j < ordered_hits.size(); j++)
- if ((uint)ordered_hits[j].second < top_hit) {
- stop = j;
- break;
- }
-
- for (uint j = 0; j < stop; j++) {
- cat_hit = allele_map.at(ordered_hits[j].first);
- hit_cnt = ordered_hits[j].second;
-
- tag_2 = catalog[cat_hit.second];
-
- cat_seq = "";
- for (uint k = 0; k < tag_2->strings.size(); k++)
- if (tag_2->strings[k].first == cat_hit.first) {
- cat_seq = tag_2->strings[k].second;
- break;
- }
-
- aln->init(tag_2->len, tag_1->len);
-
- if (aln->align(cat_seq, allele->second)) {
- cigar.clear();
- aln->parse_cigar(cigar);
+ for (uint j = 1; j < ordered_hits.size(); j++)
+ if ((uint)ordered_hits[j].second < top_hit) {
+ stop = j;
+ break;
+ }
+
+ for (uint j = 0; j < stop; j++) {
+ cat_hit = allele_map.at(ordered_hits[j].first);
+ hit_cnt = ordered_hits[j].second;
+
+ tag_2 = catalog[cat_hit.second];
+
+ cat_seq = "";
+ for (uint k = 0; k < tag_2->strings.size(); k++)
+ if (tag_2->strings[k].first == cat_hit.first) {
+ cat_seq = tag_2->strings[k].second;
+ break;
+ }
+
+ aln->init(tag_2->len, tag_1->len);
+
+ if (aln->align(cat_seq, allele->second)) {
+ cigar.clear();
+ aln->parse_cigar(cigar);
aln_res = aln->result();
d = dist(cat_seq.c_str(), allele->second.c_str(), cigar);
@@ -832,8 +849,8 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
tag_1->add_match(tag_2->id, cat_hit.first, allele->first, d, invert_cigar(aln_res.cigar));
}
}
- }
- }
+ }
+ }
}
// time_4 = clock();
@@ -874,37 +891,37 @@ int find_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *> &sa
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
vector<int> keys;
- for (i = sample.begin(); i != sample.end(); i++)
- keys.push_back(i->first);
+ for (i = sample.begin(); i != sample.end(); i++)
+ keys.push_back(i->first);
#pragma omp parallel private(i, j, k)
{
- #pragma omp for schedule(dynamic)
- for (k = 0; k < (int) keys.size(); k++) {
-
- i = sample.find(keys[k]);
-
- vector<pair<allele_type, string> >::iterator r, s;
-
- //
- // Iterate through the possible SAMPLE alleles
- //
- for (r = i->second->strings.begin(); r != i->second->strings.end(); r++) {
-
- for (j = catalog.begin(); j != catalog.end(); j++) {
- //
- // Iterate through the possible CATALOG alleles
- //
- for (s = j->second->strings.begin(); s != j->second->strings.end(); s++) {
- if (r->second == s->second) {
- //cerr << "Found a match between " << i->first << " (" << r->first << ") and " << j->first << " (" << s->first << ")\n";
-
- i->second->add_match(j->second->id, s->first, r->first, 0);
- }
- }
- }
- }
- }
+ #pragma omp for schedule(dynamic)
+ for (k = 0; k < (int) keys.size(); k++) {
+
+ i = sample.find(keys[k]);
+
+ vector<pair<allele_type, string> >::iterator r, s;
+
+ //
+ // Iterate through the possible SAMPLE alleles
+ //
+ for (r = i->second->strings.begin(); r != i->second->strings.end(); r++) {
+
+ for (j = catalog.begin(); j != catalog.end(); j++) {
+ //
+ // Iterate through the possible CATALOG alleles
+ //
+ for (s = j->second->strings.begin(); s != j->second->strings.end(); s++) {
+ if (r->second == s->second) {
+ //cerr << "Found a match between " << i->first << " (" << r->first << ") and " << j->first << " (" << s->first << ")\n";
+
+ i->second->add_match(j->second->id, s->first, r->first, 0);
+ }
+ }
+ }
+ }
+ }
}
return 0;
@@ -919,25 +936,25 @@ int find_matches_by_genomic_loc(map<string, int> &cat_index, map<int, QLocus *>
// our map to a vector of integer keys.
//
vector<int> keys;
- for (i = sample.begin(); i != sample.end(); i++)
- keys.push_back(i->first);
+ for (i = sample.begin(); i != sample.end(); i++)
+ keys.push_back(i->first);
#pragma omp parallel private(i, j)
{
- char id[id_len];
+ char id[id_len];
#pragma omp for
- for (int k = 0; k < (int) keys.size(); k++) {
+ for (int k = 0; k < (int) keys.size(); k++) {
- i = sample.find(keys[k]);
+ i = sample.find(keys[k]);
- snprintf(id, id_len - 1, "%s|%d|%c",
- i->second->loc.chr,
- i->second->loc.bp,
- i->second->loc.strand == strand_plus ? '+' : '-');
+ snprintf(id, id_len - 1, "%s|%d|%c",
+ i->second->loc.chr,
+ i->second->loc.bp,
+ i->second->loc.strand == strand_plus ? '+' : '-');
- if (cat_index.count(id) > 0)
- i->second->add_match(cat_index[id], "", "", 0);
+ if (cat_index.count(id) > 0)
+ i->second->add_match(cat_index[id], "", "", 0);
}
}
@@ -954,7 +971,7 @@ int write_catalog(map<int, CLocus *> &catalog) {
//
// Parse the input file names to create the output file
//
- stringstream prefix;
+ stringstream prefix;
prefix << out_path << "batch_" << batch_id;
string tag_file = prefix.str() + ".catalog.tags.tsv";
@@ -963,8 +980,8 @@ int write_catalog(map<int, CLocus *> &catalog) {
if (gzip) {
tag_file += ".gz";
- snp_file += ".gz";
- all_file += ".gz";
+ snp_file += ".gz";
+ all_file += ".gz";
}
//
@@ -974,45 +991,45 @@ int write_catalog(map<int, CLocus *> &catalog) {
ofstream tags, snps, alle;
if (gzip) {
gz_tags = gzopen(tag_file.c_str(), "wb");
- if (!gz_tags) {
- cerr << "Error: Unable to open gzipped catalog tag file '" << tag_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ if (!gz_tags) {
+ cerr << "Error: Unable to open gzipped catalog tag file '" << tag_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_tags, libz_buffer_size);
- #endif
- gz_snps = gzopen(snp_file.c_str(), "wb");
- if (!gz_snps) {
- cerr << "Error: Unable to open gzipped catalog snps file '" << snp_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gzbuffer(gz_tags, libz_buffer_size);
+ #endif
+ gz_snps = gzopen(snp_file.c_str(), "wb");
+ if (!gz_snps) {
+ cerr << "Error: Unable to open gzipped catalog snps file '" << snp_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_snps, libz_buffer_size);
- #endif
- gz_alle = gzopen(all_file.c_str(), "wb");
- if (!gz_alle) {
- cerr << "Error: Unable to open gzipped catalog alleles file '" << all_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gzbuffer(gz_snps, libz_buffer_size);
+ #endif
+ gz_alle = gzopen(all_file.c_str(), "wb");
+ if (!gz_alle) {
+ cerr << "Error: Unable to open gzipped catalog alleles file '" << all_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_alle, libz_buffer_size);
- #endif
+ gzbuffer(gz_alle, libz_buffer_size);
+ #endif
} else {
- tags.open(tag_file.c_str());
- if (tags.fail()) {
- cerr << "Error: Unable to open catalog tag file for writing.\n";
- exit(1);
- }
- snps.open(snp_file.c_str());
- if (snps.fail()) {
- cerr << "Error: Unable to open catalog SNPs file for writing.\n";
- exit(1);
- }
- alle.open(all_file.c_str());
- if (alle.fail()) {
- cerr << "Error: Unable to open catalog alleles file for writing.\n";
- exit(1);
- }
+ tags.open(tag_file.c_str());
+ if (tags.fail()) {
+ cerr << "Error: Unable to open catalog tag file for writing.\n";
+ exit(1);
+ }
+ snps.open(snp_file.c_str());
+ if (snps.fail()) {
+ cerr << "Error: Unable to open catalog SNPs file for writing.\n";
+ exit(1);
+ }
+ alle.open(all_file.c_str());
+ if (alle.fail()) {
+ cerr << "Error: Unable to open catalog alleles file for writing.\n";
+ exit(1);
+ }
}
//
@@ -1027,34 +1044,34 @@ int write_catalog(map<int, CLocus *> &catalog) {
time(&rawtime);
timeinfo = localtime(&rawtime);
strftime(date, 32, "%F %T", timeinfo);
- log << "# cstacks version " << VERSION << "; catalog generated on " << date << "\n";
+ log << "# cstacks version " << VERSION << "; catalog generated on " << date << "\n";
if (gzip) {
gzputs(gz_tags, log.str().c_str());
gzputs(gz_snps, log.str().c_str());
gzputs(gz_alle, log.str().c_str());
} else {
tags << log.str();
- snps << log.str();
- alle << log.str();
+ snps << log.str();
+ alle << log.str();
}
for (i = catalog.begin(); i != catalog.end(); i++) {
- tag = i->second;
+ tag = i->second;
- if (gzip)
- write_gzip_output(tag, gz_tags, gz_snps, gz_alle);
- else
- write_simple_output(tag, tags, snps, alle);
+ if (gzip)
+ write_gzip_output(tag, gz_tags, gz_snps, gz_alle);
+ else
+ write_simple_output(tag, tags, snps, alle);
}
if (gzip) {
- gzclose(gz_tags);
- gzclose(gz_snps);
- gzclose(gz_alle);
+ gzclose(gz_tags);
+ gzclose(gz_snps);
+ gzclose(gz_alle);
} else {
- tags.close();
- snps.close();
- alle.close();
+ tags.close();
+ snps.close();
+ alle.close();
}
return 0;
@@ -1067,52 +1084,52 @@ int merge_allele(Locus *locus, SNP *snp) {
SNP *lsnp;
for (i = locus->snps.begin(); i != locus->snps.end(); i++)
- columns[(*i)->col] = make_pair("sample", *i);
+ columns[(*i)->col] = make_pair("sample", *i);
if (columns.count(snp->col)) {
- lsnp = columns[snp->col].second;
-
- //
- // If this is a new allele for this nucleotide, add it to the catalog SNP.
- //
- bool rank_1_exists = false;
- bool rank_2_exists = false;
-
- if (snp->rank_1 == lsnp->rank_1 ||
- snp->rank_1 == lsnp->rank_2 ||
- snp->rank_1 == lsnp->rank_3 ||
- snp->rank_1 == lsnp->rank_4) {
- rank_1_exists = true;
- }
- if (snp->rank_2 == lsnp->rank_1 ||
- snp->rank_2 == lsnp->rank_2 ||
- snp->rank_2 == lsnp->rank_3 ||
- snp->rank_2 == lsnp->rank_4) {
- rank_2_exists = true;
- }
-
- if (rank_1_exists == false) {
- if (lsnp->rank_3 == 0)
- lsnp->rank_3 = snp->rank_1;
- else
- lsnp->rank_4 = snp->rank_1;
- }
- if (rank_2_exists == false) {
- if (lsnp->rank_3 == 0)
- lsnp->rank_3 = snp->rank_2;
- else
- lsnp->rank_4 = snp->rank_2;
- }
-
- columns[snp->col] = make_pair("both", lsnp);
+ lsnp = columns[snp->col].second;
+
+ //
+ // If this is a new allele for this nucleotide, add it to the catalog SNP.
+ //
+ bool rank_1_exists = false;
+ bool rank_2_exists = false;
+
+ if (snp->rank_1 == lsnp->rank_1 ||
+ snp->rank_1 == lsnp->rank_2 ||
+ snp->rank_1 == lsnp->rank_3 ||
+ snp->rank_1 == lsnp->rank_4) {
+ rank_1_exists = true;
+ }
+ if (snp->rank_2 == lsnp->rank_1 ||
+ snp->rank_2 == lsnp->rank_2 ||
+ snp->rank_2 == lsnp->rank_3 ||
+ snp->rank_2 == lsnp->rank_4) {
+ rank_2_exists = true;
+ }
+
+ if (rank_1_exists == false) {
+ if (lsnp->rank_3 == 0)
+ lsnp->rank_3 = snp->rank_1;
+ else
+ lsnp->rank_4 = snp->rank_1;
+ }
+ if (rank_2_exists == false) {
+ if (lsnp->rank_3 == 0)
+ lsnp->rank_3 = snp->rank_2;
+ else
+ lsnp->rank_4 = snp->rank_2;
+ }
+
+ columns[snp->col] = make_pair("both", lsnp);
} else {
- columns[snp->col] = make_pair("merge", snp);
+ columns[snp->col] = make_pair("merge", snp);
}
vector<pair<string, SNP *> > merged_snps;
- for (c = columns.begin(); c != columns.end(); c++)
- merged_snps.push_back((*c).second);
+ for (c = columns.begin(); c != columns.end(); c++)
+ merged_snps.push_back((*c).second);
//
// Sort the SNPs by column
@@ -1120,7 +1137,7 @@ int merge_allele(Locus *locus, SNP *snp) {
sort(merged_snps.begin(), merged_snps.end(), compare_pair_snp);
//
- // Modify any existing alleles to account for this new SNP. If there are not any alleles,
+ // Modify any existing alleles to account for this new SNP. If there are not any alleles,
// create new ones.
//
stringstream sallele;
@@ -1134,38 +1151,38 @@ int merge_allele(Locus *locus, SNP *snp) {
}
map<string, int>::iterator j;
- vector<pair<string, SNP *> >::iterator k;
+ vector<pair<string, SNP *> >::iterator k;
for (j = locus->alleles.begin(); j != locus->alleles.end(); j++) {
- allele = j->first;
- new_allele = "";
- pos = 0;
+ allele = j->first;
+ new_allele = "";
+ pos = 0;
// cerr << "Allele length: " << allele.size() << "\n";
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- //
- // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
- // sequence to account for it in the allele string.
- //
- if ((*k).first == "merge") {
- new_allele += locus->con[(*k).second->col];
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ //
+ // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
+ // sequence to account for it in the allele string.
+ //
+ if ((*k).first == "merge") {
+ new_allele += locus->con[(*k).second->col];
// cerr << " Adding char '" << locus->con[k->second->col] << "' from consensus position " << (*k).second->col << "\n";
- } else {
- new_allele += allele[pos];
+ } else {
+ new_allele += allele[pos];
// cerr << " Adding char '" << allele[pos] << "' from allele position " << pos << "\n";
- pos++;
- }
- }
+ pos++;
+ }
+ }
- merged_alleles.insert(new_allele);
+ merged_alleles.insert(new_allele);
}
set<string>::iterator s;
locus->alleles.clear();
for (s = merged_alleles.begin(); s != merged_alleles.end(); s++) {
- locus->alleles[*s] = 0;
+ locus->alleles[*s] = 0;
}
return 1;
@@ -1184,55 +1201,55 @@ int CLocus::merge_snps(QLocus *matched_tag) {
SNP *csnp;
for (i = this->snps.begin(); i != this->snps.end(); i++)
- columns[(*i)->col] = make_pair("catalog", *i);
+ columns[(*i)->col] = make_pair("catalog", *i);
for (i = matched_tag->snps.begin(); i != matched_tag->snps.end(); i++) {
- //
- // Is this column already represented from the previous sample?
- //
- if (columns.count((*i)->col)) {
- csnp = columns[(*i)->col].second;
-
- //
- // If this is a new allele for this nucleotide, add it to the catalog SNP.
- //
- bool rank_1_exists = false;
- bool rank_2_exists = false;
-
- if ((*i)->rank_1 == csnp->rank_1 ||
- (*i)->rank_1 == csnp->rank_2 ||
- (*i)->rank_1 == csnp->rank_3 ||
- (*i)->rank_1 == csnp->rank_4) {
- rank_1_exists = true;
- }
- if ((*i)->rank_2 == csnp->rank_1 ||
- (*i)->rank_2 == csnp->rank_2 ||
- (*i)->rank_2 == csnp->rank_3 ||
- (*i)->rank_2 == csnp->rank_4) {
- rank_2_exists = true;
- }
-
- if (rank_1_exists == false) {
- if (csnp->rank_3 == 0)
- csnp->rank_3 = (*i)->rank_1;
- else
- csnp->rank_4 = (*i)->rank_1;
- }
- if (rank_2_exists == false) {
- if (csnp->rank_3 == 0)
- csnp->rank_3 = (*i)->rank_2;
- else
- csnp->rank_4 = (*i)->rank_2;
- }
-
- columns[(*i)->col] = make_pair("both", csnp);
- } else {
- columns[(*i)->col] = make_pair("sample", *i);
- }
+ //
+ // Is this column already represented from the previous sample?
+ //
+ if (columns.count((*i)->col)) {
+ csnp = columns[(*i)->col].second;
+
+ //
+ // If this is a new allele for this nucleotide, add it to the catalog SNP.
+ //
+ bool rank_1_exists = false;
+ bool rank_2_exists = false;
+
+ if ((*i)->rank_1 == csnp->rank_1 ||
+ (*i)->rank_1 == csnp->rank_2 ||
+ (*i)->rank_1 == csnp->rank_3 ||
+ (*i)->rank_1 == csnp->rank_4) {
+ rank_1_exists = true;
+ }
+ if ((*i)->rank_2 == csnp->rank_1 ||
+ (*i)->rank_2 == csnp->rank_2 ||
+ (*i)->rank_2 == csnp->rank_3 ||
+ (*i)->rank_2 == csnp->rank_4) {
+ rank_2_exists = true;
+ }
+
+ if (rank_1_exists == false) {
+ if (csnp->rank_3 == 0)
+ csnp->rank_3 = (*i)->rank_1;
+ else
+ csnp->rank_4 = (*i)->rank_1;
+ }
+ if (rank_2_exists == false) {
+ if (csnp->rank_3 == 0)
+ csnp->rank_3 = (*i)->rank_2;
+ else
+ csnp->rank_4 = (*i)->rank_2;
+ }
+
+ columns[(*i)->col] = make_pair("both", csnp);
+ } else {
+ columns[(*i)->col] = make_pair("sample", *i);
+ }
}
- for (c = columns.begin(); c != columns.end(); c++)
- merged_snps.push_back((*c).second);
+ for (c = columns.begin(); c != columns.end(); c++)
+ merged_snps.push_back((*c).second);
//
// Sort the SNPs by column
@@ -1247,71 +1264,71 @@ int CLocus::merge_snps(QLocus *matched_tag) {
int pos;
if (this->alleles.size() == 0) {
- char c;
- new_allele = "";
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- csnp = k->second;
- c = this->con[k->second->col];
-
- new_allele += (csnp->col > this->len - 1) ? 'N' : c;
-
- if (csnp->col > this->len - 1) continue;
-
- if (c != csnp->rank_1 &&
- c != csnp->rank_2 &&
- c != csnp->rank_3 &&
- c != csnp->rank_4) {
-
- if (csnp->rank_3 == 0)
- csnp->rank_3 = c;
- else
- csnp->rank_4 = c;
- }
- }
-
- if (new_allele.length() > 0)
- merged_alleles.insert(new_allele);
+ char c;
+ new_allele = "";
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ csnp = k->second;
+ c = this->con[k->second->col];
+
+ new_allele += (csnp->col > this->len - 1) ? 'N' : c;
+
+ if (csnp->col > this->len - 1) continue;
+
+ if (c != csnp->rank_1 &&
+ c != csnp->rank_2 &&
+ c != csnp->rank_3 &&
+ c != csnp->rank_4) {
+
+ if (csnp->rank_3 == 0)
+ csnp->rank_3 = c;
+ else
+ csnp->rank_4 = c;
+ }
+ }
+
+ if (new_allele.length() > 0)
+ merged_alleles.insert(new_allele);
}
//
// Merge the alleles accounting for any SNPs added from either of the two samples.
//
for (j = this->alleles.begin(); j != this->alleles.end(); j++) {
- allele = j->first;
- new_allele = "";
- pos = 0;
-
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- //
- // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
- // sequence to account for it in the allele string.
- //
- if (k->first == "sample") {
- new_allele += k->second->col > this->len - 1 ? 'N' : this->con[k->second->col];
- } else {
+ allele = j->first;
+ new_allele = "";
+ pos = 0;
+
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ //
+ // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
+ // sequence to account for it in the allele string.
+ //
+ if (k->first == "sample") {
+ new_allele += k->second->col > this->len - 1 ? 'N' : this->con[k->second->col];
+ } else {
new_allele += allele[pos];
- pos++;
- }
- }
+ pos++;
+ }
+ }
- merged_alleles.insert(new_allele);
+ merged_alleles.insert(new_allele);
}
for (j = matched_tag->alleles.begin(); j != matched_tag->alleles.end(); j++) {
- allele = j->first;
- new_allele = "";
- pos = 0;
-
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- if (k->first == "catalog") {
- new_allele += k->second->col > matched_tag->len - 1 ? 'N' : matched_tag->con[k->second->col];
- } else {
- new_allele += allele[pos];
- pos++;
- }
- }
-
- merged_alleles.insert(new_allele);
+ allele = j->first;
+ new_allele = "";
+ pos = 0;
+
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ if (k->first == "catalog") {
+ new_allele += k->second->col > matched_tag->len - 1 ? 'N' : matched_tag->con[k->second->col];
+ } else {
+ new_allele += allele[pos];
+ pos++;
+ }
+ }
+
+ merged_alleles.insert(new_allele);
}
//
@@ -1320,30 +1337,30 @@ int CLocus::merge_snps(QLocus *matched_tag) {
// objects contain all the nucleoties.
//
if (matched_tag->alleles.size() == 0) {
- char c;
- new_allele = "";
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- csnp = k->second;
- c = matched_tag->con[k->second->col];
-
- new_allele += (csnp->col > matched_tag->len - 1) ? 'N' : c;
-
- if (csnp->col > matched_tag->len - 1) continue;
-
- if (c != csnp->rank_1 &&
- c != csnp->rank_2 &&
- c != csnp->rank_3 &&
- c != csnp->rank_4) {
-
- if (csnp->rank_3 == 0)
- csnp->rank_3 = c;
- else
- csnp->rank_4 = c;
- }
- }
-
- if (new_allele.length() > 0)
- merged_alleles.insert(new_allele);
+ char c;
+ new_allele = "";
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ csnp = k->second;
+ c = matched_tag->con[k->second->col];
+
+ new_allele += (csnp->col > matched_tag->len - 1) ? 'N' : c;
+
+ if (csnp->col > matched_tag->len - 1) continue;
+
+ if (c != csnp->rank_1 &&
+ c != csnp->rank_2 &&
+ c != csnp->rank_3 &&
+ c != csnp->rank_4) {
+
+ if (csnp->rank_3 == 0)
+ csnp->rank_3 = c;
+ else
+ csnp->rank_4 = c;
+ }
+ }
+
+ if (new_allele.length() > 0)
+ merged_alleles.insert(new_allele);
}
// //
@@ -1359,31 +1376,31 @@ int CLocus::merge_snps(QLocus *matched_tag) {
this->snps.clear();
for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- SNP *snp = new SNP;
- snp->col = (*k).second->col;
- snp->type = (*k).second->type;
- snp->lratio = 0.0;
- snp->rank_1 = (*k).second->rank_1;
- snp->rank_2 = (*k).second->rank_2;
- snp->rank_3 = (*k).second->rank_3;
- snp->rank_4 = (*k).second->rank_4;
-
- this->snps.push_back(snp);
-
- if (k->first == "catalog" || k->first == "both")
- delete k->second;
+ SNP *snp = new SNP;
+ snp->col = (*k).second->col;
+ snp->type = (*k).second->type;
+ snp->lratio = 0.0;
+ snp->rank_1 = (*k).second->rank_1;
+ snp->rank_2 = (*k).second->rank_2;
+ snp->rank_3 = (*k).second->rank_3;
+ snp->rank_4 = (*k).second->rank_4;
+
+ this->snps.push_back(snp);
+
+ if (k->first == "catalog" || k->first == "both")
+ delete k->second;
}
this->alleles.clear();
for (s = merged_alleles.begin(); s != merged_alleles.end(); s++) {
- this->alleles[*s] = 0;
+ this->alleles[*s] = 0;
}
return 1;
}
-int
-CLocus::reduce_alleles(set<string> &alleles)
+int
+CLocus::reduce_alleles(set<string> &alleles)
{
set<string>::iterator it;
uint len, max_len, match, ncnt;
@@ -1391,83 +1408,83 @@ CLocus::reduce_alleles(set<string> &alleles)
max_len = 0;
for (it = alleles.begin(); it != alleles.end(); it++) {
- max_len = it->length() > max_len ? it->length() : max_len;
- haplotypes.push_back(*it);
+ max_len = it->length() > max_len ? it->length() : max_len;
+ haplotypes.push_back(*it);
}
len = alleles.size();
alleles.clear();
for (uint i = 0; i < len; i++) {
- //cerr << "Looking at haplotype[" << i << "]: " << haplotypes[i] << "\n";
- //
- // We will only look at strings that contain Ns.
- //
- if (haplotypes[i].find('N') == string::npos) {
- alleles.insert(haplotypes[i]);
- //cerr << " No Ns, skipping...\n";
- continue;
- }
-
- uint k = 0;
- uint j = i + 1;
- while (k < len - 1) {
- cur.push_back(haplotypes[j % len]);
- k++;
- j++;
- }
-
- //
- // Examine the haplotype alleles one SNP at a time. If we are able to uniquely
- // determine a second haplotype that encompasses the first
- // to, return it.
- //
- j = 0;
- while (cur.size() > 1 && j < max_len) {
-
- for (k = 0; k < cur.size(); k++) {
- cerr << "Comparing haplotypes[" << i << "]: '" << haplotypes[i] << "' to '" << cur[k] << " at position " << j << "'\n";
- if (haplotypes[i][j] == cur[k][j] || haplotypes[i][j] == 'N') {
- cerr << " Keeping this haplotype.\n";
- next.push_back(cur[k]);
- } else {
- cerr << " Discarding this haplotype.\n";
- }
- }
- cur = next;
- next.clear();
- j++;
- }
-
- //
- // If there is only one left, make sure what we have of the haplotype does match
- // and its not simply an erroneously called haplotype. If so, then this haplotype
- // is encompassed by another, longer haplotype and we do not need to keep it.
- //
- ncnt = 0;
- match = 0;
- if (cur.size() > 1) {
- cerr << "Discarding " << haplotypes[i] << "\n";
- continue;
- } else if (cur.size() == 1) {
- for (k = 0; k < max_len; k++)
- if (haplotypes[i][k] != 'N') ncnt++;
- for (k = 0; k < max_len; k++)
- if (cur[0][k] == haplotypes[i][k]) match++;
- if (match == ncnt) {
- cerr << "Discarding " << haplotypes[i] << "\n";
- continue;
- }
- }
-
- cerr << "Keeping " << haplotypes[i] << "\n";
- alleles.insert(haplotypes[i]);
+ //cerr << "Looking at haplotype[" << i << "]: " << haplotypes[i] << "\n";
+ //
+ // We will only look at strings that contain Ns.
+ //
+ if (haplotypes[i].find('N') == string::npos) {
+ alleles.insert(haplotypes[i]);
+ //cerr << " No Ns, skipping...\n";
+ continue;
+ }
+
+ uint k = 0;
+ uint j = i + 1;
+ while (k < len - 1) {
+ cur.push_back(haplotypes[j % len]);
+ k++;
+ j++;
+ }
+
+ //
+ // Examine the haplotype alleles one SNP at a time. If we are able to uniquely
+ // determine a second haplotype that encompasses the first
+ // to, return it.
+ //
+ j = 0;
+ while (cur.size() > 1 && j < max_len) {
+
+ for (k = 0; k < cur.size(); k++) {
+ cerr << "Comparing haplotypes[" << i << "]: '" << haplotypes[i] << "' to '" << cur[k] << " at position " << j << "'\n";
+ if (haplotypes[i][j] == cur[k][j] || haplotypes[i][j] == 'N') {
+ cerr << " Keeping this haplotype.\n";
+ next.push_back(cur[k]);
+ } else {
+ cerr << " Discarding this haplotype.\n";
+ }
+ }
+ cur = next;
+ next.clear();
+ j++;
+ }
+
+ //
+ // If there is only one left, make sure what we have of the haplotype does match
+ // and its not simply an erroneously called haplotype. If so, then this haplotype
+ // is encompassed by another, longer haplotype and we do not need to keep it.
+ //
+ ncnt = 0;
+ match = 0;
+ if (cur.size() > 1) {
+ cerr << "Discarding " << haplotypes[i] << "\n";
+ continue;
+ } else if (cur.size() == 1) {
+ for (k = 0; k < max_len; k++)
+ if (haplotypes[i][k] != 'N') ncnt++;
+ for (k = 0; k < max_len; k++)
+ if (cur[0][k] == haplotypes[i][k]) match++;
+ if (match == ncnt) {
+ cerr << "Discarding " << haplotypes[i] << "\n";
+ continue;
+ }
+ }
+
+ cerr << "Keeping " << haplotypes[i] << "\n";
+ alleles.insert(haplotypes[i]);
}
return 0;
}
-int
+int
populate_kmer_hash(map<int, CLocus *> &catalog, CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
{
map<int, CLocus *>::iterator it;
@@ -1497,15 +1514,15 @@ populate_kmer_hash(map<int, CLocus *> &catalog, CatKmerHashMap &kmer_map, vector
generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
for (j = 0; j < num_kmers; j++) {
- hash_key = kmers[j];
+ hash_key = kmers[j];
exists = kmer_map.count(hash_key) == 0 ? false : true;
kmer_map[hash_key].push_back(make_pair(allele->first, tag->id));
if (exists)
- delete [] kmers[j];
+ delete [] kmers[j];
else
- kmer_map_keys.push_back(hash_key);
+ kmer_map_keys.push_back(hash_key);
}
kmers.clear();
@@ -1517,8 +1534,8 @@ populate_kmer_hash(map<int, CLocus *> &catalog, CatKmerHashMap &kmer_map, vector
return 0;
}
-int
-write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstream &all_file)
+int
+write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstream &all_file)
{
vector<SNP *>::iterator snp_it;
map<string, int>::iterator all_it;
@@ -1526,23 +1543,23 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
string sources;
for (src_it = tag->sources.begin(); src_it != tag->sources.end(); src_it++) {
- stringstream s;
- s << (*src_it).first << "_" << (*src_it).second << ",";
- sources += s.str();
+ stringstream s;
+ s << (*src_it).first << "_" << (*src_it).second << ",";
+ sources += s.str();
}
sources = sources.substr(0, sources.length() - 1);
- cat_file <<
- "0" << "\t" <<
- batch_id << "\t" <<
- tag->id << "\t" <<
+ cat_file <<
+ "0" << "\t" <<
+ batch_id << "\t" <<
+ tag->id << "\t" <<
tag->loc.chr << "\t" <<
tag->loc.bp << "\t" <<
(tag->loc.strand == strand_plus ? "+" : "-") << "\t" <<
- "consensus" << "\t" <<
- "0" << "\t" <<
- sources << "\t" <<
- tag->con << "\t" <<
+ "consensus" << "\t" <<
+ "0" << "\t" <<
+ sources << "\t" <<
+ tag->con << "\t" <<
0 << "\t" << // These flags are unused in cstacks, but important in ustacks
0 << "\t" <<
0 << "\t" <<
@@ -1552,74 +1569,74 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
// Output the SNPs associated with the catalog tag
//
for (snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
- snp_file << "0" << "\t" <<
- batch_id << "\t" <<
- tag->id << "\t" <<
- (*snp_it)->col << "\t";
-
- switch((*snp_it)->type) {
- case snp_type_het:
- snp_file << "E\t";
- break;
- case snp_type_hom:
- snp_file << "O\t";
- break;
- default:
- snp_file << "U\t";
- break;
- }
-
- snp_file <<
- (*snp_it)->lratio << "\t" <<
- (*snp_it)->rank_1 << "\t" <<
- (*snp_it)->rank_2 << "\t" <<
- ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
- ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
+ snp_file << "0" << "\t" <<
+ batch_id << "\t" <<
+ tag->id << "\t" <<
+ (*snp_it)->col << "\t";
+
+ switch((*snp_it)->type) {
+ case snp_type_het:
+ snp_file << "E\t";
+ break;
+ case snp_type_hom:
+ snp_file << "O\t";
+ break;
+ default:
+ snp_file << "U\t";
+ break;
+ }
+
+ snp_file <<
+ (*snp_it)->lratio << "\t" <<
+ (*snp_it)->rank_1 << "\t" <<
+ (*snp_it)->rank_2 << "\t" <<
+ ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
+ ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
}
//
// Output the alleles associated with the two matched tags
//
for (all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
- all_file <<
- "0" << "\t" <<
- batch_id << "\t" <<
- tag->id << "\t" <<
- all_it->first << "\t" <<
- "0" << "\t" << // These two fields are used in the
+ all_file <<
+ "0" << "\t" <<
+ batch_id << "\t" <<
+ tag->id << "\t" <<
+ all_it->first << "\t" <<
+ "0" << "\t" << // These two fields are used in the
"0" << "\n"; // ustacks/pstacks output, not in cstacks.
return 0;
}
-int
-write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_file)
+int
+write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_file)
{
vector<SNP *>::iterator snp_it;
map<string, int>::iterator all_it;
vector<pair<int, int> >::iterator src_it;
string sources;
- stringstream sstr;
+ stringstream sstr;
for (src_it = tag->sources.begin(); src_it != tag->sources.end(); src_it++) {
- sstr << (*src_it).first << "_" << (*src_it).second << ",";
+ sstr << (*src_it).first << "_" << (*src_it).second << ",";
}
sources = sstr.str();
- sources = sources.substr(0, sources.length() - 1);
+ sources = sources.substr(0, sources.length() - 1);
sstr.str("");
- sstr <<
- "0" << "\t" <<
- batch_id << "\t" <<
- tag->id << "\t" <<
+ sstr <<
+ "0" << "\t" <<
+ batch_id << "\t" <<
+ tag->id << "\t" <<
tag->loc.chr << "\t" <<
tag->loc.bp << "\t" <<
(tag->loc.strand == strand_plus ? "+" : "-") << "\t" <<
- "consensus" << "\t" <<
- "0" << "\t" <<
- sources << "\t" <<
- tag->con << "\t" <<
+ "consensus" << "\t" <<
+ "0" << "\t" <<
+ sources << "\t" <<
+ tag->con << "\t" <<
0 << "\t" << // These flags are unused in cstacks, but important in ustacks
0 << "\t" <<
0 << "\t" <<
@@ -1632,29 +1649,29 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
// Output the SNPs associated with the catalog tag
//
for (snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
- sstr << "0" << "\t" <<
- batch_id << "\t" <<
- tag->id << "\t" <<
- (*snp_it)->col << "\t";
-
- switch((*snp_it)->type) {
- case snp_type_het:
- sstr << "E\t";
- break;
- case snp_type_hom:
- sstr << "O\t";
- break;
- default:
- sstr << "U\t";
- break;
- }
-
- sstr <<
- (*snp_it)->lratio << "\t" <<
- (*snp_it)->rank_1 << "\t" <<
- (*snp_it)->rank_2 << "\t" <<
- ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
- ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
+ sstr << "0" << "\t" <<
+ batch_id << "\t" <<
+ tag->id << "\t" <<
+ (*snp_it)->col << "\t";
+
+ switch((*snp_it)->type) {
+ case snp_type_het:
+ sstr << "E\t";
+ break;
+ case snp_type_hom:
+ sstr << "O\t";
+ break;
+ default:
+ sstr << "U\t";
+ break;
+ }
+
+ sstr <<
+ (*snp_it)->lratio << "\t" <<
+ (*snp_it)->rank_1 << "\t" <<
+ (*snp_it)->rank_2 << "\t" <<
+ ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
+ ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
}
gzputs(snp_file, sstr.str().c_str());
@@ -1664,21 +1681,21 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
// Output the alleles associated with the two matched tags
//
for (all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
- sstr
- << "0\t"
- << batch_id << "\t"
- << tag->id << "\t"
- << all_it->first << "\t"
- << 0 << "\t"
- << 0 << "\n";
+ sstr
+ << "0\t"
+ << batch_id << "\t"
+ << tag->id << "\t"
+ << all_it->first << "\t"
+ << 0 << "\t"
+ << 0 << "\n";
gzputs(all_file, sstr.str().c_str());
return 0;
}
-int
-initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
+int
+initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
{
map<int, CLocus *> tmp_catalog;
bool compressed = false;
@@ -1700,12 +1717,12 @@ initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
map<int, CLocus *>::iterator j;
int k = 1;
for (j = tmp_catalog.begin(); j != tmp_catalog.end(); j++) {
- j->second->sources.push_back(make_pair(sample.first, j->second->id));
+ j->second->sources.push_back(make_pair(sample.first, j->second->id));
j->second->id = k;
catalog[k] = j->second;
- k++;
+ k++;
}
cerr << " " << catalog.size() << " loci were newly added to the catalog.\n";
@@ -1713,8 +1730,8 @@ initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
return 1;
}
-int
-initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
+int
+initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
{
bool compressed;
@@ -1736,28 +1753,28 @@ initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
int sample_id, locus_id;
for (j = catalog.begin(); j != catalog.end(); j++) {
- loc = j->second;
-
- for (uint i = 0; i < loc->comp.size(); i++) {
- //
- // Parse the ID into sample ID / locus ID, given 43_1356, parse into
- // sample ID 43 and locus ID 1356.
- //
- for (p = loc->comp[i]; *p != '_' && *p != '\0'; p++);
- if (*p != '_')
- return 0;
- p++;
- sample_id = strtol(loc->comp[i], &q, 10);
- if (*q != '_')
- return 0;
-
- locus_id = strtol(p, &q, 10);
-
- if (*q != '\0')
- return 0;
-
- loc->sources.push_back(make_pair(sample_id, locus_id));
- }
+ loc = j->second;
+
+ for (uint i = 0; i < loc->comp.size(); i++) {
+ //
+ // Parse the ID into sample ID / locus ID, given 43_1356, parse into
+ // sample ID 43 and locus ID 1356.
+ //
+ for (p = loc->comp[i]; *p != '_' && *p != '\0'; p++);
+ if (*p != '_')
+ return 0;
+ p++;
+ sample_id = strtol(loc->comp[i], &q, 10);
+ if (*q != '_')
+ return 0;
+
+ locus_id = strtol(p, &q, 10);
+
+ if (*q != '\0')
+ return 0;
+
+ loc->sources.push_back(make_pair(sample_id, locus_id));
+ }
}
return 1;
@@ -1768,76 +1785,76 @@ int parse_command_line(int argc, char* argv[]) {
string sstr;
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"mmatches", no_argument, NULL, 'm'},
- {"genomic_loc", no_argument, NULL, 'g'},
- {"uniq_haplotypes", no_argument, NULL, 'u'},
- {"report_mmatches", no_argument, NULL, 'R'},
+ {"mmatches", no_argument, NULL, 'm'},
+ {"genomic_loc", no_argument, NULL, 'g'},
+ {"uniq_haplotypes", no_argument, NULL, 'u'},
+ {"report_mmatches", no_argument, NULL, 'R'},
{"gapped", no_argument, NULL, 'G'},
{"max_gaps", required_argument, NULL, 'X'},
{"min_aln_len", required_argument, NULL, 'x'},
- {"batch_id", required_argument, NULL, 'b'},
- {"ctag_dist", required_argument, NULL, 'n'},
- {"k_len", required_argument, NULL, 'k'},
- {"catalog", required_argument, NULL, 'c'},
- {"sample", required_argument, NULL, 's'},
- {"outpath", required_argument, NULL, 'o'},
- {"num_threads", required_argument, NULL, 'p'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hgvuRmGX:x:o:s:c:b:p:n:k:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'b':
- batch_id = is_integer(optarg);
- if (batch_id < 0) {
- cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 'n':
- ctag_dist = is_integer(optarg);
- break;
- case 'k':
- set_kmer_len = false;
- kmer_len = is_integer(optarg);
- break;
- case 'm':
- mult_matches = true;
- break;
- case 'R':
- report_mmatches = true;
- break;
- case 'g':
- search_type = genomic_loc;
- break;
- case 's':
- sstr = optarg;
- samples.push(make_pair(0, sstr));
- break;
- case 'c':
- catalog_path = optarg;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'u':
- require_uniq_haplotypes = true;
- break;
- case 'G':
+ {"batch_id", required_argument, NULL, 'b'},
+ {"ctag_dist", required_argument, NULL, 'n'},
+ {"k_len", required_argument, NULL, 'k'},
+ {"catalog", required_argument, NULL, 'c'},
+ {"sample", required_argument, NULL, 's'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"num_threads", required_argument, NULL, 'p'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hgvuRmGX:x:o:s:c:b:p:n:k:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'b':
+ batch_id = is_integer(optarg);
+ if (batch_id < 0) {
+ cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 'n':
+ ctag_dist = is_integer(optarg);
+ break;
+ case 'k':
+ set_kmer_len = false;
+ kmer_len = is_integer(optarg);
+ break;
+ case 'm':
+ mult_matches = true;
+ break;
+ case 'R':
+ report_mmatches = true;
+ break;
+ case 'g':
+ search_type = genomic_loc;
+ break;
+ case 's':
+ sstr = optarg;
+ samples.push(make_pair(0, sstr));
+ break;
+ case 'c':
+ catalog_path = optarg;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'u':
+ require_uniq_haplotypes = true;
+ break;
+ case 'G':
gapped_alignments = true;
break;
case 'X':
@@ -1849,17 +1866,17 @@ int parse_command_line(int argc, char* argv[]) {
case 'v':
version();
break;
- case 'p':
- num_threads = is_integer(optarg);
- break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
- default:
- help();
- abort();
- }
+ case 'p':
+ num_threads = is_integer(optarg);
+ break;
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+ default:
+ help();
+ abort();
+ }
}
if (set_kmer_len == false && (kmer_len < 5 || kmer_len > 31)) {
@@ -1869,14 +1886,14 @@ int parse_command_line(int argc, char* argv[]) {
if (samples.size() == 0) {
cerr << "You must specify at least one sample file.\n";
- help();
+ help();
}
- if (out_path.length() == 0)
- out_path = ".";
+ if (out_path.length() == 0)
+ out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
return 0;
}
@@ -1890,23 +1907,23 @@ void version() {
void help() {
std::cerr << "cstacks " << VERSION << "\n"
<< "cstacks -b batch_id -s sample_file [-s sample_file_2 ...] [-o path] [-g] [-n num] [-p num_threads] [--catalog path] [-h]" << "\n"
- << " b: MySQL ID of this batch." << "\n"
- << " s: filename prefix from which to load loci into the catalog." << "\n"
- << " o: output path to write results." << "\n"
+ << " b: MySQL ID of this batch." << "\n"
+ << " s: filename prefix from which to load loci into the catalog." << "\n"
+ << " o: output path to write results." << "\n"
<< " g: base catalog matching on genomic location, not sequence identity." << "\n"
- << " m: include tags in the catalog that match to more than one entry (default false)." << "\n"
+ << " m: include tags in the catalog that match to more than one entry (default false)." << "\n"
<< " n: number of mismatches allowed between sample tags when generating the catalog (default 1)." << "\n"
<< " p: enable parallel execution with num_threads threads.\n"
- << " h: display this help messsage." << "\n\n"
- << " Catalog editing:\n"
- << " --catalog <path>: provide the path to an existing catalog. cstacks will add data to this existing catalog.\n\n"
- << " Gapped assembly options:\n"
+ << " h: display this help messsage." << "\n\n"
+ << " Catalog editing:\n"
+ << " --catalog <path>: provide the path to an existing catalog. cstacks will add data to this existing catalog.\n\n"
+ << " Gapped assembly options:\n"
<< " --gapped: preform gapped alignments between stacks.\n"
<< " --max_gaps: number of gaps allowed between stacks before merging (default: 2).\n"
<< " --min_aln_len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n\n"
- << " Advanced options:\n"
- << " --k_len <len>: specify k-mer size for matching between between catalog loci (automatically calculated by default).\n"
- << " --report_mmatches: report query loci that match more than one catalog locus.\n";
+ << " Advanced options:\n"
+ << " --k_len <len>: specify k-mer size for matching between between catalog loci (automatically calculated by default).\n"
+ << " --report_mmatches: report query loci that match more than one catalog locus.\n";
exit(0);
}
diff --git a/src/estacks.cc b/src/estacks.cc
index 694f146..1ef5cf3 100644
--- a/src/estacks.cc
+++ b/src/estacks.cc
@@ -93,28 +93,28 @@ int call_alleles(MergedStack *mtag, vector<DNANSeq *> &reads) {
DNANSeq *d;
if (mtag->snps.size() == 0)
- return 0;
+ return 0;
for (row = 0; row < height; row++) {
- allele.clear();
-
- bool haplotype = true;
- for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
- d = reads[row];
- base = (*d)[(*snp)->col];
-
- //
- // Check to make sure the nucleotide at the location of this SNP is
- // of one of the two possible states the multinomial model called.
- //
- if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
- allele += base;
- else
- haplotype = false;
- }
-
- if (haplotype && allele.size() == mtag->snps.size())
- mtag->alleles[allele]++;
+ allele.clear();
+
+ bool haplotype = true;
+ for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
+ d = reads[row];
+ base = (*d)[(*snp)->col];
+
+ //
+ // Check to make sure the nucleotide at the location of this SNP is
+ // of one of the two possible states the multinomial model called.
+ //
+ if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
+ allele += base;
+ else
+ haplotype = false;
+ }
+
+ if (haplotype && allele.size() == mtag->snps.size())
+ mtag->alleles[allele]++;
}
return 0;
@@ -127,77 +127,77 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, PStack *> &unique,
//
map<int, MergedStack *>::iterator it;
vector<int> keys;
- for (it = merged.begin(); it != merged.end(); it++)
- keys.push_back(it->first);
+ for (it = merged.begin(); it != merged.end(); it++)
+ keys.push_back(it->first);
int i;
#pragma omp parallel private(i)
- {
- #pragma omp for schedule(dynamic)
- for (i = 0; i < (int) keys.size(); i++) {
- MergedStack *mtag;
- PStack *utag;
-
- mtag = merged[keys[i]];
-
- //
- // Create a two-dimensional array, each row containing one read. For
- // each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
- //
- vector<int>::iterator j;
- vector<DNANSeq *> reads;
-
- for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
- utag = unique[*j];
-
- for (uint k = 0; k < utag->count; k++) {
- reads.push_back(utag->seq);
- }
- }
-
- //
- // Iterate over each column of the array and call the consensus base.
- //
- int row, col;
- int length = reads[0]->size();
- int height = reads.size();
- string con;
- map<char, int> nuc;
- map<char, int>::iterator max, n;
- DNANSeq *d;
-
- for (col = 0; col < length; col++) {
- nuc['A'] = 0;
- nuc['C'] = 0;
- nuc['G'] = 0;
- nuc['T'] = 0;
-
- for (row = 0; row < height; row++) {
- d = reads[row];
- nuc[(*d)[col]]++;
- }
-
- //
- // Find the base with a plurality of occurances and call it.
- //
- max = nuc.end();
-
- for (n = nuc.begin(); n != nuc.end(); n++) {
- if (max == nuc.end() || n->second > max->second)
- max = n;
- }
- con += max->second == 0 ? 'N' : max->first;
-
- // Search this column for the presence of a SNP
- if (invoke_model)
- model_type == snp ?
+ {
+ #pragma omp for schedule(dynamic)
+ for (i = 0; i < (int) keys.size(); i++) {
+ MergedStack *mtag;
+ PStack *utag;
+
+ mtag = merged[keys[i]];
+
+ //
+ // Create a two-dimensional array, each row containing one read. For
+ // each unique tag that has been merged together, add the sequence for
+ // that tag into our array as many times as it originally occurred.
+ //
+ vector<int>::iterator j;
+ vector<DNANSeq *> reads;
+
+ for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
+ utag = unique[*j];
+
+ for (uint k = 0; k < utag->count; k++) {
+ reads.push_back(utag->seq);
+ }
+ }
+
+ //
+ // Iterate over each column of the array and call the consensus base.
+ //
+ int row, col;
+ int length = reads[0]->size();
+ int height = reads.size();
+ string con;
+ map<char, int> nuc;
+ map<char, int>::iterator max, n;
+ DNANSeq *d;
+
+ for (col = 0; col < length; col++) {
+ nuc['A'] = 0;
+ nuc['C'] = 0;
+ nuc['G'] = 0;
+ nuc['T'] = 0;
+
+ for (row = 0; row < height; row++) {
+ d = reads[row];
+ nuc[(*d)[col]]++;
+ }
+
+ //
+ // Find the base with a plurality of occurances and call it.
+ //
+ max = nuc.end();
+
+ for (n = nuc.begin(); n != nuc.end(); n++) {
+ if (max == nuc.end() || n->second > max->second)
+ max = n;
+ }
+ con += max->second == 0 ? 'N' : max->first;
+
+ // Search this column for the presence of a SNP
+ if (invoke_model)
+ model_type == snp ?
call_multinomial_snp(mtag, col, nuc, record_hom) :
call_multinomial_fixed(mtag, col, nuc);
- }
+ }
- if (invoke_model) {
- call_alleles(mtag, reads);
+ if (invoke_model) {
+ call_alleles(mtag, reads);
if (model_type == fixed) {
//
@@ -210,8 +210,8 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, PStack *> &unique,
}
}
- mtag->add_consensus(con.c_str());
- }
+ mtag->add_consensus(con.c_str());
+ }
}
return 0;
@@ -224,10 +224,10 @@ int count_raw_reads(map<int, PStack *> &unique, map<int, MergedStack *> &merged)
long int m = 0;
for (it = merged.begin(); it != merged.end(); it++) {
- for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
- tag = unique[*k];
- m += tag->count;
- }
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ m += tag->count;
+ }
m += it->second->remtags.size();
}
@@ -264,106 +264,106 @@ int write_sql(map<int, MergedStack *> &m, map<int, PStack *> &u) {
tag_id = 0;
for (i = m.begin(); i != m.end(); i++) {
- tag_1 = i->second;
-
- // First write the consensus sequence
- for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
-
- float total = 0;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
- //if (u[*k]->seq[(*s)->col] == 'N') continue;
- total += u[*k]->count;
- }
-
- if (total < min_stack_cov) continue;
-
- tags << "0" << "\t"
- << sql_id << "\t"
- << tag_id << "\t"
- << tag_1->loc.chr << "\t"
- << tag_1->loc.bp + (*s)->col << "\t"
- << "consensus\t" << "\t\t"
- << tag_1->con[(*s)->col] << "\t"
- << tag_1->deleveraged << "\t"
- << tag_1->blacklisted << "\t"
- << tag_1->lumberjackstack << "\n";
-
- // Now write out the components of each unique tag merged into this one.
- comp_id = 0;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
- tag_2 = u[*k];
-
- //if (tag_2->seq[(*s)->col] == 'N') continue;
-
- for (j = tag_2->map.begin(); j != tag_2->map.end(); j++) {
- tags << "0" << "\t"
- << sql_id << "\t"
- << tag_id << "\t\t\t"
- << "primary\t"
- << comp_id << "\t"
- << *j << "\t"
- << (*tag_2->seq)[(*s)->col] << "\t\t\t\n";
- }
- comp_id++;
- }
-
- snps << "0" << "\t"
- << sql_id << "\t"
- << tag_id << "\t"
- << 0 << "\t"
- << (*s)->lratio << "\t"
- << (*s)->rank_1 << "\t"
- << (*s)->rank_2 << "\n";
-
- // Write the expressed alleles seen for the recorded SNPs and
- // the percentage of tags a particular allele occupies.
- map<char, int> allele;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
- if ((*u[*k]->seq)[(*s)->col] != (*s)->rank_1 &&
- (*u[*k]->seq)[(*s)->col] != (*s)->rank_2)
- continue;
- allele[(*u[*k]->seq)[(*s)->col]] += u[*k]->count;
- }
-
-
- char pct[id_len];
- map<char, int>::iterator a;
- for (a = allele.begin(); a != allele.end(); a++) {
- sprintf(pct, "%.2f", ((a->second/total) * 100));
- alle << "0" << "\t" << sql_id << "\t" << tag_id << "\t" << a->first << "\t" << pct << "\t" << a->second << "\n";
- }
- tag_id++;
- }
+ tag_1 = i->second;
+
+ // First write the consensus sequence
+ for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
+
+ float total = 0;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
+ //if (u[*k]->seq[(*s)->col] == 'N') continue;
+ total += u[*k]->count;
+ }
+
+ if (total < min_stack_cov) continue;
+
+ tags << "0" << "\t"
+ << sql_id << "\t"
+ << tag_id << "\t"
+ << tag_1->loc.chr << "\t"
+ << tag_1->loc.bp + (*s)->col << "\t"
+ << "consensus\t" << "\t\t"
+ << tag_1->con[(*s)->col] << "\t"
+ << tag_1->deleveraged << "\t"
+ << tag_1->blacklisted << "\t"
+ << tag_1->lumberjackstack << "\n";
+
+ // Now write out the components of each unique tag merged into this one.
+ comp_id = 0;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
+ tag_2 = u[*k];
+
+ //if (tag_2->seq[(*s)->col] == 'N') continue;
+
+ for (j = tag_2->map.begin(); j != tag_2->map.end(); j++) {
+ tags << "0" << "\t"
+ << sql_id << "\t"
+ << tag_id << "\t\t\t"
+ << "primary\t"
+ << comp_id << "\t"
+ << *j << "\t"
+ << (*tag_2->seq)[(*s)->col] << "\t\t\t\n";
+ }
+ comp_id++;
+ }
+
+ snps << "0" << "\t"
+ << sql_id << "\t"
+ << tag_id << "\t"
+ << 0 << "\t"
+ << (*s)->lratio << "\t"
+ << (*s)->rank_1 << "\t"
+ << (*s)->rank_2 << "\n";
+
+ // Write the expressed alleles seen for the recorded SNPs and
+ // the percentage of tags a particular allele occupies.
+ map<char, int> allele;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
+ if ((*u[*k]->seq)[(*s)->col] != (*s)->rank_1 &&
+ (*u[*k]->seq)[(*s)->col] != (*s)->rank_2)
+ continue;
+ allele[(*u[*k]->seq)[(*s)->col]] += u[*k]->count;
+ }
+
+
+ char pct[id_len];
+ map<char, int>::iterator a;
+ for (a = allele.begin(); a != allele.end(); a++) {
+ sprintf(pct, "%.2f", ((a->second/total) * 100));
+ alle << "0" << "\t" << sql_id << "\t" << tag_id << "\t" << a->first << "\t" << pct << "\t" << a->second << "\n";
+ }
+ tag_id++;
+ }
}
for (i = m.begin(); i != m.end(); i++) {
- tag_1 = i->second;
+ tag_1 = i->second;
- float total = 0;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++)
- total += u[*k]->count;
+ float total = 0;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++)
+ total += u[*k]->count;
- if (total < min_stack_cov) continue;
+ if (total < min_stack_cov) continue;
- // First write the consensus sequence
- pile << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ // First write the consensus sequence
+ pile << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
<< tag_1->loc.chr << "\t"
<< tag_1->loc.bp << "\t"
- << "consensus\t" << "\t\t"
- << tag_1->con << "\n";
-
- // Now write out the components of each unique tag merged into this one.
- comp_id = 0;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
- tag_2 = u[*k];
-
- for (j = tag_2->map.begin(); j != tag_2->map.end(); j++) {
- pile << "0" << "\t" << sql_id << "\t" << tag_1->id << "\t\t\t" << "primary\t" << comp_id << "\t" << *j << "\t" << tag_2->seq << "\t\t\t\n";
- }
- comp_id++;
- }
+ << "consensus\t" << "\t\t"
+ << tag_1->con << "\n";
+
+ // Now write out the components of each unique tag merged into this one.
+ comp_id = 0;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
+ tag_2 = u[*k];
+
+ for (j = tag_2->map.begin(); j != tag_2->map.end(); j++) {
+ pile << "0" << "\t" << sql_id << "\t" << tag_1->id << "\t\t\t" << "primary\t" << comp_id << "\t" << *j << "\t" << tag_2->seq << "\t\t\t\n";
+ }
+ comp_id++;
+ }
}
tags.close();
@@ -396,14 +396,14 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
it_old = merged.begin();
for (k = locations.begin(); k != locations.end(); k++) {
- m = new MergedStack;
- m->id = global_id;
+ m = new MergedStack;
+ m->id = global_id;
- //
+ //
// Record the consensus and physical location for this stack.
//
s = k->second.begin();
- m->add_consensus(unique[*s]->seq);
+ m->add_consensus(unique[*s]->seq);
strncpy(m->loc.chr, unique[*s]->loc.chr, id_len - 1);
m->loc.chr[id_len] = '\0';
m->loc.bp = unique[*s]->loc.bp;
@@ -417,11 +417,11 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
m->utags.push_back(u->id);
}
- // Insert the new MergedStack giving a hint as to which position
- // to insert it at.
- it_new = merged.insert(it_old, pair<int, MergedStack *>(global_id, m));
- it_old = it_new;
- global_id++;
+ // Insert the new MergedStack giving a hint as to which position
+ // to insert it at.
+ it_new = merged.insert(it_old, pair<int, MergedStack *>(global_id, m));
+ it_old = it_new;
+ global_id++;
}
cerr << " Merged " << unique.size() << " unique Stacks into " << merged.size() << " loci.\n";
@@ -437,7 +437,7 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
int reduce_radtags(HashMap &radtags, map<int, PStack *> &unique) {
HashMap::iterator it;
vector<Seq *>::iterator sit;
-
+
PStack *u;
int global_id = 1;
@@ -499,7 +499,7 @@ int load_radtags(string in_file, HashMap &radtags) {
while ((c = fh->next_seq()) != NULL) {
if (i % 10000 == 0) cerr << "Loading aligned sequence " << i << " \r";
- radtags[c->seq].push_back(c);
+ radtags[c->seq].push_back(c);
i++;
}
@@ -528,14 +528,14 @@ int dump_stacks(map<int, PStack *> &u) {
for (it = u.begin(); it != u.end(); it++) {
- cerr << "Stack ID: " << (*it).second->id << "\n"
- << " Seq: " << (*it).second->seq << "\n"
- << " IDs: ";
+ cerr << "Stack ID: " << (*it).second->id << "\n"
+ << " Seq: " << (*it).second->seq << "\n"
+ << " IDs: ";
- for (fit = (*it).second->map.begin(); fit != (*it).second->map.end(); fit++)
- cerr << *fit << " ";
+ for (fit = (*it).second->map.begin(); fit != (*it).second->map.end(); fit++)
+ cerr << *fit << " ";
- cerr << "\n\n";
+ cerr << "\n\n";
}
return 0;
@@ -548,24 +548,24 @@ int dump_merged_stacks(map<int, MergedStack *> &m) {
for (it = m.begin(); it != m.end(); it++) {
- cerr << "MergedStack ID: " << it->second->id << "\n"
- << " Consensus: ";
- if (it->second->con != NULL)
- cerr << it->second->con << "\n";
- else
- cerr << "\n";
- cerr << " IDs: ";
+ cerr << "MergedStack ID: " << it->second->id << "\n"
+ << " Consensus: ";
+ if (it->second->con != NULL)
+ cerr << it->second->con << "\n";
+ else
+ cerr << "\n";
+ cerr << " IDs: ";
- for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
- cerr << (*fit) << " ";
+ for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
+ cerr << (*fit) << " ";
- cerr << "\n"
- << " Distances: ";
+ cerr << "\n"
+ << " Distances: ";
- for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
- cerr << (*pit).first << ": " << (*pit).second << ", ";
+ for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
+ cerr << (*pit).first << ": " << (*pit).second << ", ";
- cerr << "\n\n";
+ cerr << "\n\n";
}
return 0;
@@ -573,37 +573,37 @@ int dump_merged_stacks(map<int, MergedStack *> &m) {
int parse_command_line(int argc, char* argv[]) {
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"rec_hom", no_argument, NULL, 'O'},
- {"infile_type", required_argument, NULL, 't'},
- {"outfile_type", required_argument, NULL, 'y'},
- {"file", required_argument, NULL, 'f'},
- {"outpath", required_argument, NULL, 'o'},
- {"id", required_argument, NULL, 'i'},
- {"min_cov", required_argument, NULL, 'm'},
- {"num_threads", required_argument, NULL, 'p'},
- {"bc_err_freq", required_argument, NULL, 'e'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvOf:o:i:e:p:m:s:f:t:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 't':
+ {"rec_hom", no_argument, NULL, 'O'},
+ {"infile_type", required_argument, NULL, 't'},
+ {"outfile_type", required_argument, NULL, 'y'},
+ {"file", required_argument, NULL, 'f'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"id", required_argument, NULL, 'i'},
+ {"min_cov", required_argument, NULL, 'm'},
+ {"num_threads", required_argument, NULL, 'p'},
+ {"bc_err_freq", required_argument, NULL, 'e'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvOf:o:i:e:p:m:s:f:t:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 't':
if (strcmp(optarg, "bowtie") == 0)
in_file_type = FileT::bowtie;
else if (strcmp(optarg, "sam") == 0)
@@ -612,57 +612,57 @@ int parse_command_line(int argc, char* argv[]) {
in_file_type = FileT::tsv;
else
in_file_type = FileT::unknown;
- break;
- case 'f':
- in_file = optarg;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'i':
- sql_id = atoi(optarg);
- break;
- case 'm':
- min_stack_cov = atoi(optarg);
- break;
- case 'e':
- barcode_err_freq = atof(optarg);
- break;
- case 'p':
- num_threads = atoi(optarg);
- break;
+ break;
+ case 'f':
+ in_file = optarg;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'i':
+ sql_id = atoi(optarg);
+ break;
+ case 'm':
+ min_stack_cov = atoi(optarg);
+ break;
+ case 'e':
+ barcode_err_freq = atof(optarg);
+ break;
+ case 'p':
+ num_threads = atoi(optarg);
+ break;
case 'O':
- record_hom = true;
+ record_hom = true;
break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
-
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (in_file.length() == 0 || in_file_type == FileT::unknown) {
- cerr << "You must specify an input file of a supported type.\n";
- help();
+ cerr << "You must specify an input file of a supported type.\n";
+ help();
}
- if (out_path.length() == 0)
- out_path = ".";
+ if (out_path.length() == 0)
+ out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
if (model_type == fixed && barcode_err_freq == 0) {
- cerr << "You must specify the barcode error frequency.\n";
- help();
+ cerr << "You must specify the barcode error frequency.\n";
+ help();
}
return 0;
@@ -678,14 +678,14 @@ void help() {
std::cerr << "estacks " << VERSION << "\n"
<< "estacks -t file_type -f file_path [-o path] [-i id] [-m min_cov] [-r] [-e errfreq] [-p num_threads] [-h]" << "\n"
<< " p: enable parallel execution with num_threads threads.\n"
- << " t: input file Type. Supported types: bowtie, sam.\n"
+ << " t: input file Type. Supported types: bowtie, sam.\n"
<< " f: input file path.\n"
- << " o: output path to write results.\n"
- << " i: SQL ID to insert into the output to identify this sample.\n"
- << " O: record homozygotes along with heterozygote SNPs.\n"
- << " m: minimum depth of coverage to report a stack (default 1).\n"
- << " e: specify the barcode error frequency (0 < e < 1) if using the 'fixed' model.\n"
- << " h: display this help messsage." << "\n\n";
+ << " o: output path to write results.\n"
+ << " i: SQL ID to insert into the output to identify this sample.\n"
+ << " O: record homozygotes along with heterozygote SNPs.\n"
+ << " m: minimum depth of coverage to report a stack (default 1).\n"
+ << " e: specify the barcode error frequency (0 < e < 1) if using the 'fixed' model.\n"
+ << " h: display this help messsage." << "\n\n";
exit(0);
}
diff --git a/src/estacks.h b/src/estacks.h
index f551835..398b1a8 100644
--- a/src/estacks.h
+++ b/src/estacks.h
@@ -52,7 +52,7 @@ using std::unordered_map;
using google::sparse_hash_map;
#endif
-#include "constants.h"
+#include "constants.h"
#include "stacks.h" // Major data structures for holding stacks
#include "kmers.h"
#include "mstack.h"
diff --git a/src/export_formats.cc b/src/export_formats.cc
index 6397f57..51bdbe4 100644
--- a/src/export_formats.cc
+++ b/src/export_formats.cc
@@ -23,8 +23,8 @@ extern set<string> debug_flags;
extern MetaPopInfo mpopi;
extern map<string, int> renz_olap;
-int
-write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
+int
+write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
{
string file = out_path + out_prefix + ".markers.tsv";
@@ -38,9 +38,9 @@ write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
fh.precision(fieldw);
fh.setf(std::ios::fixed);
- fh << "# SQL ID" << "\t"
- << "Batch ID" << "\t"
- << "Catalog Locus ID" << "\t"
+ fh << "# SQL ID" << "\t"
+ << "Batch ID" << "\t"
+ << "Catalog Locus ID" << "\t"
<< "\t"
<< "Total Genotypes" << "\t"
<< "Max" << "\t"
@@ -73,9 +73,9 @@ write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
gtype_map << j->first << ":" << j->second << ";";
}
- fh << 0 << "\t"
- << batch_id << "\t"
- << loc->id << "\t"
+ fh << 0 << "\t"
+ << batch_id << "\t"
+ << loc->id << "\t"
<< "\t" // Marker
<< total << "\t"
<< max << "\t"
@@ -91,11 +91,11 @@ write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
return 0;
}
-int
+int
write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
{
//
- // Write a FASTA file containing each allele from each locus from
+ // Write a FASTA file containing each allele from each locus from
// each sample in the population.
//
string file = out_path + out_prefix + ".fa";
@@ -122,7 +122,7 @@ write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
strcpy(seq, loc->con);
for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL)
+ if (d[j] == NULL)
continue;
for (uint k = 0; k < d[j]->obshap.size(); k++) {
@@ -132,9 +132,9 @@ write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
}
- fh << ">CLocus_" << loc->id
+ fh << ">CLocus_" << loc->id
<< "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
+ << "_Locus_" << d[j]->id
<< "_Allele_" << k
<< " [" << mpopi.samples()[j].name;
@@ -153,11 +153,11 @@ write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
return 0;
}
-int
+int
write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
{
//
- // Write a FASTA file containing each allele from each locus from
+ // Write a FASTA file containing each allele from each locus from
// each sample in the population.
//
string file = out_path + out_prefix + ".strict.fa";
@@ -184,9 +184,9 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
strcpy(seq, loc->con);
for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL)
+ if (d[j] == NULL)
continue;
- if (d[j]->obshap.size() > 2)
+ if (d[j]->obshap.size() > 2)
continue;
if (d[j]->obshap.size() == 1) {
@@ -196,9 +196,9 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
seq[col] = col < loc->len ? d[j]->obshap[0][i] : loc->con[col];
}
- fh << ">CLocus_" << loc->id
+ fh << ">CLocus_" << loc->id
<< "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
+ << "_Locus_" << d[j]->id
<< "_Allele_" << 0
<< " [" << mpopi.samples()[j].name;
if (strcmp(loc->loc.chr, "un") != 0)
@@ -206,9 +206,9 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
fh << "]\n"
<< seq << "\n";
- fh << ">CLocus_" << loc->id
+ fh << ">CLocus_" << loc->id
<< "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
+ << "_Locus_" << d[j]->id
<< "_Allele_" << 1
<< " [" << mpopi.samples()[j].name;
if (strcmp(loc->loc.chr, "un") != 0)
@@ -223,9 +223,9 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
}
- fh << ">CLocus_" << loc->id
+ fh << ">CLocus_" << loc->id
<< "_Sample_" << mpopi.samples()[j].id
- << "_Locus_" << d[j]->id
+ << "_Locus_" << d[j]->id
<< "_Allele_" << k
<< " [" << mpopi.samples()[j].name;
if (strcmp(loc->loc.chr, "un") != 0)
@@ -245,10 +245,10 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
return 0;
}
-int
-write_vcf_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, pair<merget, int> > &merge_map, ofstream &log_fh)
+int
+write_vcf_ordered(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+ map<int, pair<merget, int> > &merge_map, ofstream &log_fh)
{
//
// Write a VCF file as defined here: http://www.1000genomes.org/node/101
@@ -400,10 +400,10 @@ write_vcf_ordered(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_vcf(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, pair<merget, int> > &merge_map)
+int
+write_vcf(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+ map<int, pair<merget, int> > &merge_map)
{
//
// Write a VCF file as defined here: http://www.1000genomes.org/node/101
@@ -562,8 +562,8 @@ write_vcf(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_vcf_haplotypes(map<int, CSLocus *> &catalog,
+int
+write_vcf_haplotypes(map<int, CSLocus *> &catalog,
PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
@@ -621,7 +621,7 @@ write_vcf_haplotypes(map<int, CSLocus *> &catalog,
h.second /= n_alleles;
//
- // Order the haplotypes according to most frequent. Record the ordered position or each
+ // Order the haplotypes according to most frequent. Record the ordered position or each
// haplotype and convert them from counts to frequencies.
//
@@ -699,9 +699,9 @@ write_vcf_haplotypes(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_genepop(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_genepop(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
@@ -751,7 +751,7 @@ write_genepop(map<int, CSLocus *> &catalog,
col = loc->snps[j]->col;
t = psum->locus_tally(loc->id);
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
cnt++;
}
@@ -764,10 +764,10 @@ write_genepop(map<int, CSLocus *> &catalog,
col = loc->snps[j]->col;
t = psum->locus_tally(loc->id);
- //
+ //
// If this site is fixed in all populations or has too many alleles don't output it.
//
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
i++;
fh << loc->id << "_" << col;
@@ -800,7 +800,7 @@ write_genepop(map<int, CSLocus *> &catalog,
for (i = 0; i < loc->snps.size(); i++) {
uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
if (s[p]->nucs[col].incompatible_site ||
@@ -850,10 +850,10 @@ write_genepop(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_genepop_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
+int
+write_genepop_ordered(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
ofstream &log_fh)
{
//
@@ -986,9 +986,9 @@ write_genepop_ordered(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_structure(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_structure(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
@@ -1071,10 +1071,10 @@ write_structure(map<int, CSLocus *> &catalog,
for (uint i = 0; i < loc->snps.size(); i++) {
uint col = loc->snps[i]->col;
- //
+ //
// If this site is fixed in all populations or has too many alleles don't output it.
//
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
if (s[p]->nucs[col].incompatible_site ||
@@ -1128,7 +1128,7 @@ write_structure(map<int, CSLocus *> &catalog,
for (uint i = 0; i < loc->snps.size(); i++) {
uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
if (s[p]->nucs[col].incompatible_site ||
@@ -1162,10 +1162,10 @@ write_structure(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_structure_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
+int
+write_structure_ordered(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
ofstream &log_fh)
{
//
@@ -1327,13 +1327,13 @@ write_structure_ordered(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_hzar(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_hzar(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
- // Write a Hybrid Zone Analysis using R (HZAR) file as defined here:
+ // Write a Hybrid Zone Analysis using R (HZAR) file as defined here:
// http://cran.r-project.org/web/packages/hzar/hzar.pdf
//
string file = out_path + out_prefix + ".hzar.csv";
@@ -1401,10 +1401,10 @@ write_hzar(map<int, CSLocus *> &catalog,
for (uint i = 0; i < loc->snps.size(); i++) {
uint col = loc->snps[i]->col;
- //
+ //
// If this site is fixed in all populations or has too many alleles don't output it.
//
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
if (s[p]->nucs[col].num_indv == 0 ||
@@ -1413,7 +1413,7 @@ write_hzar(map<int, CSLocus *> &catalog,
fh << ",0,0,0";
continue;
}
-
+
if (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc)
fh << "," << s[p]->nucs[col].p << "," << 1 - s[p]->nucs[col].p << ",";
else
@@ -1433,9 +1433,9 @@ write_hzar(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_treemix(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_treemix(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
@@ -1493,28 +1493,28 @@ write_treemix(map<int, CSLocus *> &catalog,
stringstream sstr;
for (auto& pop : mpopi.pops())
sstr << pop.name << " ";
-
+
fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
double p_freq, p_cnt, q_cnt, allele_cnt;
long int line = 1;
-
+
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
for (uint pos = 0; pos < it->second.size(); pos++) {
loc = it->second[pos];
s = psum->locus(loc->id);
t = psum->locus_tally(loc->id);
-
+
for (uint i = 0; i < loc->snps.size(); i++) {
uint col = loc->snps[i]->col;
sstr.str("");
- //
+ //
// If this site is fixed in all populations or has too many alleles don't output it.
//
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
for (size_t p=0; p<mpopi.pops().size(); ++p) {
@@ -1525,11 +1525,11 @@ write_treemix(map<int, CSLocus *> &catalog,
sstr << "0,0 ";
continue;
}
-
+
p_freq = (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc) ?
s[p]->nucs[col].p :
1 - s[p]->nucs[col].p;
-
+
allele_cnt = s[p]->nucs[col].num_indv * 2;
p_cnt = round(allele_cnt * p_freq);
q_cnt = allele_cnt - p_cnt;
@@ -1548,15 +1548,15 @@ write_treemix(map<int, CSLocus *> &catalog,
fh.close();
log_fh.close();
-
+
cerr << "done.\n";
return 0;
}
-int
-write_fastphase(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_fastphase(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
@@ -1637,7 +1637,7 @@ write_fastphase(map<int, CSLocus *> &catalog,
//
string snp_markers, gtypes_str;
snp_markers.assign(total_sites, 'S');
- fh << snp_markers << '\n';
+ fh << snp_markers << '\n';
//
// Now output each sample name followed by a new line, then all of the genotypes for that sample
@@ -1665,10 +1665,10 @@ write_fastphase(map<int, CSLocus *> &catalog,
d = pmap->locus(loc->id);
t = psum->locus_tally(loc->id);
- //
+ //
// If this site is fixed in all populations or has too many alleles don't output it.
//
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
if (s[p]->nucs[col].incompatible_site ||
@@ -1719,7 +1719,7 @@ write_fastphase(map<int, CSLocus *> &catalog,
d = pmap->locus(loc->id);
t = psum->locus_tally(loc->id);
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
if (s[p]->nucs[col].incompatible_site ||
@@ -1756,16 +1756,16 @@ write_fastphase(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_phase(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_phase(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
// Write a PHASE file as defined here: http://stephenslab.uchicago.edu/software.html
//
- // Data will be written as mixture of multiple allele, linked RAD sites
- // (SNPs within a single RAD locus are already phased), and bi-allelic SNPs. We
+ // Data will be written as mixture of multiple allele, linked RAD sites
+ // (SNPs within a single RAD locus are already phased), and bi-allelic SNPs. We
// will write one file per chromosome.
//
cerr << "Writing population data to PHASE files...";
@@ -1814,7 +1814,7 @@ write_phase(map<int, CSLocus *> &catalog,
d = pmap->locus(loc->id);
for (int j = 0; j < pmap->sample_cnt(); j++) {
if (d[j] != NULL &&
- d[j]->obshap.size() > 0 &&
+ d[j]->obshap.size() > 0 &&
d[j]->obshap.size() <= 2) {
//
// Data exists, and there are the correct number of haplotypes.
@@ -1873,7 +1873,7 @@ write_phase(map<int, CSLocus *> &catalog,
// Output all the loci for this sample, printing only the p allele
//
fh << mpopi.samples()[j].name << "\n";
-
+
gtypes.str("");
for (uint pos = 0; pos < ordered_loci.size(); pos++) {
loc = catalog[ordered_loci[pos].id];
@@ -1893,7 +1893,7 @@ write_phase(map<int, CSLocus *> &catalog,
gtypes << "-1 ";
} else {
//
- // Data exists, output the first haplotype. We will assume the haplotypes are
+ // Data exists, output the first haplotype. We will assume the haplotypes are
// numbered by their position in the loc->strings vector.
//
if (d[j]->obshap.size() > 2) {
@@ -1907,7 +1907,7 @@ write_phase(map<int, CSLocus *> &catalog,
gtypes << k + 1 << " ";
}
if (found == false)
- cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
+ cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
<< mpopi.samples()[j].name << "; catalog locus: " << loc->id << "\n";
}
}
@@ -1972,7 +1972,7 @@ write_phase(map<int, CSLocus *> &catalog,
gtypes << "-1 ";
} else {
//
- // Data exists, output the second haplotype. We will assume the haplotypes are
+ // Data exists, output the second haplotype. We will assume the haplotypes are
// numbered by their position in the loc->strings vector.
//
if (d[j]->obshap.size() > 2) {
@@ -1986,7 +1986,7 @@ write_phase(map<int, CSLocus *> &catalog,
gtypes << k + 1 << " ";
}
if (found == false)
- cerr << "Unable to find haplotype " << d[j]->obshap[1] << " from individual "
+ cerr << "Unable to find haplotype " << d[j]->obshap[1] << " from individual "
<< mpopi.samples()[j].name << "; catalog locus: " << loc->id << "\n";
} else {
found = false;
@@ -1996,7 +1996,7 @@ write_phase(map<int, CSLocus *> &catalog,
gtypes << k + 1 << " ";
}
if (found == false)
- cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
+ cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
<< mpopi.samples()[j].name << "; catalog locus: " << loc->id << "\n";
}
}
@@ -2038,9 +2038,9 @@ write_phase(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_plink(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_plink(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
@@ -2097,7 +2097,7 @@ write_plink(map<int, CSLocus *> &catalog,
if (t->nucs[col].allele_cnt == 2)
fh << chr << "\t"
<< loc->id << "_" << col << "\t"
- << "0\t"
+ << "0\t"
<< loc->sort_bp(col) +1 << "\n";
}
}
@@ -2142,14 +2142,14 @@ write_plink(map<int, CSLocus *> &catalog,
s = psum->locus(loc->id);
d = pmap->locus(loc->id);
t = psum->locus_tally(loc->id);
-
+
for (uint i = 0; i < loc->snps.size(); i++) {
uint col = loc->snps[i]->col;
- //
+ //
// If this site is fixed in all populations or has too many alleles don't output it.
//
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
//
// Output the p and q alleles
@@ -2200,9 +2200,9 @@ write_plink(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_beagle(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_beagle(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
@@ -2309,10 +2309,10 @@ write_beagle(map<int, CSLocus *> &catalog,
t = psum->locus_tally(loc->id);
col = loc->snps[ordered_loci[pos].snp_index]->col;
- //
+ //
// If this site is fixed in all populations or has too many alleles don't output it.
//
- if (t->nucs[col].allele_cnt != 2)
+ if (t->nucs[col].allele_cnt != 2)
continue;
//
@@ -2324,9 +2324,9 @@ write_beagle(map<int, CSLocus *> &catalog,
//
// Output this locus to the markers file.
//
- mfh << loc->id << "_" << col << "\t"
+ mfh << loc->id << "_" << col << "\t"
<< loc->sort_bp(col) +1 << "\t"
- << t->nucs[col].p_allele << "\t"
+ << t->nucs[col].p_allele << "\t"
<< t->nucs[col].q_allele << "\n";
fh << "M" << "\t" << loc->id << "_" << col;
@@ -2402,13 +2402,13 @@ write_beagle(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_beagle_phased(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_beagle_phased(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
- // Write a Beagle file as a set of haplotpyes as defined here:
+ // Write a Beagle file as a set of haplotpyes as defined here:
// http://faculty.washington.edu/browning/beagle/beagle.html
//
// We will write one file per chromosome.
@@ -2455,7 +2455,7 @@ write_beagle_phased(map<int, CSLocus *> &catalog,
d = pmap->locus(loc->id);
for (int j = 0; j < pmap->sample_cnt(); j++) {
if (d[j] != NULL &&
- d[j]->obshap.size() > 0 &&
+ d[j]->obshap.size() > 0 &&
d[j]->obshap.size() <= 2) {
//
// Data exists, and their are the corrent number of haplotypes.
@@ -2538,7 +2538,7 @@ write_beagle_phased(map<int, CSLocus *> &catalog,
//
// Output this locus to the markers file.
//
- mfh << loc->id << "\t"
+ mfh << loc->id << "\t"
<< loc->sort_bp() +1;
for (uint j = 0; j < loc->strings.size(); j++)
mfh << "\t" << loc->strings[j].first;
@@ -2560,7 +2560,7 @@ write_beagle_phased(map<int, CSLocus *> &catalog,
fh << "\t" << "?" << "\t" << "?";
} else {
//
- // Data exists, output the first haplotype. We will assume the haplotypes are
+ // Data exists, output the first haplotype. We will assume the haplotypes are
// numbered by their position in the loc->strings vector.
//
if (d[j]->obshap.size() > 2)
@@ -2576,22 +2576,22 @@ write_beagle_phased(map<int, CSLocus *> &catalog,
fh.close();
mfh.close();
}
- }
+ }
cerr << "done.\n";
return 0;
}
-int
-write_phylip(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_phylip(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
// We want to find loci where each locus is fixed within a population but variable between populations.
//
- // We will write those loci to a Phylip file as defined here:
+ // We will write those loci to a Phylip file as defined here:
// http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
//
string file = out_path + out_prefix + ".phylip";
@@ -2655,7 +2655,7 @@ write_phylip(map<int, CSLocus *> &catalog,
if (phylip_var == false) {
//
- // We are looking for loci that are fixed within each population, but are
+ // We are looking for loci that are fixed within each population, but are
// variable between one or more populations.
//
if (t->nucs[col].fixed == true || t->nucs[col].allele_cnt != 2 || t->nucs[col].pop_cnt < 2)
@@ -2810,16 +2810,16 @@ write_phylip(map<int, CSLocus *> &catalog,
return 0;
}
-int
-write_fullseq_phylip(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+int
+write_fullseq_phylip(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum)
{
//
// We want to write all variable loci in Phylip interleaved format. Polymorphic positions
// will be encoded using IUPAC notation.
//
- // We will write those loci to a Phylip file as defined here:
+ // We will write those loci to a Phylip file as defined here:
// http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
//
string file = out_path + out_prefix + ".fullseq.phylip";
@@ -2914,7 +2914,7 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
len = strlen(id_str);
for (uint j = len; j < 10; j++)
id_str[j] = ' ';
- id_str[9] = '\0';
+ id_str[9] = '\0';
outstrs[i_pop] = string(id_str) + " ";
}
@@ -2923,7 +2923,7 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
int line = 1;
int index = 1;
int cnt = 1;
-
+
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
for (uint pos = 0; pos < it->second.size(); pos++) {
loc = it->second[pos];
@@ -3025,7 +3025,7 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
seq[col] = nuc;
}
-
+
outstrs[j] += string(seq);
}
delete [] seq;
diff --git a/src/file_io.cc b/src/file_io.cc
index 3631769..d54f997 100644
--- a/src/file_io.cc
+++ b/src/file_io.cc
@@ -28,28 +28,28 @@
#include "file_io.h"
-int
+int
open_files(vector<pair<string, string> > &files,
- vector<BarcodePair> &barcodes,
- map<BarcodePair, ofstream *> &pair_1_fhs,
- map<BarcodePair, ofstream *> &pair_2_fhs,
- map<BarcodePair, ofstream *> &rem_1_fhs,
- map<BarcodePair, ofstream *> &rem_2_fhs,
- map<string, map<string, long> > &counters) {
+ vector<BarcodePair> &barcodes,
+ map<BarcodePair, ofstream *> &pair_1_fhs,
+ map<BarcodePair, ofstream *> &pair_2_fhs,
+ map<BarcodePair, ofstream *> &rem_1_fhs,
+ map<BarcodePair, ofstream *> &rem_2_fhs,
+ map<string, map<string, long> > &counters) {
string path, suffix_1, suffix_2, filepath, file;
if (paired) {
- suffix_1 = ".1";
- suffix_2 = ".2";
+ suffix_1 = ".1";
+ suffix_2 = ".2";
}
if (out_file_type == FileT::fastq) {
- suffix_1 += ".fq";
- suffix_2 += ".fq";
+ suffix_1 += ".fq";
+ suffix_2 += ".fq";
} else {
- suffix_1 += ".fa";
- suffix_2 += ".fa";
+ suffix_1 += ".fa";
+ suffix_2 += ".fa";
}
-
+
uint pos;
ofstream *fh;
BarcodePair bc;
@@ -60,261 +60,261 @@ open_files(vector<pair<string, string> > &files,
//
if (barcodes.size() == 0 && merge == false) {
- struct stat sb_1, sb_2;
-
- for (uint i = 0; i < files.size(); i++) {
-
- bc.se = files[i].first;
- if (paired)
- bc.pe = files[i].second;
-
- path = out_path + files[i].first;
-
- //
- // Check that the file has a proper suffix for the output type.
- //
- pos = path.find_last_of(".");
- if (path.substr(pos) == ".bam") {
- path = path.substr(0, pos) + suffix_1;
- } else if (path.substr(pos) == ".gz") {
- path = path.substr(0, pos);
- pos = path.find_last_of(".");
- path = path.substr(0, pos) + suffix_1;
- } else {
- path = path.substr(0, pos) + suffix_1;
- }
-
- if (stat((in_path_1 + files[i].first).c_str(), &sb_1) == -1) {
- cerr << "Unable to stat input file '" << in_path_1 + files[i].first << "'\n";
- exit(1);
- }
- if (stat(path.c_str(), &sb_2) == 0 &&
- sb_2.st_dev == sb_1.st_dev &&
- sb_2.st_ino == sb_1.st_ino) {
- cerr << "Input and output files ('" << path << "') are the same and will cause the input "
- << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
- help();
- }
-
- fh = new ofstream(path.c_str(), ifstream::out);
+ struct stat sb_1, sb_2;
+
+ for (uint i = 0; i < files.size(); i++) {
+
+ bc.se = files[i].first;
+ if (paired)
+ bc.pe = files[i].second;
+
+ path = out_path + files[i].first;
+
+ //
+ // Check that the file has a proper suffix for the output type.
+ //
+ pos = path.find_last_of(".");
+ if (path.substr(pos) == ".bam") {
+ path = path.substr(0, pos) + suffix_1;
+ } else if (path.substr(pos) == ".gz") {
+ path = path.substr(0, pos);
+ pos = path.find_last_of(".");
+ path = path.substr(0, pos) + suffix_1;
+ } else {
+ path = path.substr(0, pos) + suffix_1;
+ }
+
+ if (stat((in_path_1 + files[i].first).c_str(), &sb_1) == -1) {
+ cerr << "Unable to stat input file '" << in_path_1 + files[i].first << "'\n";
+ exit(1);
+ }
+ if (stat(path.c_str(), &sb_2) == 0 &&
+ sb_2.st_dev == sb_1.st_dev &&
+ sb_2.st_ino == sb_1.st_ino) {
+ cerr << "Input and output files ('" << path << "') are the same and will cause the input "
+ << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
+ help();
+ }
+
+ fh = new ofstream(path.c_str(), ifstream::out);
pair_1_fhs[bc] = fh;
- if (pair_1_fhs[bc]->fail()) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
+ if (pair_1_fhs[bc]->fail()) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
if (paired) {
- file = interleaved ? files[i].first : files[i].second;
- path = out_path + file;
-
- pos = path.find_last_of(".");
- if (path.substr(pos) == ".bam") {
- path = path.substr(0, pos) + suffix_2;
- } else if (path.substr(pos) == ".gz") {
- path = path.substr(0, pos);
- pos = path.find_last_of(".");
- path = path.substr(0, pos) + suffix_2;
- } else {
- path = path.substr(0, pos) + suffix_2;
- }
-
- if (stat((in_path_2 + file).c_str(), &sb_1) == -1) {
- cerr << "Unable to stat input file '" << in_path_2 + file << "'\n";
- exit(1);
- }
- if (stat(path.c_str(), &sb_2) == 0 &&
- sb_2.st_dev == sb_1.st_dev &&
- sb_2.st_ino == sb_1.st_ino) {
- cerr << "Input and output file names ('" << path << "') are the same and will cause the input "
- << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
- help();
- }
-
- fh = new ofstream(path.c_str(), ifstream::out);
+ file = interleaved ? files[i].first : files[i].second;
+ path = out_path + file;
+
+ pos = path.find_last_of(".");
+ if (path.substr(pos) == ".bam") {
+ path = path.substr(0, pos) + suffix_2;
+ } else if (path.substr(pos) == ".gz") {
+ path = path.substr(0, pos);
+ pos = path.find_last_of(".");
+ path = path.substr(0, pos) + suffix_2;
+ } else {
+ path = path.substr(0, pos) + suffix_2;
+ }
+
+ if (stat((in_path_2 + file).c_str(), &sb_1) == -1) {
+ cerr << "Unable to stat input file '" << in_path_2 + file << "'\n";
+ exit(1);
+ }
+ if (stat(path.c_str(), &sb_2) == 0 &&
+ sb_2.st_dev == sb_1.st_dev &&
+ sb_2.st_ino == sb_1.st_ino) {
+ cerr << "Input and output file names ('" << path << "') are the same and will cause the input "
+ << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
+ help();
+ }
+
+ fh = new ofstream(path.c_str(), ifstream::out);
pair_2_fhs[bc] = fh;
- if (pair_2_fhs[bc]->fail()) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- filepath = files[i].first;
- pos = filepath.find_last_of(".");
- if (filepath.substr(pos) == ".gz") {
- filepath = filepath.substr(0, pos);
- pos = filepath.find_last_of(".");
- filepath = filepath.substr(0, pos);
- } else if (filepath.substr(pos) == ".bam") {
- filepath = filepath.substr(0, pos);
- }
- path = out_path + filepath + ".rem" + suffix_1;
-
- fh = new ofstream(path.c_str(), ifstream::out);
+ if (pair_2_fhs[bc]->fail()) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ filepath = files[i].first;
+ pos = filepath.find_last_of(".");
+ if (filepath.substr(pos) == ".gz") {
+ filepath = filepath.substr(0, pos);
+ pos = filepath.find_last_of(".");
+ filepath = filepath.substr(0, pos);
+ } else if (filepath.substr(pos) == ".bam") {
+ filepath = filepath.substr(0, pos);
+ }
+ path = out_path + filepath + ".rem" + suffix_1;
+
+ fh = new ofstream(path.c_str(), ifstream::out);
rem_1_fhs[bc] = fh;
- if (rem_1_fhs[bc]->fail()) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
+ if (rem_1_fhs[bc]->fail()) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+
+ filepath = file;
+ pos = filepath.find_last_of(".");
+ if (filepath.substr(pos) == ".gz") {
+ filepath = filepath.substr(0, pos);
+ pos = filepath.find_last_of(".");
+ filepath = filepath.substr(0, pos);
+ } else if (filepath.substr(pos) == ".bam") {
+ filepath = filepath.substr(0, pos);
+ }
+ path = out_path + filepath + ".rem" + suffix_2;
+
+ fh = new ofstream(path.c_str(), ifstream::out);
+ rem_2_fhs[bc] = fh;
+
+ if (rem_2_fhs[bc]->fail()) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+ }
+ }
- filepath = file;
- pos = filepath.find_last_of(".");
- if (filepath.substr(pos) == ".gz") {
- filepath = filepath.substr(0, pos);
- pos = filepath.find_last_of(".");
- filepath = filepath.substr(0, pos);
- } else if (filepath.substr(pos) == ".bam") {
- filepath = filepath.substr(0, pos);
- }
- path = out_path + filepath + ".rem" + suffix_2;
+ return 0;
- fh = new ofstream(path.c_str(), ifstream::out);
- rem_2_fhs[bc] = fh;
+ } else if (barcodes.size() == 0 && merge == true) {
- if (rem_2_fhs[bc]->fail()) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
+ path = out_path + "sample_unbarcoded" + suffix_1;
+ fh = new ofstream(path.c_str(), ifstream::out);
+
+ if (fh->fail()) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ if (paired)
+ bc.pe = files[i].second;
+ pair_1_fhs[bc] = fh;
+ }
+
+ if (paired) {
+ path = out_path + "sample_unbarcoded" + suffix_2;
+ fh = new ofstream(path.c_str(), ifstream::out);
+
+ if (fh->fail()) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ bc.pe = files[i].second;
+ pair_2_fhs[bc] = fh;
}
- }
- return 0;
+ path = out_path + "sample_unbarcoded.rem" + suffix_1;
+ fh = new ofstream(path.c_str(), ifstream::out);
- } else if (barcodes.size() == 0 && merge == true) {
+ if (fh->fail()) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
- path = out_path + "sample_unbarcoded" + suffix_1;
- fh = new ofstream(path.c_str(), ifstream::out);
-
- if (fh->fail()) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- if (paired)
- bc.pe = files[i].second;
- pair_1_fhs[bc] = fh;
- }
-
- if (paired) {
- path = out_path + "sample_unbarcoded" + suffix_2;
- fh = new ofstream(path.c_str(), ifstream::out);
-
- if (fh->fail()) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- bc.pe = files[i].second;
- pair_2_fhs[bc] = fh;
- }
-
- path = out_path + "sample_unbarcoded.rem" + suffix_1;
- fh = new ofstream(path.c_str(), ifstream::out);
-
- if (fh->fail()) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- bc.pe = files[i].second;
- rem_1_fhs[bc] = fh;
- }
-
- path = out_path + "sample_unbarcoded.rem" + suffix_2;
- fh = new ofstream(path.c_str(), ifstream::out);
-
- if (fh->fail()) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- bc.pe = files[i].second;
- rem_2_fhs[bc] = fh;
- }
- }
-
- return 0;
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ bc.pe = files[i].second;
+ rem_1_fhs[bc] = fh;
+ }
+
+ path = out_path + "sample_unbarcoded.rem" + suffix_2;
+ fh = new ofstream(path.c_str(), ifstream::out);
+
+ if (fh->fail()) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ bc.pe = files[i].second;
+ rem_2_fhs[bc] = fh;
+ }
+ }
+
+ return 0;
}
string filename;
for (uint i = 0; i < barcodes.size(); i++) {
- filename = barcodes[i].name_exists() ? barcodes[i].name : "sample_" + barcodes[i].str();
-
- path = out_path + filename + suffix_1;
- fh = new ofstream(path.c_str(), ifstream::out);
- pair_1_fhs[barcodes[i]] = fh;
-
- if (pair_1_fhs[barcodes[i]]->fail()) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- if (paired) {
- path = out_path + filename + suffix_2;
- fh = new ofstream(path.c_str(), ifstream::out);
- pair_2_fhs[barcodes[i]] = fh;
-
- if (pair_2_fhs[barcodes[i]]->fail()) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- path = out_path + filename + ".rem" + suffix_1;
- fh = new ofstream(path.c_str(), ifstream::out);
- rem_1_fhs[barcodes[i]] = fh;
-
- if (rem_1_fhs[barcodes[i]]->fail()) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
-
- path = out_path + filename + ".rem" + suffix_2;
- fh = new ofstream(path.c_str(), ifstream::out);
- rem_2_fhs[barcodes[i]] = fh;
-
- if (rem_2_fhs[barcodes[i]]->fail()) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
- }
+ filename = barcodes[i].name_exists() ? barcodes[i].name : "sample_" + barcodes[i].str();
+
+ path = out_path + filename + suffix_1;
+ fh = new ofstream(path.c_str(), ifstream::out);
+ pair_1_fhs[barcodes[i]] = fh;
+
+ if (pair_1_fhs[barcodes[i]]->fail()) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ if (paired) {
+ path = out_path + filename + suffix_2;
+ fh = new ofstream(path.c_str(), ifstream::out);
+ pair_2_fhs[barcodes[i]] = fh;
+
+ if (pair_2_fhs[barcodes[i]]->fail()) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ path = out_path + filename + ".rem" + suffix_1;
+ fh = new ofstream(path.c_str(), ifstream::out);
+ rem_1_fhs[barcodes[i]] = fh;
+
+ if (rem_1_fhs[barcodes[i]]->fail()) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+
+ path = out_path + filename + ".rem" + suffix_2;
+ fh = new ofstream(path.c_str(), ifstream::out);
+ rem_2_fhs[barcodes[i]] = fh;
+
+ if (rem_2_fhs[barcodes[i]]->fail()) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+ }
}
return 0;
}
-int
+int
open_files(vector<pair<string, string> > &files,
- vector<BarcodePair> &barcodes,
- map<BarcodePair, gzFile *> &pair_1_fhs,
- map<BarcodePair, gzFile *> &pair_2_fhs,
- map<BarcodePair, gzFile *> &rem_1_fhs,
- map<BarcodePair, gzFile *> &rem_2_fhs,
- map<string, map<string, long> > &counters) {
+ vector<BarcodePair> &barcodes,
+ map<BarcodePair, gzFile *> &pair_1_fhs,
+ map<BarcodePair, gzFile *> &pair_2_fhs,
+ map<BarcodePair, gzFile *> &rem_1_fhs,
+ map<BarcodePair, gzFile *> &rem_2_fhs,
+ map<string, map<string, long> > &counters) {
string path, suffix_1, suffix_2, filepath, file;
if (paired) {
- suffix_1 = ".1";
- suffix_2 = ".2";
+ suffix_1 = ".1";
+ suffix_2 = ".2";
}
if (out_file_type == FileT::gzfastq) {
- suffix_1 += ".fq.gz";
- suffix_2 += ".fq.gz";
+ suffix_1 += ".fq.gz";
+ suffix_2 += ".fq.gz";
} else {
- suffix_1 += ".fa.gz";
- suffix_2 += ".fa.gz";
+ suffix_1 += ".fa.gz";
+ suffix_2 += ".fa.gz";
}
-
+
uint pos;
gzFile *fh;
BarcodePair bc;
@@ -325,270 +325,270 @@ open_files(vector<pair<string, string> > &files,
//
if (barcodes.size() == 0 && merge == false) {
- struct stat sb_1, sb_2;
-
- for (uint i = 0; i < files.size(); i++) {
-
- bc.se = files[i].first;
- if (paired)
- bc.pe = files[i].second;
-
- path = out_path + files[i].first;
-
- //
- // Check that the file has a proper suffix for the output type.
- //
- pos = path.find_last_of(".");
- if (path.substr(pos) == ".bam") {
- path = path.substr(0, pos) + suffix_1;
- } else if (path.substr(pos) == ".gz") {
- path = path.substr(0, pos);
- pos = path.find_last_of(".");
- path = path.substr(0, pos) + suffix_1;
- } else {
- path = path.substr(0, pos) + suffix_1;
- }
-
- if (stat((in_path_1 + files[i].first).c_str(), &sb_1) == -1) {
- cerr << "Unable to stat input file '" << in_path_1 + files[i].first << "'\n";
- exit(1);
- }
- if (stat(path.c_str(), &sb_2) == 0 &&
- sb_2.st_dev == sb_1.st_dev &&
- sb_2.st_ino == sb_1.st_ino) {
- cerr << "Input and output files ('" << path << "') are the same and will cause the input "
- << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
- help();
- }
-
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
+ struct stat sb_1, sb_2;
+
+ for (uint i = 0; i < files.size(); i++) {
+
+ bc.se = files[i].first;
+ if (paired)
+ bc.pe = files[i].second;
+
+ path = out_path + files[i].first;
+
+ //
+ // Check that the file has a proper suffix for the output type.
+ //
+ pos = path.find_last_of(".");
+ if (path.substr(pos) == ".bam") {
+ path = path.substr(0, pos) + suffix_1;
+ } else if (path.substr(pos) == ".gz") {
+ path = path.substr(0, pos);
+ pos = path.find_last_of(".");
+ path = path.substr(0, pos) + suffix_1;
+ } else {
+ path = path.substr(0, pos) + suffix_1;
+ }
+
+ if (stat((in_path_1 + files[i].first).c_str(), &sb_1) == -1) {
+ cerr << "Unable to stat input file '" << in_path_1 + files[i].first << "'\n";
+ exit(1);
+ }
+ if (stat(path.c_str(), &sb_2) == 0 &&
+ sb_2.st_dev == sb_1.st_dev &&
+ sb_2.st_ino == sb_1.st_ino) {
+ cerr << "Input and output files ('" << path << "') are the same and will cause the input "
+ << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
+ help();
+ }
+
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
pair_1_fhs[bc] = fh;
- if (!(*fh)) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
+ if (!(*fh)) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
if (paired) {
- file = interleaved ? files[i].first : files[i].second;
- path = out_path + file;
-
- pos = path.find_last_of(".");
- if (path.substr(pos) == ".bam") {
- path.replace(pos, 4, suffix_2);
- } else if (path.substr(pos) == ".gz") {
- path = path.substr(0, pos);
- pos = path.find_last_of(".");
- path = path.substr(0, pos) + suffix_2;
- } else {
- path = path.substr(0, pos) + suffix_2;
- }
-
- if (stat((in_path_2 + file).c_str(), &sb_1) == -1) {
- cerr << "Unable to stat input file '" << in_path_2 + file << "'\n";
- exit(1);
- }
- if (stat(path.c_str(), &sb_2) == 0 &&
- sb_2.st_dev == sb_1.st_dev &&
- sb_2.st_ino == sb_1.st_ino) {
- cerr << "Input and output file names ('" << path << "') are the same and will cause the input "
- << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
- help();
- }
-
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
- pair_2_fhs[bc] = fh;
-
- if (!(*fh)) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- filepath = files[i].first;
- pos = filepath.find_last_of(".");
- if (filepath.substr(pos) == ".gz") {
- filepath = filepath.substr(0, pos);
- pos = filepath.find_last_of(".");
- filepath = filepath.substr(0, pos);
- } else if (filepath.substr(pos) == ".bam") {
- filepath = filepath.substr(0, pos);
- }
- path = out_path + filepath + ".rem" + suffix_1;
-
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
+ file = interleaved ? files[i].first : files[i].second;
+ path = out_path + file;
+
+ pos = path.find_last_of(".");
+ if (path.substr(pos) == ".bam") {
+ path.replace(pos, 4, suffix_2);
+ } else if (path.substr(pos) == ".gz") {
+ path = path.substr(0, pos);
+ pos = path.find_last_of(".");
+ path = path.substr(0, pos) + suffix_2;
+ } else {
+ path = path.substr(0, pos) + suffix_2;
+ }
+
+ if (stat((in_path_2 + file).c_str(), &sb_1) == -1) {
+ cerr << "Unable to stat input file '" << in_path_2 + file << "'\n";
+ exit(1);
+ }
+ if (stat(path.c_str(), &sb_2) == 0 &&
+ sb_2.st_dev == sb_1.st_dev &&
+ sb_2.st_ino == sb_1.st_ino) {
+ cerr << "Input and output file names ('" << path << "') are the same and will cause the input "
+ << "file to be overwritten. Please specify a separate output directory using '-o'.\n";
+ help();
+ }
+
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+ pair_2_fhs[bc] = fh;
+
+ if (!(*fh)) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ filepath = files[i].first;
+ pos = filepath.find_last_of(".");
+ if (filepath.substr(pos) == ".gz") {
+ filepath = filepath.substr(0, pos);
+ pos = filepath.find_last_of(".");
+ filepath = filepath.substr(0, pos);
+ } else if (filepath.substr(pos) == ".bam") {
+ filepath = filepath.substr(0, pos);
+ }
+ path = out_path + filepath + ".rem" + suffix_1;
+
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
rem_1_fhs[bc] = fh;
- if (!*(fh)) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
-
- filepath = file;
- pos = filepath.find_last_of(".");
- if (filepath.substr(pos) == ".gz") {
- filepath = filepath.substr(0, pos);
- pos = filepath.find_last_of(".");
- filepath = filepath.substr(0, pos);
- } else if (filepath.substr(pos) == ".bam") {
- filepath = filepath.substr(0, pos);
- }
- path = out_path + filepath + ".rem" + suffix_2;
-
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
- rem_2_fhs[bc] = fh;
-
- if (!(*fh)) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
- }
- }
-
- return 0;
+ if (!*(fh)) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+
+ filepath = file;
+ pos = filepath.find_last_of(".");
+ if (filepath.substr(pos) == ".gz") {
+ filepath = filepath.substr(0, pos);
+ pos = filepath.find_last_of(".");
+ filepath = filepath.substr(0, pos);
+ } else if (filepath.substr(pos) == ".bam") {
+ filepath = filepath.substr(0, pos);
+ }
+ path = out_path + filepath + ".rem" + suffix_2;
+
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+ rem_2_fhs[bc] = fh;
+
+ if (!(*fh)) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+ }
+ }
+
+ return 0;
} else if (barcodes.size() == 0 && merge == true) {
- path = out_path + "sample_unbarcoded" + suffix_1;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
-
- if (!(*fh)) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- if (paired)
- bc.pe = files[i].second;
- pair_1_fhs[bc] = fh;
- }
-
- if (paired) {
- path = out_path + "sample_unbarcoded" + suffix_2;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
-
- if (!(*fh)) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- bc.pe = files[i].second;
- pair_2_fhs[bc] = fh;
- }
-
- path = out_path + "sample_unbarcoded.rem" + suffix_1;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
-
- if (!(*fh)) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- bc.pe = files[i].second;
- rem_1_fhs[bc] = fh;
- }
-
- path = out_path + "sample_unbarcoded.rem" + suffix_2;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
-
- if (!(*fh)) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
-
- for (uint i = 0; i < files.size(); i++) {
- bc.se = files[i].first;
- bc.pe = files[i].second;
- rem_2_fhs[bc] = fh;
- }
- }
-
- return 0;
+ path = out_path + "sample_unbarcoded" + suffix_1;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+
+ if (!(*fh)) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ if (paired)
+ bc.pe = files[i].second;
+ pair_1_fhs[bc] = fh;
+ }
+
+ if (paired) {
+ path = out_path + "sample_unbarcoded" + suffix_2;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+
+ if (!(*fh)) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ bc.pe = files[i].second;
+ pair_2_fhs[bc] = fh;
+ }
+
+ path = out_path + "sample_unbarcoded.rem" + suffix_1;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+
+ if (!(*fh)) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ bc.pe = files[i].second;
+ rem_1_fhs[bc] = fh;
+ }
+
+ path = out_path + "sample_unbarcoded.rem" + suffix_2;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+
+ if (!(*fh)) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ bc.se = files[i].first;
+ bc.pe = files[i].second;
+ rem_2_fhs[bc] = fh;
+ }
+ }
+
+ return 0;
}
string filename;
for (uint i = 0; i < barcodes.size(); i++) {
- filename = barcodes[i].name_exists() ? barcodes[i].name : "sample_" + barcodes[i].str();
-
- path = out_path + filename + suffix_1;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
- pair_1_fhs[barcodes[i]] = fh;
-
- if (!(*pair_1_fhs[barcodes[i]])) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- if (paired) {
- path = out_path + filename + suffix_2;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
- pair_2_fhs[barcodes[i]] = fh;
-
- if (!(*pair_2_fhs[barcodes[i]])) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
- }
-
- path = out_path + filename + ".rem" + suffix_1;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
- rem_1_fhs[barcodes[i]] = fh;
-
- if (!(*rem_1_fhs[barcodes[i]])) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
-
- path = out_path + filename + ".rem" + suffix_2;
- fh = new gzFile;
- *fh = gzopen(path.c_str(), "wb");
- rem_2_fhs[barcodes[i]] = fh;
-
- if (!(*rem_2_fhs[barcodes[i]])) {
- cerr << "Error opening remainder output file '" << path << "'\n";
- exit(1);
- }
- }
+ filename = barcodes[i].name_exists() ? barcodes[i].name : "sample_" + barcodes[i].str();
+
+ path = out_path + filename + suffix_1;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+ pair_1_fhs[barcodes[i]] = fh;
+
+ if (!(*pair_1_fhs[barcodes[i]])) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ if (paired) {
+ path = out_path + filename + suffix_2;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+ pair_2_fhs[barcodes[i]] = fh;
+
+ if (!(*pair_2_fhs[barcodes[i]])) {
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
+ }
+
+ path = out_path + filename + ".rem" + suffix_1;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+ rem_1_fhs[barcodes[i]] = fh;
+
+ if (!(*rem_1_fhs[barcodes[i]])) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+
+ path = out_path + filename + ".rem" + suffix_2;
+ fh = new gzFile;
+ *fh = gzopen(path.c_str(), "wb");
+ rem_2_fhs[barcodes[i]] = fh;
+
+ if (!(*rem_2_fhs[barcodes[i]])) {
+ cerr << "Error opening remainder output file '" << path << "'\n";
+ exit(1);
+ }
+ }
}
return 0;
}
-int
-close_file_handles(map<BarcodePair, ofstream *> &fhs)
+int
+close_file_handles(map<BarcodePair, ofstream *> &fhs)
{
map<BarcodePair, ofstream*>::iterator i;
set<ofstream*> ptrs;
set<ofstream*>::iterator j;
for (i = fhs.begin(); i != fhs.end(); i++) {
- i->second->close();
- ptrs.insert(i->second);
+ i->second->close();
+ ptrs.insert(i->second);
}
for (j = ptrs.begin(); j != ptrs.end(); j++) {
- delete *j;
+ delete *j;
}
return 0;
}
-int
+int
close_file_handles(map<BarcodePair, gzFile *> &fhs)
{
map<BarcodePair, gzFile *>::iterator i;
@@ -596,65 +596,65 @@ close_file_handles(map<BarcodePair, gzFile *> &fhs)
set<gzFile *>::iterator j;
for (i = fhs.begin(); i != fhs.end(); i++) {
- gzclose(*(i->second));
- ptrs.insert(i->second);
+ gzclose(*(i->second));
+ ptrs.insert(i->second);
}
for (j = ptrs.begin(); j != ptrs.end(); j++) {
- delete *j;
+ delete *j;
}
return 0;
}
-int
-load_barcodes(string barcode_file, vector<BarcodePair> &barcodes,
- set<string> &se_bc, set<string> &pe_bc,
- uint &min_se_len, uint &max_se_len,
- uint &min_pe_len, uint &max_pe_len)
+int
+load_barcodes(string barcode_file, vector<BarcodePair> &barcodes,
+ set<string> &se_bc, set<string> &pe_bc,
+ uint &min_se_len, uint &max_se_len,
+ uint &min_pe_len, uint &max_pe_len)
{
switch(barcode_type) {
case null_null:
- cerr << "Barcode type unspecified, assuming unbarcoded data.\n";
- break;
+ cerr << "Barcode type unspecified, assuming unbarcoded data.\n";
+ break;
case null_index:
- cerr << "Searching for single, index barcode.\n";
- break;
+ cerr << "Searching for single, index barcode.\n";
+ break;
case index_null:
- cerr << "Searching for single-end, indexed barcodes.\n";
- break;
+ cerr << "Searching for single-end, indexed barcodes.\n";
+ break;
case inline_null:
- cerr << "Searching for single-end, inlined barcodes.\n";
- break;
+ cerr << "Searching for single-end, inlined barcodes.\n";
+ break;
case index_index:
- cerr << "Searching for single and paired-end, indexed barcodes.\n";
- break;
+ cerr << "Searching for single and paired-end, indexed barcodes.\n";
+ break;
case inline_inline:
- cerr << "Searching for single and paired-end, inlined barcodes.\n";
- break;
+ cerr << "Searching for single and paired-end, inlined barcodes.\n";
+ break;
case inline_index:
- if (paired)
- cerr << "Searching for single-end, inlined and paired-end, indexed barcodes.\n";
- else
- cerr << "Searching for single-end inlined and indexed barcodes.\n";
- break;
+ if (paired)
+ cerr << "Searching for single-end, inlined and paired-end, indexed barcodes.\n";
+ else
+ cerr << "Searching for single-end inlined and indexed barcodes.\n";
+ break;
case index_inline:
- if (paired)
- cerr << "Searching for single-end, indexed and paired-end, inlined barcodes.\n";
- else
- cerr << "Searching for single-end, indexed and inlined barcodes.\n";
- break;
+ if (paired)
+ cerr << "Searching for single-end, indexed and paired-end, inlined barcodes.\n";
+ else
+ cerr << "Searching for single-end, indexed and inlined barcodes.\n";
+ break;
}
if (barcode_file.length() == 0)
- return 0;
+ return 0;
char line[id_len];
ifstream fh(barcode_file.c_str(), ifstream::in);
if (fh.fail()) {
cerr << "Error opening barcode file '" << barcode_file << "'\n";
- exit(1);
+ exit(1);
}
char *p, *q, *r, *s;
@@ -662,140 +662,140 @@ load_barcodes(string barcode_file, vector<BarcodePair> &barcodes,
uint cols, line_num = 0;
while (fh.good()) {
- memset(line, 0, id_len);
- fh.getline(line, id_len);
- line_num++;
-
- if (strlen(line) == 0) continue;
-
- //
- // Check that the proper number of columns exist.
- //
- cols = 1;
- for (p = line; *p != '\0'; p++) if (*p == '\t') cols++;
-
- if (cols > 2 &&
- (barcode_type == inline_null || barcode_type == index_null)) {
- cerr << "Too many columns (" << cols << ") specified in '" << barcode_file << "' for single-end barcodes on line " << line_num << ".\n";
- exit(1);
- } else if (cols > 3) {
- cerr << "Too many columns (" << cols << ") specified in '" << barcode_file << "' on line " << line_num << ".\n";
- exit(1);
- }
-
- //
- // Identify the first barcode and check that it's legitimate.
- //
- p = line;
- q = p;
- while (*q != '\0') {
- switch (*q) {
- case 'A':
- case 'C':
- case 'G':
- case 'T':
- break;
- case 'a':
- *q = 'A';
- break;
- case 'c':
- *q = 'C';
- break;
- case 'g':
- *q = 'G';
- break;
- case 't':
- *q = 'T';
- break;
- case '\r':
- case '\t':
- *q = '\0';
- break;
- default:
- cerr << "Invalid barcode on line " << line_num << ": '" << p << "'\n";
- exit(1);
- }
- if (*q != '\0') q++;
- }
-
- //
- // If a second barcode was specified on the command line, identify it and check that it's legitimate.
- //
- r = NULL;
- if (barcode_type == inline_inline ||
- barcode_type == inline_index ||
- barcode_type == index_inline ||
- barcode_type == index_index) {
-
- if (q - p < id_len)
- q++;
- r = q;
- while (*q != '\0') {
- switch (*q) {
- case 'A':
- case 'C':
- case 'G':
- case 'T':
- break;
- case 'a':
- *q = 'A';
- break;
- case 'c':
- *q = 'C';
- break;
- case 'g':
- *q = 'G';
- break;
- case 't':
- *q = 'T';
- break;
- case '\r':
- case '\t':
- *q = '\0';
- break;
- default:
- cerr << "Invalid barcode on line " << line_num << ": '" << r << "'\n";
- exit(1);
- }
- if (*q != '\0') q++;
- }
- }
-
- //
- // Check for the existence of a file name to associate with this barcode set.
- //
- if (q - p < id_len)
- q++;
- s = q;
- while (*q != '\0') {
- if (!isalnum(*q)) {
- switch (*q) {
- case '-':
- case '_':
+ memset(line, 0, id_len);
+ fh.getline(line, id_len);
+ line_num++;
+
+ if (strlen(line) == 0) continue;
+
+ //
+ // Check that the proper number of columns exist.
+ //
+ cols = 1;
+ for (p = line; *p != '\0'; p++) if (*p == '\t') cols++;
+
+ if (cols > 2 &&
+ (barcode_type == inline_null || barcode_type == index_null)) {
+ cerr << "Too many columns (" << cols << ") specified in '" << barcode_file << "' for single-end barcodes on line " << line_num << ".\n";
+ exit(1);
+ } else if (cols > 3) {
+ cerr << "Too many columns (" << cols << ") specified in '" << barcode_file << "' on line " << line_num << ".\n";
+ exit(1);
+ }
+
+ //
+ // Identify the first barcode and check that it's legitimate.
+ //
+ p = line;
+ q = p;
+ while (*q != '\0') {
+ switch (*q) {
+ case 'A':
+ case 'C':
+ case 'G':
+ case 'T':
+ break;
+ case 'a':
+ *q = 'A';
+ break;
+ case 'c':
+ *q = 'C';
+ break;
+ case 'g':
+ *q = 'G';
+ break;
+ case 't':
+ *q = 'T';
+ break;
+ case '\r':
+ case '\t':
+ *q = '\0';
+ break;
+ default:
+ cerr << "Invalid barcode on line " << line_num << ": '" << p << "'\n";
+ exit(1);
+ }
+ if (*q != '\0') q++;
+ }
+
+ //
+ // If a second barcode was specified on the command line, identify it and check that it's legitimate.
+ //
+ r = NULL;
+ if (barcode_type == inline_inline ||
+ barcode_type == inline_index ||
+ barcode_type == index_inline ||
+ barcode_type == index_index) {
+
+ if (q - p < id_len)
+ q++;
+ r = q;
+ while (*q != '\0') {
+ switch (*q) {
+ case 'A':
+ case 'C':
+ case 'G':
+ case 'T':
+ break;
+ case 'a':
+ *q = 'A';
+ break;
+ case 'c':
+ *q = 'C';
+ break;
+ case 'g':
+ *q = 'G';
+ break;
+ case 't':
+ *q = 'T';
+ break;
+ case '\r':
+ case '\t':
+ *q = '\0';
+ break;
+ default:
+ cerr << "Invalid barcode on line " << line_num << ": '" << r << "'\n";
+ exit(1);
+ }
+ if (*q != '\0') q++;
+ }
+ }
+
+ //
+ // Check for the existence of a file name to associate with this barcode set.
+ //
+ if (q - p < id_len)
+ q++;
+ s = q;
+ while (*q != '\0') {
+ if (!isalnum(*q)) {
+ switch (*q) {
+ case '-':
+ case '_':
case '.':
- break;
- case '\r':
- case '\t':
- *q = '\0';
- break;
- default:
- cerr << "Invalid filename on line " << line_num << ": '" << s << "' (filenames can consist of letters, numbers, '.', '-' and '_').\n";
- exit(1);
- }
- }
- if (*q != '\0') q++;
- }
-
- barcodes.push_back(BarcodePair(p, r, s));
- if (p != NULL && strlen(p) > 0) se_bc.insert(string(p));
- if (r != NULL && strlen(r) > 0) pe_bc.insert(string(r));
+ break;
+ case '\r':
+ case '\t':
+ *q = '\0';
+ break;
+ default:
+ cerr << "Invalid filename on line " << line_num << ": '" << s << "' (filenames can consist of letters, numbers, '.', '-' and '_').\n";
+ exit(1);
+ }
+ }
+ if (*q != '\0') q++;
+ }
+
+ barcodes.push_back(BarcodePair(p, r, s));
+ if (p != NULL && strlen(p) > 0) se_bc.insert(string(p));
+ if (r != NULL && strlen(r) > 0) pe_bc.insert(string(r));
}
fh.close();
if (barcodes.size() == 0) {
- cerr << "Unable to load any barcodes from '" << barcode_file << "'\n";
- help();
+ cerr << "Unable to load any barcodes from '" << barcode_file << "'\n";
+ help();
}
//
@@ -804,13 +804,13 @@ load_barcodes(string barcode_file, vector<BarcodePair> &barcodes,
int pe_cnt = 0;
int se_cnt = 0;
for (uint i = 0; i < barcodes.size(); i++) {
- se_cnt += (barcodes[i].se.length() > 0) ? 1 : 0;
- pe_cnt += (barcodes[i].pe.length() > 0) ? 1 : 0;
+ se_cnt += (barcodes[i].se.length() > 0) ? 1 : 0;
+ pe_cnt += (barcodes[i].pe.length() > 0) ? 1 : 0;
}
if (pe_cnt > 0 && se_cnt != pe_cnt) {
- cerr << "Single and paired-end barcodes must be properly paired.\n";
- help();
+ cerr << "Single and paired-end barcodes must be properly paired.\n";
+ help();
}
//
@@ -820,9 +820,9 @@ load_barcodes(string barcode_file, vector<BarcodePair> &barcodes,
max_se_len = min_se_len;
for (uint i = 1; i < barcodes.size(); i++) {
if (barcodes[i].se.length() < min_se_len)
- min_se_len = barcodes[i].se.length();
- else if (barcodes[i].se.length() > max_se_len)
- max_se_len = barcodes[i].se.length();
+ min_se_len = barcodes[i].se.length();
+ else if (barcodes[i].se.length() > max_se_len)
+ max_se_len = barcodes[i].se.length();
}
//
@@ -832,144 +832,145 @@ load_barcodes(string barcode_file, vector<BarcodePair> &barcodes,
max_pe_len = min_pe_len;
for (uint i = 0; i < barcodes.size(); i++) {
if (barcodes[i].pe.length() < min_pe_len)
- min_pe_len = barcodes[i].pe.length();
- else if (barcodes[i].pe.length() > max_pe_len)
- max_pe_len = barcodes[i].pe.length();
+ min_pe_len = barcodes[i].pe.length();
+ else if (barcodes[i].pe.length() > max_pe_len)
+ max_pe_len = barcodes[i].pe.length();
}
//
- // If paired barcodes were supplied check that a paired barcode type was
+ // If paired barcodes were supplied check that a paired barcode type was
// specified and vice versa.
//
if (se_bc.size() > 0 && pe_bc.size() > 0) {
- if (barcode_type != inline_inline &&
- barcode_type != index_index &&
- barcode_type != inline_index &&
- barcode_type != index_inline) {
- cerr << "You provided paried barcodes but did not specify a paired barcode type.\n";
- help();
- }
+ if (barcode_type != inline_inline &&
+ barcode_type != index_index &&
+ barcode_type != inline_index &&
+ barcode_type != index_inline) {
+ cerr << "You provided paried barcodes but did not specify a paired barcode type.\n";
+ help();
+ }
} else {
- if (barcode_type != inline_null &&
- barcode_type != index_null) {
- cerr << "You provided single-end barcodes but did not specify a single-end barcode type.\n";
- help();
- }
+ if (barcode_type != inline_null &&
+ barcode_type != index_null) {
+ cerr << "You provided single-end barcodes but did not specify a single-end barcode type.\n";
+ help();
+ }
}
cerr << "Loaded " << barcodes.size() << " barcodes ";
if (pe_bc.size() > 0) {
- if (min_se_len != max_se_len)
- cerr << "(" << min_se_len << "-" << max_se_len << "bp / ";
- else
- cerr << "(" << max_se_len << "bp / ";
- if (min_pe_len != max_pe_len)
- cerr << min_pe_len << "-" << max_pe_len << "bp).\n";
- else
- cerr << max_pe_len << "bp).\n";
+ if (min_se_len != max_se_len)
+ cerr << "(" << min_se_len << "-" << max_se_len << "bp / ";
+ else
+ cerr << "(" << max_se_len << "bp / ";
+ if (min_pe_len != max_pe_len)
+ cerr << min_pe_len << "-" << max_pe_len << "bp).\n";
+ else
+ cerr << max_pe_len << "bp).\n";
} else {
- if (min_se_len != max_se_len)
- cerr << "(" << min_se_len << "-" << max_se_len << "bp).\n";
- else
- cerr << "(" << max_se_len << "bp).\n";
+ if (min_se_len != max_se_len)
+ cerr << "(" << min_se_len << "-" << max_se_len << "bp).\n";
+ else
+ cerr << "(" << max_se_len << "bp).\n";
}
return 0;
}
-int
-build_file_list(vector<pair<string, string> > &files)
+int
+build_file_list(vector<pair<string, string> > &files)
{
//
// Scan a directory for a list of files.
//
if (in_path_1.length() > 0) {
- string file, paired_file;
- const char *p, *q, *end;
- struct dirent *direntry;
-
- DIR *dir = opendir(in_path_1.c_str());
-
- if (dir == NULL) {
- cerr << "Unable to open directory '" << in_path_1 << "' for reading.\n";
- exit(1);
- }
-
- while ((direntry = readdir(dir)) != NULL) {
- file = direntry->d_name;
-
- if (file.substr(0, 1) == ".")
- continue;
-
- //
- // Check the file suffix to make sure we should process it.
- //
- p = file.c_str();
- q = p + file.length() + 1;
- end = q;
- while (q >= p && *q != '.') q--;
- if (strcmp(q, ".gz") == 0) {
- end = q;
- while (q >= p && *q != '.') q--;
- }
- if (strncmp(q, ".fq", end - q) != 0 &&
- strncmp(q, ".fa", end - q) != 0 &&
- strncmp(q, ".fastq", end - q) != 0 &&
- strncmp(q, ".fasta", end - q) != 0 &&
- strncmp(q, ".bam", end - q) != 0)
- continue;
-
- //
- // If paired-end specified, parse file names to sort out which is which.
- //
- if (paired && interleaved == false) {
- int res;
-
- if ((res = parse_illumina_v1(file.c_str())) > 0 ||
- (res = parse_illumina_v2(file.c_str())) > 0) {
- paired_file = file;
- paired_file.replace(res, 1, "2");
- files.push_back(make_pair(file, paired_file));
- }
- } else {
- files.push_back(make_pair(file, ""));
- }
- }
-
- if (files.size() == 0) {
- cerr << "Unable to locate any input files to process within '" << in_path_1 << "'\n";
- }
+ string file, paired_file;
+ const char *p, *q, *end;
+ struct dirent *direntry;
+
+ DIR *dir = opendir(in_path_1.c_str());
+
+ if (dir == NULL) {
+ cerr << "Unable to open directory '" << in_path_1 << "' for reading.\n";
+ exit(1);
+ }
+
+ while ((direntry = readdir(dir)) != NULL) {
+ file = direntry->d_name;
+
+ if (file.substr(0, 1) == ".")
+ continue;
+
+ //
+ // Check the file suffix to make sure we should process it.
+ //
+ p = file.c_str();
+ q = p + file.length() + 1;
+ end = q;
+ while (q >= p && *q != '.') q--;
+ if (strcmp(q, ".gz") == 0) {
+ end = q;
+ while (q >= p && *q != '.') q--;
+ }
+ if (strncmp(q, ".fq", end - q) != 0 &&
+ strncmp(q, ".fa", end - q) != 0 &&
+ strncmp(q, ".fastq", end - q) != 0 &&
+ strncmp(q, ".fasta", end - q) != 0 &&
+ strncmp(q, ".bam", end - q) != 0 &&
+ (file.length()<9 || file.substr(file.length()-9) != "_qseq.txt"))
+ continue;
+
+ //
+ // If paired-end specified, parse file names to sort out which is which.
+ //
+ if (paired && interleaved == false) {
+ int res;
+
+ if ((res = parse_illumina_v1(file.c_str())) > 0 ||
+ (res = parse_illumina_v2(file.c_str())) > 0) {
+ paired_file = file;
+ paired_file.replace(res, 1, "2");
+ files.push_back(make_pair(file, paired_file));
+ }
+ } else {
+ files.push_back(make_pair(file, ""));
+ }
+ }
+
+ if (files.size() == 0) {
+ cerr << "Unable to locate any input files to process within '" << in_path_1 << "'\n";
+ }
} else {
- //
- // Files specified directly:
- // Break off file path and store path and file name.
- //
- if (paired && interleaved == false) {
- int pos_1 = in_file_p1.find_last_of("/");
- in_path_1 = in_file_p1.substr(0, pos_1 + 1);
- int pos_2 = in_file_p2.find_last_of("/");
- in_path_2 = in_file_p2.substr(0, pos_2 + 1);
- files.push_back(make_pair(in_file_p1.substr(pos_1+1), in_file_p2.substr(pos_2+1)));
- } else if (paired && interleaved == true) {
- int pos = in_file.find_last_of("/");
- in_path_1 = in_file.substr(0, pos + 1);
- in_path_2 = in_path_1;
- files.push_back(make_pair(in_file.substr(pos+1), ""));
- } else {
- int pos = in_file.find_last_of("/");
- in_path_1 = in_file.substr(0, pos + 1);
- files.push_back(make_pair(in_file.substr(pos+1), ""));
- }
+ //
+ // Files specified directly:
+ // Break off file path and store path and file name.
+ //
+ if (paired && interleaved == false) {
+ int pos_1 = in_file_p1.find_last_of("/");
+ in_path_1 = in_file_p1.substr(0, pos_1 + 1);
+ int pos_2 = in_file_p2.find_last_of("/");
+ in_path_2 = in_file_p2.substr(0, pos_2 + 1);
+ files.push_back(make_pair(in_file_p1.substr(pos_1+1), in_file_p2.substr(pos_2+1)));
+ } else if (paired && interleaved == true) {
+ int pos = in_file.find_last_of("/");
+ in_path_1 = in_file.substr(0, pos + 1);
+ in_path_2 = in_path_1;
+ files.push_back(make_pair(in_file.substr(pos+1), ""));
+ } else {
+ int pos = in_file.find_last_of("/");
+ in_path_1 = in_file.substr(0, pos + 1);
+ files.push_back(make_pair(in_file.substr(pos+1), ""));
+ }
}
- cerr << "Found " << files.size();
+ cerr << "Found " << files.size();
if (paired && interleaved)
- cerr << " interleaved, paired input file(s).\n";
+ cerr << " interleaved, paired input file(s).\n";
else if (paired)
- cerr << " paired input file(s).\n";
- else
- cerr << " input file(s).\n";
+ cerr << " paired input file(s).\n";
+ else
+ cerr << " input file(s).\n";
return 0;
}
@@ -978,12 +979,12 @@ string
remove_suffix(FileT type, string file)
{
int pos = file.find_last_of(".");
-
+
if ((type == FileT::gzfastq || type == FileT::gzfasta) && file.substr(pos) == ".gz")
- file = file.substr(0, pos);
+ file = file.substr(0, pos);
pos = file.find_last_of(".");
-
+
if (type == FileT::gzfastq || type == FileT::fastq) {
if (file.substr(pos) == ".fastq" || file.substr(pos) == ".fq")
diff --git a/src/file_io.h b/src/file_io.h
index 580ec2c..95518a6 100644
--- a/src/file_io.h
+++ b/src/file_io.h
@@ -73,20 +73,20 @@ extern string in_path_2;
void help( void );
int build_file_list(vector<pair<string, string> > &);
-int load_barcodes(string, vector<BarcodePair> &,
- set<string> &, set<string> &,
+int load_barcodes(string, vector<BarcodePair> &,
+ set<string> &, set<string> &,
uint &, uint &, uint &, uint &);
int open_files(vector<pair<string, string> > &,
- vector<BarcodePair> &,
- map<BarcodePair, ofstream *> &,
- map<BarcodePair, ofstream *> &,
+ vector<BarcodePair> &,
+ map<BarcodePair, ofstream *> &,
+ map<BarcodePair, ofstream *> &,
map<BarcodePair, ofstream *> &,
map<BarcodePair, ofstream *> &,
map<string, map<string, long> > &);
int open_files(vector<pair<string, string> > &,
- vector<BarcodePair> &,
- map<BarcodePair, gzFile *> &,
- map<BarcodePair, gzFile *> &,
+ vector<BarcodePair> &,
+ map<BarcodePair, gzFile *> &,
+ map<BarcodePair, gzFile *> &,
map<BarcodePair, gzFile *> &,
map<BarcodePair, gzFile *> &,
map<string, map<string, long> > &);
diff --git a/src/genotypes.cc b/src/genotypes.cc
index d160159..b1e06cf 100644
--- a/src/genotypes.cc
+++ b/src/genotypes.cc
@@ -30,7 +30,7 @@
#include "genotypes.h"
-extern int **encoded_gtypes;
+extern int encoded_gtypes[4][4];
// Global variables to hold command-line options.
int num_threads = 1;
@@ -94,12 +94,12 @@ int main (int argc, char* argv[]) {
}
if (wl_file.length() > 0) {
- load_marker_list(wl_file, whitelist);
- cerr << "Loaded " << whitelist.size() << " whitelisted markers.\n";
+ load_marker_list(wl_file, whitelist);
+ cerr << "Loaded " << whitelist.size() << " whitelisted markers.\n";
}
if (bl_file.length() > 0) {
- load_marker_list(bl_file, blacklist);
- cerr << "Loaded " << blacklist.size() << " blacklisted markers.\n";
+ load_marker_list(bl_file, blacklist);
+ cerr << "Loaded " << blacklist.size() << " blacklisted markers.\n";
}
//
@@ -111,8 +111,8 @@ int main (int argc, char* argv[]) {
int res;
catalog_file << in_path << "batch_" << batch_id << ".catalog";
if ((res = load_loci(catalog_file.str(), catalog, false, false, compressed)) == 0) {
- cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
- return 0;
+ cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
+ return 0;
}
//
@@ -197,30 +197,30 @@ int main (int argc, char* argv[]) {
CSLocus *loc;
double mean, cnt;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->locus(loc->id);
+ loc = it->second;
+ d = pmap->locus(loc->id);
- if (loc->marker.length() > 0) {
- create_genotype_map(loc, pmap, parent_ids);
- call_population_genotypes(loc, pmap, global_dictionary);
- }
+ if (loc->marker.length() > 0) {
+ create_genotype_map(loc, pmap, parent_ids);
+ call_population_genotypes(loc, pmap, global_dictionary);
+ }
- mean = 0.0;
- cnt = 0.0;
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL)
- continue;
- mean += d[i]->lnl;
- cnt++;
- }
- loc->lnl = mean / cnt;
+ mean = 0.0;
+ cnt = 0.0;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL)
+ continue;
+ mean += d[i]->lnl;
+ cnt++;
+ }
+ loc->lnl = mean / cnt;
}
//
// Make automated corrections
- //
+ //
if (corrections)
- automated_corrections(samples, parent_ids, catalog, catalog_matches, pmap);
+ automated_corrections(samples, parent_ids, catalog, catalog_matches, pmap);
//
// Check markers for potentially missing alleles.
@@ -228,28 +228,28 @@ int main (int argc, char* argv[]) {
switch(map_type) {
case cp:
correct_cp_markers_missing_alleles(parent_ids, catalog, pmap);
- break;
+ break;
case dh:
case bc1:
case f2:
case gen:
case none:
case unk:
- break;
+ break;
}
//
- // Reassign genotypes according to specific map type, record any
+ // Reassign genotypes according to specific map type, record any
// marker corrections made by detecting missing alleles.
//
if (map_type != gen)
- map_specific_genotypes(catalog, pmap, parent_ids);
-
+ map_specific_genotypes(catalog, pmap, parent_ids);
+
//
// Incorporate manual corrections exported from a Stacks SQL database.
//
if (man_corrections)
- manual_corrections(cor_path, pmap);
+ manual_corrections(cor_path, pmap);
//
// Calculate segregation distortion using chi-squared test.
@@ -257,8 +257,8 @@ int main (int argc, char* argv[]) {
map<string, map<string, double> > seg_ratios;
if (map_type != gen) {
- load_segregation_ratios(map_type, seg_ratios);
- calc_segregation_distortion(seg_ratios, catalog, pmap, parent_ids);
+ load_segregation_ratios(map_type, seg_ratios);
+ calc_segregation_distortion(seg_ratios, catalog, pmap, parent_ids);
}
//
@@ -266,27 +266,27 @@ int main (int argc, char* argv[]) {
//
switch(map_type) {
case dh:
- export_dh_map(catalog, pmap, parent_ids, samples);
- break;
+ export_dh_map(catalog, pmap, parent_ids, samples);
+ break;
case cp:
- export_cp_map(catalog, pmap, parent_ids, samples);
- break;
+ export_cp_map(catalog, pmap, parent_ids, samples);
+ break;
case bc1:
- export_bc1_map(catalog, pmap, parent_ids, samples);
- break;
+ export_bc1_map(catalog, pmap, parent_ids, samples);
+ break;
case f2:
- export_f2_map(catalog, pmap, parent_ids, samples);
- break;
+ export_f2_map(catalog, pmap, parent_ids, samples);
+ break;
case gen:
- export_gen_map(catalog, pmap, parent_ids, samples);
- break;
+ export_gen_map(catalog, pmap, parent_ids, samples);
+ break;
case none:
case unk:
- break;
+ break;
}
if (sql_out)
- write_sql(catalog, pmap, parent_ids);
+ write_sql(catalog, pmap, parent_ids);
//
// Output the observed haplotypes.
@@ -294,14 +294,14 @@ int main (int argc, char* argv[]) {
write_generic(catalog, pmap, samples, parent_ids, false);
if (out_type == genomic)
- write_genomic(catalog, pmap);
+ write_genomic(catalog, pmap);
return 0;
}
int
-apply_locus_constraints(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap)
+apply_locus_constraints(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap)
{
CSLocus *loc;
Datum **d;
@@ -313,37 +313,37 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
map<int, CSLocus *>::iterator it;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->locus(loc->id);
-
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- //
- // Check that each sample is over the minimum stack depth for this locus.
- //
- if (d[i] != NULL &&
- d[i]->tot_depth < min_stack_depth) {
- below_stack_dep++;
- delete d[i];
- d[i] = NULL;
- }
-
- //
- // Check that each sample is over the log likelihood threshold.
- //
- if (d[i] != NULL &&
- filter_lnl &&
- d[i]->lnl < lnl_limit) {
- below_lnl_thresh++;
- delete d[i];
- d[i] = NULL;
- }
- }
- }
-
- if (min_stack_depth > 0)
- cerr << "Removed " << below_stack_dep << " samples from loci that are below the minimum stack depth of " << min_stack_depth << "x\n";
+ loc = it->second;
+ d = pmap->locus(loc->id);
+
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ //
+ // Check that each sample is over the minimum stack depth for this locus.
+ //
+ if (d[i] != NULL &&
+ d[i]->tot_depth < min_stack_depth) {
+ below_stack_dep++;
+ delete d[i];
+ d[i] = NULL;
+ }
+
+ //
+ // Check that each sample is over the log likelihood threshold.
+ //
+ if (d[i] != NULL &&
+ filter_lnl &&
+ d[i]->lnl < lnl_limit) {
+ below_lnl_thresh++;
+ delete d[i];
+ d[i] = NULL;
+ }
+ }
+ }
+
+ if (min_stack_depth > 0)
+ cerr << "Removed " << below_stack_dep << " samples from loci that are below the minimum stack depth of " << min_stack_depth << "x\n";
if (filter_lnl)
- cerr << "Removed " << below_lnl_thresh << " samples from loci that are below the log likelihood threshold of " << lnl_limit << "\n";
+ cerr << "Removed " << below_lnl_thresh << " samples from loci that are below the log likelihood threshold of " << lnl_limit << "\n";
return 0;
}
@@ -359,11 +359,11 @@ int identify_parental_ids(map<int, CSLocus *> &catalog, vector<int> &sample_ids,
// or more crosses. These are listed in the catalog.tags.tsv file, column 8.
//
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- for (uint i = 0; i < loc->comp.size(); i++) {
- sample_id = (int) strtol(loc->comp[i], NULL, 10);
- catalog_parents.insert(sample_id);
- }
+ loc = it->second;
+ for (uint i = 0; i < loc->comp.size(); i++) {
+ sample_id = (int) strtol(loc->comp[i], NULL, 10);
+ catalog_parents.insert(sample_id);
+ }
}
//
@@ -371,14 +371,14 @@ int identify_parental_ids(map<int, CSLocus *> &catalog, vector<int> &sample_ids,
// searching the Stacks directory and crosscheck those found in the catalog.
//
for (uint i = 0; i < sample_ids.size(); i++) {
- if (catalog_parents.count(sample_ids[i]) > 0)
- parents.insert(sample_ids[i]);
+ if (catalog_parents.count(sample_ids[i]) > 0)
+ parents.insert(sample_ids[i]);
}
set<int>::iterator sit;
cerr << "Identified parent IDs: ";
for (sit = parents.begin(); sit != parents.end(); sit++)
- cerr << *sit << " ";
+ cerr << *sit << " ";
cerr << "\n";
return 0;
@@ -395,82 +395,82 @@ int find_markers(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &
if (parent_ids.size() > 2) return 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- //
- // Count the number of parental tags matching this catalog tag. A proper marker should
- // contain a single representative from each parent; multiple alleles must be called from
- // a single tag from a single parent.
- //
- if (parent_ids.size() == 1) {
- p = parent_ids.begin();
- pid_1 = *p;
- pid_2 = -1;
- d_1 = pmap->blacklisted(loc->id, pid_1) ? NULL : pmap->datum(loc->id, pid_1);
- d_2 = NULL;
- } else {
- p = parent_ids.begin();
- q = p++;
-
- pid_1 = *p < *q ? *p : *q;
- pid_2 = *p < *q ? *q : *p;
- if (pmap->blacklisted(loc->id, pid_1) ||
- pmap->blacklisted(loc->id, pid_2)) {
- d_1 = NULL;
- d_2 = NULL;
- } else {
- d_1 = pmap->datum(loc->id, pid_1);
- d_2 = pmap->datum(loc->id, pid_2);
- }
- }
-
- parent_count = 0;
- if (d_1 != NULL) parent_count++;
- if (d_2 != NULL) parent_count++;
-
- //
- // Locus is present in both parents.
- //
- if (parent_count == 2) {
- allele_cnt_1 = d_1->obshap.size();
- allele_cnt_2 = d_2->obshap.size();
+ loc = it->second;
+ //
+ // Count the number of parental tags matching this catalog tag. A proper marker should
+ // contain a single representative from each parent; multiple alleles must be called from
+ // a single tag from a single parent.
+ //
+ if (parent_ids.size() == 1) {
+ p = parent_ids.begin();
+ pid_1 = *p;
+ pid_2 = -1;
+ d_1 = pmap->blacklisted(loc->id, pid_1) ? NULL : pmap->datum(loc->id, pid_1);
+ d_2 = NULL;
+ } else {
+ p = parent_ids.begin();
+ q = p++;
+
+ pid_1 = *p < *q ? *p : *q;
+ pid_2 = *p < *q ? *q : *p;
+ if (pmap->blacklisted(loc->id, pid_1) ||
+ pmap->blacklisted(loc->id, pid_2)) {
+ d_1 = NULL;
+ d_2 = NULL;
+ } else {
+ d_1 = pmap->datum(loc->id, pid_1);
+ d_2 = pmap->datum(loc->id, pid_2);
+ }
+ }
+
+ parent_count = 0;
+ if (d_1 != NULL) parent_count++;
+ if (d_2 != NULL) parent_count++;
+
+ //
+ // Locus is present in both parents.
+ //
+ if (parent_count == 2) {
+ allele_cnt_1 = d_1->obshap.size();
+ allele_cnt_2 = d_2->obshap.size();
//
// Determine the number of unique alleles
//
- set<string> unique_alleles;
+ set<string> unique_alleles;
- for (hit = d_1->obshap.begin(); hit != d_1->obshap.end(); hit++)
+ for (hit = d_1->obshap.begin(); hit != d_1->obshap.end(); hit++)
unique_alleles.insert(*hit);
- for (hit = d_2->obshap.begin(); hit != d_2->obshap.end(); hit++)
+ for (hit = d_2->obshap.begin(); hit != d_2->obshap.end(); hit++)
unique_alleles.insert(*hit);
int num_unique_alleles = unique_alleles.size();
- //
- // Locus is heterozygous in both parents. However, the number of alleles present distinguishes
- // what type of marker it is. Four unique alleles requries an ab/cd marker, while four
- // alleles that are the same in both parents requires an ab/ab marker. Finally, three unique
+ //
+ // Locus is heterozygous in both parents. However, the number of alleles present distinguishes
+ // what type of marker it is. Four unique alleles requries an ab/cd marker, while four
+ // alleles that are the same in both parents requires an ab/ab marker. Finally, three unique
// alleles requires either an ab/ac marker.
- //
- if (allele_cnt_1 == 2 && allele_cnt_2 == 2) {
+ //
+ if (allele_cnt_1 == 2 && allele_cnt_2 == 2) {
if (num_unique_alleles == 3)
loc->marker = "ab/ac";
- else if (num_unique_alleles == 2)
+ else if (num_unique_alleles == 2)
loc->marker = "ab/ab";
- else
+ else
loc->marker = "ab/cd";
- //
- // Locus is homozygous in one parent and heterozygous in the other.
- //
- } else if (allele_cnt_1 == 2 && allele_cnt_2 == 1) {
+ //
+ // Locus is homozygous in one parent and heterozygous in the other.
+ //
+ } else if (allele_cnt_1 == 2 && allele_cnt_2 == 1) {
if (num_unique_alleles == 3)
loc->marker = "ab/cc";
else if (num_unique_alleles == 2)
loc->marker = "ab/aa";
- //
- // Locus is homozygous in one parent and heterozygous in the other.
- //
- } else if (allele_cnt_1 == 1 && allele_cnt_2 == 2) {
+ //
+ // Locus is homozygous in one parent and heterozygous in the other.
+ //
+ } else if (allele_cnt_1 == 1 && allele_cnt_2 == 2) {
if (num_unique_alleles == 3)
loc->marker = "cc/ab";
@@ -479,20 +479,20 @@ int find_markers(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &
//
// Locus is homozygous in both parents, but heterozygous between parents.
//
- } else if (allele_cnt_1 == 1 && allele_cnt_2 == 1) {
+ } else if (allele_cnt_1 == 1 && allele_cnt_2 == 1) {
- if (strcmp(d_1->obshap[0], d_2->obshap[0]) != 0)
+ if (strcmp(d_1->obshap[0], d_2->obshap[0]) != 0)
loc->marker = "aa/bb";
- }
+ }
//
// Locus only exists in one parent.
- //
- } else if (parent_count == 1) {
- if (d_1 != NULL && d_1->obshap.size() == 2)
- loc->marker = "ab/--";
- else if (d_2 != NULL && d_2->obshap.size() == 2)
- loc->marker = "--/ab";
- }
+ //
+ } else if (parent_count == 1) {
+ if (d_1 != NULL && d_1->obshap.size() == 2)
+ loc->marker = "ab/--";
+ else if (d_2 != NULL && d_2->obshap.size() == 2)
+ loc->marker = "--/ab";
+ }
}
return 0;
@@ -505,46 +505,46 @@ int calculate_f(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &p
CSLocus *loc;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->locus(loc->id);
+ loc = it->second;
+ d = pmap->locus(loc->id);
- if (loc->snps.size() == 0) continue;
+ if (loc->snps.size() == 0) continue;
- double tot = 0.0;
- double hets = 0;
- double p, q, h, h0;
- map<char, int> alle;
+ double tot = 0.0;
+ double hets = 0;
+ double p, q, h, h0;
+ map<char, int> alle;
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL)
- continue;
- if (parent_ids.count(pmap->rev_sample_index(i)))
- continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL)
+ continue;
+ if (parent_ids.count(pmap->rev_sample_index(i)))
+ continue;
- tot++;
+ tot++;
- if (d[i]->obshap.size() > 1) hets++;
+ if (d[i]->obshap.size() > 1) hets++;
- //
- // We are measuring the first SNP in the haplotype
- //
- for (uint j = 0; j < d[i]->obshap.size(); j++)
- alle[d[i]->obshap[j][0]]++;
- }
+ //
+ // We are measuring the first SNP in the haplotype
+ //
+ for (uint j = 0; j < d[i]->obshap.size(); j++)
+ alle[d[i]->obshap[j][0]]++;
+ }
- if (alle.size() > 2 || tot == 0)
- continue;
+ if (alle.size() > 2 || tot == 0)
+ continue;
- j = alle.begin();
- p = j->second;
- j++;
- q = j->second;
- h = hets / tot; // Observed frequency of heterozygotes in the population
- h0 = 2 * (p/tot) * (q/tot); // 2PQ, expected frequency of hets under Hardy-Weinberg
- if (h0 > 0)
- loc->f = (h0 - h) / h0;
+ j = alle.begin();
+ p = j->second;
+ j++;
+ q = j->second;
+ h = hets / tot; // Observed frequency of heterozygotes in the population
+ h0 = 2 * (p/tot) * (q/tot); // 2PQ, expected frequency of hets under Hardy-Weinberg
+ if (h0 > 0)
+ loc->f = (h0 - h) / h0;
- //cerr << "P: " << p << "; Q: " << q << "; Hets: " << hets << "; total: " << tot << "; f: " << loc->f << "\n";
+ //cerr << "P: " << p << "; Q: " << q << "; Hets: " << hets << "; total: " << tot << "; f: " << loc->f << "\n";
}
return 0;
@@ -553,9 +553,9 @@ int calculate_f(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &p
int create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &parent_ids) {
//
// Create a genotype map. For any set of alleles, this routine will
- // assign each allele to one of the constituent genotypes, e.g. given the
+ // assign each allele to one of the constituent genotypes, e.g. given the
// marker type 'aaxbb' and the alleles 'A' from the male, and 'G'
- // from the female, will assign 'G' == 'bb' and 'A'== 'aa'. It assumes that
+ // from the female, will assign 'G' == 'bb' and 'A'== 'aa'. It assumes that
// recombination may have occurred as with an F2, F3 or later cross.
//
//cerr << "Creating genotype map for catalog ID " << locus->id << ", marker: " << locus->marker << ".\n";
@@ -579,9 +579,9 @@ int create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &parent_
p2_gtypes.insert(locus->marker[3]);
p2_gtypes.insert(locus->marker[4]);
for (i = p1_gtypes.begin(); i != p1_gtypes.end(); i++)
- if (*i != '-') legal_gtypes[*i]++;
+ if (*i != '-') legal_gtypes[*i]++;
for (i = p2_gtypes.begin(); i != p2_gtypes.end(); i++)
- if (*i != '-') legal_gtypes[*i]++;
+ if (*i != '-') legal_gtypes[*i]++;
//
// Find the common genotypes
//
@@ -589,7 +589,7 @@ int create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &parent_
map<char, int>::iterator j;
for (j = legal_gtypes.begin(); j != legal_gtypes.end(); j++)
- if (j->second > 1) types.push_back(j->first);
+ if (j->second > 1) types.push_back(j->first);
sort(types.begin(), types.end());
Datum *d_1, *d_2;
@@ -600,140 +600,140 @@ int create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &parent_
d_2 = pmap->datum(locus->id, pid_2);
if (d_1 != NULL) {
- for (uint n = 0; n < d_1->obshap.size(); n++)
- haplotypes[d_1->obshap[n]]++;
+ for (uint n = 0; n < d_1->obshap.size(); n++)
+ haplotypes[d_1->obshap[n]]++;
}
if (d_2 != NULL) {
- for (uint n = 0; n < d_2->obshap.size(); n++)
- haplotypes[d_2->obshap[n]]++;
+ for (uint n = 0; n < d_2->obshap.size(); n++)
+ haplotypes[d_2->obshap[n]]++;
}
- //
+ //
// Sort the haplotypes map by value
//
for (k = haplotypes.begin(); k != haplotypes.end(); k++)
- sorted_haplotypes.push_back(*k);
+ sorted_haplotypes.push_back(*k);
sort(sorted_haplotypes.begin(), sorted_haplotypes.end(), hap_compare);
for (uint n = 0, index = 0; n < sorted_haplotypes.size() && index < types.size(); n++, index++) {
- if (sorted_haplotypes[n].second > 1) {
- locus->gmap[sorted_haplotypes[n].first] = types[index];
- com_gtypes[types[index]]++;
- // cerr << " Assigning common allele " << sorted_haplotypes[n].first << " to genotype '" << locus->gmap[sorted_haplotypes[n].first] << "'\n";
- }
+ if (sorted_haplotypes[n].second > 1) {
+ locus->gmap[sorted_haplotypes[n].first] = types[index];
+ com_gtypes[types[index]]++;
+ // cerr << " Assigning common allele " << sorted_haplotypes[n].first << " to genotype '" << locus->gmap[sorted_haplotypes[n].first] << "'\n";
+ }
}
//
// Now, examine the remaining first parent alleles.
//
if (d_1 != NULL) {
- legal_gtypes.clear();
- for (i = p1_gtypes.begin(); i != p1_gtypes.end(); i++)
- if (*i != '-' && com_gtypes.count(*i) == 0) {
- // cerr << " Adding " << *i << " to first parent genotypes\n";
- legal_gtypes[*i]++;
- }
- types.clear();
- for (j = legal_gtypes.begin(); j != legal_gtypes.end(); j++)
- types.push_back(j->first);
- sort(types.begin(), types.end());
-
- for (uint n = 0, index = 0; n < d_1->obshap.size() && index < types.size(); n++, index++) {
- if (locus->gmap.count(d_1->obshap[n])) {
- index--;
- continue;
- }
- locus->gmap[d_1->obshap[n]] = types[index];
- // cerr << " Assinging '" << d_1->obshap[n] << "' to first parent genotype '" << locus->gmap[d_1->obshap[n]] << "'\n";
- }
+ legal_gtypes.clear();
+ for (i = p1_gtypes.begin(); i != p1_gtypes.end(); i++)
+ if (*i != '-' && com_gtypes.count(*i) == 0) {
+ // cerr << " Adding " << *i << " to first parent genotypes\n";
+ legal_gtypes[*i]++;
+ }
+ types.clear();
+ for (j = legal_gtypes.begin(); j != legal_gtypes.end(); j++)
+ types.push_back(j->first);
+ sort(types.begin(), types.end());
+
+ for (uint n = 0, index = 0; n < d_1->obshap.size() && index < types.size(); n++, index++) {
+ if (locus->gmap.count(d_1->obshap[n])) {
+ index--;
+ continue;
+ }
+ locus->gmap[d_1->obshap[n]] = types[index];
+ // cerr << " Assinging '" << d_1->obshap[n] << "' to first parent genotype '" << locus->gmap[d_1->obshap[n]] << "'\n";
+ }
}
//
// Finally, repeat in the second parent.
//
if (d_2 != NULL) {
- legal_gtypes.clear();
- for (i = p2_gtypes.begin(); i != p2_gtypes.end(); i++)
- if (*i != '-' && com_gtypes.count(*i) == 0) {
- // cerr << " Adding " << *i << " to second genotypes\n";
- legal_gtypes[*i]++;
- }
- types.clear();
- for (j = legal_gtypes.begin(); j != legal_gtypes.end(); j++)
- types.push_back(j->first);
- sort(types.begin(), types.end());
-
- for (uint n = 0, index = 0; n < d_2->obshap.size() && index < types.size(); n++, index++) {
- if (locus->gmap.count(d_2->obshap[n])) {
- index--;
- continue;
- }
- locus->gmap[d_2->obshap[n]] = types[index];
- // cerr << " Assinging '" << d_2->obshap[n] << "' to second parent genotype '" << locus->gmap[d_2->obshap[n]] << "'\n";
- }
+ legal_gtypes.clear();
+ for (i = p2_gtypes.begin(); i != p2_gtypes.end(); i++)
+ if (*i != '-' && com_gtypes.count(*i) == 0) {
+ // cerr << " Adding " << *i << " to second genotypes\n";
+ legal_gtypes[*i]++;
+ }
+ types.clear();
+ for (j = legal_gtypes.begin(); j != legal_gtypes.end(); j++)
+ types.push_back(j->first);
+ sort(types.begin(), types.end());
+
+ for (uint n = 0, index = 0; n < d_2->obshap.size() && index < types.size(); n++, index++) {
+ if (locus->gmap.count(d_2->obshap[n])) {
+ index--;
+ continue;
+ }
+ locus->gmap[d_2->obshap[n]] = types[index];
+ // cerr << " Assinging '" << d_2->obshap[n] << "' to second parent genotype '" << locus->gmap[d_2->obshap[n]] << "'\n";
+ }
}
return 0;
}
-int call_population_genotypes(CSLocus *locus,
- PopMap<CSLocus> *pmap,
- map<string, map<string, string> > &dictionary) {
+int call_population_genotypes(CSLocus *locus,
+ PopMap<CSLocus> *pmap,
+ map<string, map<string, string> > &dictionary) {
//
// Fetch the array of observed haplotypes from the population
//
Datum **d = pmap->locus(locus->id);
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL)
- continue;
-
- vector<string> gtypes;
- string gtype;
+ if (d[i] == NULL)
+ continue;
- //cerr << "Sample Id: " << pmap->rev_sample_index(i) << "\n";
+ vector<string> gtypes;
+ string gtype;
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- //
- // Impossible allele encountered.
- //
- if (locus->gmap.count(d[i]->obshap[j]) == 0) {
- gtypes.clear();
- gtypes.push_back("-");
- goto impossible;
- }
+ //cerr << "Sample Id: " << pmap->rev_sample_index(i) << "\n";
- gtypes.push_back(locus->gmap[d[i]->obshap[j]]);
- //cerr << " Observed Haplotype: " << d[i]->obshap[j] << ", Genotype: " << locus->gmap[d[i]->obshap[j]] << "\n";
- }
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ //
+ // Impossible allele encountered.
+ //
+ if (locus->gmap.count(d[i]->obshap[j]) == 0) {
+ gtypes.clear();
+ gtypes.push_back("-");
+ goto impossible;
+ }
+
+ gtypes.push_back(locus->gmap[d[i]->obshap[j]]);
+ //cerr << " Observed Haplotype: " << d[i]->obshap[j] << ", Genotype: " << locus->gmap[d[i]->obshap[j]] << "\n";
+ }
impossible:
- sort(gtypes.begin(), gtypes.end());
- for (uint j = 0; j < gtypes.size(); j++) {
- gtype += gtypes[j];
- //cerr << " Adding genotype to string: " << gtypes[j] << "; " << gtype << "\n";
- }
+ sort(gtypes.begin(), gtypes.end());
+ for (uint j = 0; j < gtypes.size(); j++) {
+ gtype += gtypes[j];
+ //cerr << " Adding genotype to string: " << gtypes[j] << "; " << gtype << "\n";
+ }
- string m = dictionary[locus->marker].count(gtype) ?
- dictionary[locus->marker][gtype] :
- "--";
+ string m = dictionary[locus->marker].count(gtype) ?
+ dictionary[locus->marker][gtype] :
+ "--";
- if (d[i]->gtype != NULL)
- delete d[i]->gtype;
+ if (d[i]->gtype != NULL)
+ delete d[i]->gtype;
- d[i]->gtype = new char[m.length() + 1];
- strcpy(d[i]->gtype, m.c_str());
+ d[i]->gtype = new char[m.length() + 1];
+ strcpy(d[i]->gtype, m.c_str());
- if (m != "--")
- locus->gcnt++;
+ if (m != "--")
+ locus->gcnt++;
- //cerr << "Assigning datum, marker: " << locus->marker << ", string: " << m << ", haplotype: " << d[i]->obshap[0] << ", gtype: " << gtype << "\n";
+ //cerr << "Assigning datum, marker: " << locus->marker << ", string: " << m << ", haplotype: " << d[i]->obshap[0] << ", gtype: " << gtype << "\n";
}
return 0;
}
-int
+int
correct_cp_markers_missing_alleles(set<int> &parent_ids, map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
{
map<int, CSLocus *>::iterator it;
@@ -749,7 +749,7 @@ correct_cp_markers_missing_alleles(set<int> &parent_ids, map<int, CSLocus *> &ca
seg_ratios["ab/aa"]["aa"] = 0.50;
seg_ratios["ab/aa"]["ab"] = 0.25;
seg_ratios["ab/aa"]["bb"] = 0.25;
-
+
seg_ratios["aa/ab"]["aa"] = 0.50;
seg_ratios["aa/ab"]["ab"] = 0.25;
seg_ratios["aa/ab"]["bb"] = 0.25;
@@ -768,85 +768,85 @@ correct_cp_markers_missing_alleles(set<int> &parent_ids, map<int, CSLocus *> &ca
int corrected = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- //
- // We only want to examine markers where one parent is homozygous.
- //
- if (loc->marker != "ab/aa" &&
- loc->marker != "aa/ab" &&
- loc->marker != "ab/cc" &&
- loc->marker != "cc/ab") continue;
+ //
+ // We only want to examine markers where one parent is homozygous.
+ //
+ if (loc->marker != "ab/aa" &&
+ loc->marker != "aa/ab" &&
+ loc->marker != "ab/cc" &&
+ loc->marker != "cc/ab") continue;
- map<string, int> cnts;
+ map<string, int> cnts;
- //
- // Calculate initial segregation distortion.
- //
- double n = tally_generic_gtypes(loc->id, pmap, parent_ids, cnts);
- double chisq_pval = chisq_test(seg_ratios, cnts, loc->marker, n);
+ //
+ // Calculate initial segregation distortion.
+ //
+ double n = tally_generic_gtypes(loc->id, pmap, parent_ids, cnts);
+ double chisq_pval = chisq_test(seg_ratios, cnts, loc->marker, n);
- //
- // Check if our genotype ratios match the segregation ratios specified above. If so,
- // we have a dropped allele in one of the parents.
- //
- if (n == 0 || chisq_pval < chisq_pval_limit)
- continue;
+ //
+ // Check if our genotype ratios match the segregation ratios specified above. If so,
+ // we have a dropped allele in one of the parents.
+ //
+ if (n == 0 || chisq_pval < chisq_pval_limit)
+ continue;
- corrected++;
+ corrected++;
- if (loc->marker == "ab/aa")
- loc->marker = "ab/a-";
- else if (loc->marker == "aa/ab")
- loc->marker = "-a/ab";
- else if (loc->marker == "ab/cc")
- loc->marker = "ab/c-";
- else if (loc->marker == "cc/ab")
- loc->marker = "-c/ab";
+ if (loc->marker == "ab/aa")
+ loc->marker = "ab/a-";
+ else if (loc->marker == "aa/ab")
+ loc->marker = "-a/ab";
+ else if (loc->marker == "ab/cc")
+ loc->marker = "ab/c-";
+ else if (loc->marker == "cc/ab")
+ loc->marker = "-c/ab";
- d = pmap->locus(loc->id);
+ d = pmap->locus(loc->id);
- if (loc->marker == "ab/a-" || loc->marker == "-a/ab") {
+ if (loc->marker == "ab/a-" || loc->marker == "-a/ab") {
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (strcmp(d[i]->gtype, "bb") == 0)
- strcpy(d[i]->gtype, "ab");
- }
+ if (strcmp(d[i]->gtype, "bb") == 0)
+ strcpy(d[i]->gtype, "ab");
+ }
- } else if (loc->marker == "ab/c-") {
+ } else if (loc->marker == "ab/c-") {
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (strcmp(d[i]->gtype, "bb") == 0)
- strcpy(d[i]->gtype, "bd");
- else if (strcmp(d[i]->gtype, "aa") == 0)
- strcpy(d[i]->gtype, "ad");
- }
+ if (strcmp(d[i]->gtype, "bb") == 0)
+ strcpy(d[i]->gtype, "bd");
+ else if (strcmp(d[i]->gtype, "aa") == 0)
+ strcpy(d[i]->gtype, "ad");
+ }
- } else if (loc->marker == "-c/ab") {
+ } else if (loc->marker == "-c/ab") {
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (strcmp(d[i]->gtype, "bb") == 0)
- strcpy(d[i]->gtype, "ad");
- else if (strcmp(d[i]->gtype, "aa") == 0)
- strcpy(d[i]->gtype, "ac");
- else if (strcmp(d[i]->gtype, "bc") == 0)
- strcpy(d[i]->gtype, "bd");
- else if (strcmp(d[i]->gtype, "ac") == 0)
- strcpy(d[i]->gtype, "bc");
- }
- }
+ if (strcmp(d[i]->gtype, "bb") == 0)
+ strcpy(d[i]->gtype, "ad");
+ else if (strcmp(d[i]->gtype, "aa") == 0)
+ strcpy(d[i]->gtype, "ac");
+ else if (strcmp(d[i]->gtype, "bc") == 0)
+ strcpy(d[i]->gtype, "bd");
+ else if (strcmp(d[i]->gtype, "ac") == 0)
+ strcpy(d[i]->gtype, "bc");
+ }
+ }
}
//
@@ -865,51 +865,51 @@ correct_cp_markers_missing_alleles(set<int> &parent_ids, map<int, CSLocus *> &ca
seg_ratio_2["aa/bb"]["ab"] = 0.50;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (loc->marker != "aa/bb") continue;
+ if (loc->marker != "aa/bb") continue;
- map<string, int> cnts;
+ map<string, int> cnts;
- double n = tally_generic_gtypes(loc->id, pmap, parent_ids, cnts);
- double chisq_pval = chisq_test(seg_ratio_1, cnts, loc->marker, n);
+ double n = tally_generic_gtypes(loc->id, pmap, parent_ids, cnts);
+ double chisq_pval = chisq_test(seg_ratio_1, cnts, loc->marker, n);
- if (n == 0) continue;
+ if (n == 0) continue;
- if (chisq_pval >= chisq_pval_limit) {
- corrected++;
+ if (chisq_pval >= chisq_pval_limit) {
+ corrected++;
- loc->marker = "aa/b-";
+ loc->marker = "aa/b-";
- d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ d = pmap->locus(loc->id);
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (strcmp(d[i]->gtype, "ab") == 0)
- strcpy(d[i]->gtype, "bb");
- }
+ if (strcmp(d[i]->gtype, "ab") == 0)
+ strcpy(d[i]->gtype, "bb");
+ }
- } else {
- chisq_pval = chisq_test(seg_ratio_2, cnts, loc->marker, n);
+ } else {
+ chisq_pval = chisq_test(seg_ratio_2, cnts, loc->marker, n);
- if (chisq_pval >= chisq_pval_limit) {
- corrected++;
- loc->marker = "-a/bb";
+ if (chisq_pval >= chisq_pval_limit) {
+ corrected++;
+ loc->marker = "-a/bb";
- d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ d = pmap->locus(loc->id);
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (strcmp(d[i]->gtype, "ab") == 0)
- strcpy(d[i]->gtype, "aa");
- }
+ if (strcmp(d[i]->gtype, "ab") == 0)
+ strcpy(d[i]->gtype, "aa");
+ }
- }
- }
+ }
+ }
}
cerr << "corrected " << corrected << " catalog loci.\n";
@@ -917,9 +917,9 @@ correct_cp_markers_missing_alleles(set<int> &parent_ids, map<int, CSLocus *> &ca
return 0;
}
-int
+int
automated_corrections(map<int, string> &samples, set<int> &parent_ids, map<int, CSLocus *> &catalog,
- vector<vector<CatMatch *> > &matches, PopMap<CSLocus> *pmap)
+ vector<vector<CatMatch *> > &matches, PopMap<CSLocus> *pmap)
{
int sample_id, catalog_id, tag_id;
Datum *d;
@@ -929,54 +929,54 @@ automated_corrections(map<int, string> &samples, set<int> &parent_ids, map<int,
cerr << "Performing automated corrections...\n";
for (uint i = 0; i < matches.size(); i++) {
- sample_id = matches[i][0]->sample_id;
- file = samples[sample_id];
+ sample_id = matches[i][0]->sample_id;
+ file = samples[sample_id];
- //if (sample_id != 29) continue;
- if (parent_ids.count(sample_id)) continue;
+ //if (sample_id != 29) continue;
+ if (parent_ids.count(sample_id)) continue;
- map<int, Locus *> stacks;
- bool compressed = false;
- int res;
- if ((res = load_loci(in_path + file, stacks, true, false, compressed)) == 0) {
- cerr << "Unable to load sample file '" << file << "'\n";
- return 0;
- }
+ map<int, Locus *> stacks;
+ bool compressed = false;
+ int res;
+ if ((res = load_loci(in_path + file, stacks, true, false, compressed)) == 0) {
+ cerr << "Unable to load sample file '" << file << "'\n";
+ return 0;
+ }
- set<pair<int, int> > processed;
+ set<pair<int, int> > processed;
- for (uint j = 0; j < matches[i].size(); j++) {
- catalog_id = matches[i][j]->cat_id;
- sample_id = matches[i][j]->sample_id;
- tag_id = matches[i][j]->tag_id;
+ for (uint j = 0; j < matches[i].size(); j++) {
+ catalog_id = matches[i][j]->cat_id;
+ sample_id = matches[i][j]->sample_id;
+ tag_id = matches[i][j]->tag_id;
- if (catalog.count(catalog_id) == 0) continue;
+ if (catalog.count(catalog_id) == 0) continue;
- //
- // There are multiple matches per stack, but we only need to process
- // each stack once to make corrections.
- //
- if (processed.count(make_pair(catalog_id, tag_id)) == 0 &&
- catalog[catalog_id]->marker.length() > 0) {
- d = pmap->datum(catalog_id, sample_id);
+ //
+ // There are multiple matches per stack, but we only need to process
+ // each stack once to make corrections.
+ //
+ if (processed.count(make_pair(catalog_id, tag_id)) == 0 &&
+ catalog[catalog_id]->marker.length() > 0) {
+ d = pmap->datum(catalog_id, sample_id);
- //cerr << "Accessing catalog ID " << catalog_id << "; sample: " << sample_id << "; marker: " << catalog[catalog_id]->marker << ": d: " << d << "; gtype: " << d->gtype << "\n";
+ //cerr << "Accessing catalog ID " << catalog_id << "; sample: " << sample_id << "; marker: " << catalog[catalog_id]->marker << ": d: " << d << "; gtype: " << d->gtype << "\n";
- if (d != NULL && strcmp(d->gtype, "--") != 0) {
- s = stacks[tag_id];
- check_uncalled_snps(catalog[catalog_id], s, d);
- }
+ if (d != NULL && strcmp(d->gtype, "--") != 0) {
+ s = stacks[tag_id];
+ check_uncalled_snps(catalog[catalog_id], s, d);
+ }
- processed.insert(make_pair(catalog_id, tag_id));
- }
- }
+ processed.insert(make_pair(catalog_id, tag_id));
+ }
+ }
- //
- // Free up memory
- //
- map<int, Locus *>::iterator it;
- for (it = stacks.begin(); it != stacks.end(); it++)
- delete it->second;
+ //
+ // Free up memory
+ //
+ map<int, Locus *>::iterator it;
+ for (it = stacks.begin(); it != stacks.end(); it++)
+ delete it->second;
}
//
@@ -990,47 +990,47 @@ automated_corrections(map<int, string> &samples, set<int> &parent_ids, map<int,
int markers = 0;
map<int, CSLocus *>::iterator it;
for (it = catalog.begin(); it != catalog.end(); it++) {
- if (it->second->marker.length() == 0) continue;
- markers++;
+ if (it->second->marker.length() == 0) continue;
+ markers++;
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- sample_id = pmap->rev_sample_index(j);
- if (parent_ids.count(sample_id)) continue;
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ sample_id = pmap->rev_sample_index(j);
+ if (parent_ids.count(sample_id)) continue;
- d = pmap->datum(it->first, sample_id);
+ d = pmap->datum(it->first, sample_id);
- pot_gen++;
+ pot_gen++;
- if (d == NULL) continue;
+ if (d == NULL) continue;
- tot_gen++;
+ tot_gen++;
- if (d->corrected == true) {
- tot_cor++;
+ if (d->corrected == true) {
+ tot_cor++;
- if (strcmp(d->gtype, "--") == 0)
- rem_cor++;
- else
- het_cor++;
- }
- }
+ if (strcmp(d->gtype, "--") == 0)
+ rem_cor++;
+ else
+ het_cor++;
+ }
+ }
}
- cerr << pot_gen << " potential genotypes in " << markers << " markers, "
- << tot_gen << " populated; "
- << tot_cor << " corrected, "
- << het_cor << " converted to heterozygotes, "
- << rem_cor << " unsupported homozygotes removed.\n";
+ cerr << pot_gen << " potential genotypes in " << markers << " markers, "
+ << tot_gen << " populated; "
+ << tot_cor << " corrected, "
+ << het_cor << " converted to heterozygotes, "
+ << rem_cor << " unsupported homozygotes removed.\n";
return 0;
}
int check_uncalled_snps(CSLocus *clocus, Locus *stack, Datum *d) {
//
- // If this locus is already known to be multi-allele, return, we only want
+ // If this locus is already known to be multi-allele, return, we only want
// to verify uncalled SNPs.
//
if (strlen(d->gtype) > 1 && d->gtype[0] != d->gtype[1])
- return 0;
+ return 0;
//cerr << "Catalog locus: " << clocus->id << ", marker: " << clocus->marker << ", tag_id: " << stack->id << "; Starting gtype: " << d->gtype << "\n";
@@ -1039,24 +1039,24 @@ int check_uncalled_snps(CSLocus *clocus, Locus *stack, Datum *d) {
string homozygous;
for (uint i = 0; i < clocus->snps.size(); i++) {
- check_homozygosity(stack->reads,
- clocus->snps[i]->col, clocus->snps[i]->rank_1, clocus->snps[i]->rank_2,
- homozygous);
-
- if (homozygous == "unknown")
- status = "true";
- else if (homozygous == "false")
- verified_snps.push_back(clocus->snps[i]);
+ check_homozygosity(stack->reads,
+ clocus->snps[i]->col, clocus->snps[i]->rank_1, clocus->snps[i]->rank_2,
+ homozygous);
+
+ if (homozygous == "unknown")
+ status = "true";
+ else if (homozygous == "false")
+ verified_snps.push_back(clocus->snps[i]);
}
if (status == "true") {
- d->corrected = true;
- delete [] d->gtype;
- d->gtype = new char[2];
- strcpy(d->gtype, "-");
- return 0;
+ d->corrected = true;
+ delete [] d->gtype;
+ d->gtype = new char[2];
+ strcpy(d->gtype, "-");
+ return 0;
} else if (verified_snps.size() < clocus->snps.size()) {
- return 0;
+ return 0;
}
//
@@ -1067,35 +1067,35 @@ int check_uncalled_snps(CSLocus *clocus, Locus *stack, Datum *d) {
vector<string> types;
for (uint i = 0; i < haplotypes.size(); i++) {
- if (clocus->gmap.count(haplotypes[i])) {
- //cerr << " Adding haplotype '" << haplotypes[i] << "', which maps to '" << clocus->gmap[haplotypes[i]] << "' to the genotype\n";
- types.push_back(clocus->gmap[haplotypes[i]]);
- } else {
- //cerr << " Adding haplotype '-' for " << haplotypes[i] << "\n";
- types.push_back("-");
- }
+ if (clocus->gmap.count(haplotypes[i])) {
+ //cerr << " Adding haplotype '" << haplotypes[i] << "', which maps to '" << clocus->gmap[haplotypes[i]] << "' to the genotype\n";
+ types.push_back(clocus->gmap[haplotypes[i]]);
+ } else {
+ //cerr << " Adding haplotype '-' for " << haplotypes[i] << "\n";
+ types.push_back("-");
+ }
}
sort(types.begin(), types.end());
string genotype;
for (uint i = 0; i < types.size(); i++)
- genotype += types[i];
+ genotype += types[i];
//cerr << "Final genotype: " << genotype << "\n";
- genotype =
- global_dictionary[clocus->marker].count(genotype) ?
- global_dictionary[clocus->marker][genotype] :
- "--";
+ genotype =
+ global_dictionary[clocus->marker].count(genotype) ?
+ global_dictionary[clocus->marker][genotype] :
+ "--";
//cerr << "Final translated genotype: " << genotype << "\n";
if (strcmp(genotype.c_str(), d->gtype) != 0) {
- d->corrected = true;
- delete [] d->gtype;
- d->gtype = new char[genotype.length() + 1];
- strcpy(d->gtype, genotype.c_str());
+ d->corrected = true;
+ delete [] d->gtype;
+ d->gtype = new char[genotype.length() + 1];
+ strcpy(d->gtype, genotype.c_str());
}
//cerr << " Catalog: " << clocus->id << ", stack: " << stack->id << ", Ending Genotype: " << d->gtype << "\n\n";
@@ -1108,29 +1108,29 @@ int call_alleles(vector<SNP *> &snps, vector<char *> &reads, vector<string> &hap
char base;
for (int i = 0; i < height; i++) {
- string haplotype;
+ string haplotype;
- for (uint j = 0; j < snps.size(); j++) {
- base = reads[i][snps[j]->col];
+ for (uint j = 0; j < snps.size(); j++) {
+ base = reads[i][snps[j]->col];
- //
- // Check to make sure the nucleotide at the location of this SNP is
- // of one of the two possible states the multinomial model called.
- //
- if (base == snps[j]->rank_1 || base == snps[j]->rank_2)
- haplotype += base;
- else
- break;
- }
+ //
+ // Check to make sure the nucleotide at the location of this SNP is
+ // of one of the two possible states the multinomial model called.
+ //
+ if (base == snps[j]->rank_1 || base == snps[j]->rank_2)
+ haplotype += base;
+ else
+ break;
+ }
- if (haplotype.length() == snps.size())
- a[haplotype]++;
+ if (haplotype.length() == snps.size())
+ a[haplotype]++;
}
map<string, int>::iterator it;
for (it = a.begin(); it != a.end(); it++) {
- //cerr << " Calling haplotype: " << it->first << "\n";
- haplotypes.push_back(it->first);
+ //cerr << " Calling haplotype: " << it->first << "\n";
+ haplotypes.push_back(it->first);
}
return 0;
@@ -1144,8 +1144,8 @@ int check_homozygosity(vector<char *> &reads, int col, char rank_1, char rank_2,
homozygous = "true";
if (height < min_hom_seqs) {
- homozygous = "unknown";
- return 0;
+ homozygous = "unknown";
+ return 0;
}
map<char, int> nuc;
@@ -1157,39 +1157,39 @@ int check_homozygosity(vector<char *> &reads, int col, char rank_1, char rank_2,
nuc['T'] = 0;
for (int j = 0; j < height; j++)
- nuc[reads[j][col]]++;
+ nuc[reads[j][col]]++;
map<char, int>::iterator i;
for (i = nuc.begin(); i != nuc.end(); i++)
- sorted_nuc.push_back(make_pair(i->first, i->second));
+ sorted_nuc.push_back(make_pair(i->first, i->second));
sort(sorted_nuc.begin(), sorted_nuc.end(), compare_pair);
//
- // Check if more than a single nucleotide occurs in this column. Only
- // count nucleotides that are part of the called SNP, do not count
+ // Check if more than a single nucleotide occurs in this column. Only
+ // count nucleotides that are part of the called SNP, do not count
// error-generated nucleotides. Also, check that the sorting was successful
// by ensuring that sorted_nuc[0] > sorted_nuc[1] > sorted_nuc[2].
//
if (sorted_nuc[2].second > 0 && sorted_nuc[1].second <= sorted_nuc[2].second) {
- homozygous = "unknown";
- return 0;
+ homozygous = "unknown";
+ return 0;
}
- // cerr << "Sorted_nuc[0], '" << sorted_nuc[0].first << "', count: " << sorted_nuc[0].second
- // << "; Sorted_nuc[1], '" << sorted_nuc[1].first << "', count: " << sorted_nuc[1].second
- // << "; Sorted_nuc[2], '" << sorted_nuc[2].first << "', count: " << sorted_nuc[2].second << "\n";
+ // cerr << "Sorted_nuc[0], '" << sorted_nuc[0].first << "', count: " << sorted_nuc[0].second
+ // << "; Sorted_nuc[1], '" << sorted_nuc[1].first << "', count: " << sorted_nuc[1].second
+ // << "; Sorted_nuc[2], '" << sorted_nuc[2].first << "', count: " << sorted_nuc[2].second << "\n";
- if ((sorted_nuc[0].second > 0) &&
- (sorted_nuc[0].first == rank_1 || sorted_nuc[0].first == rank_2) &&
- (sorted_nuc[1].second > 0) &&
- (sorted_nuc[1].first == rank_1 || sorted_nuc[1].first == rank_2)) {
- homozygous = "false";
+ if ((sorted_nuc[0].second > 0) &&
+ (sorted_nuc[0].first == rank_1 || sorted_nuc[0].first == rank_2) &&
+ (sorted_nuc[1].second > 0) &&
+ (sorted_nuc[1].first == rank_1 || sorted_nuc[1].first == rank_2)) {
+ homozygous = "false";
}
//
- // If we find a second nucleotide present, check its prevelence. If it is
- // less than 1/20 of the total reads, don't count a heterozygote. If it is
+ // If we find a second nucleotide present, check its prevelence. If it is
+ // less than 1/20 of the total reads, don't count a heterozygote. If it is
// less than 1/10 report that we can't tell if its homozygous or not. Otherwise,
// report this tag as a heterozygote.
//
@@ -1197,20 +1197,20 @@ int check_homozygosity(vector<char *> &reads, int col, char rank_1, char rank_2,
//cerr << " Frac: " << frac << "; Second-most Prominent Nuc count: " << sorted_nuc[1].second << "; Depth: " << height << "\n";
if (homozygous == "false" && frac < min_het_seqs)
- homozygous = "true";
+ homozygous = "true";
else if (homozygous == "false" && frac < max_het_seqs)
- homozygous = "unknown";
+ homozygous = "unknown";
//cerr << " Homozygous: " << homozygous << "\n";
return 0;
}
-int
+int
manual_corrections(string cor_path, PopMap<CSLocus> *pmap)
{
//
- // Load manual corrections from a tab-seprated file, as exported from a Stacks SQL
+ // Load manual corrections from a tab-seprated file, as exported from a Stacks SQL
// dataabse. Has the format:
// id<tab>batch_id<tab>catalog_id<tab>sample_id<tab>genotype
//
@@ -1219,7 +1219,7 @@ manual_corrections(string cor_path, PopMap<CSLocus> *pmap)
if (fh.fail()) {
cerr << "Error opening manual corrections file '" << cor_path << "'\n";
- exit(1);
+ exit(1);
}
vector<string> parts;
@@ -1234,85 +1234,85 @@ manual_corrections(string cor_path, PopMap<CSLocus> *pmap)
int i;
while (fh.good()) {
- fh.getline(line, id_len);
- line_num++;
+ fh.getline(line, id_len);
+ line_num++;
- len = strlen(line);
- if (len == 0) continue;
+ len = strlen(line);
+ if (len == 0) continue;
- //
- // Check that there is no carraige return in the buffer.
- //
- if (line[len - 1] == '\r') line[len - 1] = '\0';
+ //
+ // Check that there is no carraige return in the buffer.
+ //
+ if (line[len - 1] == '\r') line[len - 1] = '\0';
- //
- // Ignore comments
- //
- if (line[0] == '#') continue;
+ //
+ // Ignore comments
+ //
+ if (line[0] == '#') continue;
- parse_tsv(line, parts);
+ parse_tsv(line, parts);
if (parts.size() != 5) {
cerr << "Error parsing '" << line << "' at line: " << line_num << ". (" << parts.size() << " fields).\n";
return 0;
}
- catalog_id = (int) strtol(parts[2].c_str(), &e, 10);
- if (*e != '\0') {
+ catalog_id = (int) strtol(parts[2].c_str(), &e, 10);
+ if (*e != '\0') {
cerr << "Error parsing '" << parts[2].c_str() << "' at line: " << line_num << ".\n";
return 0;
}
- sample_id = (int) strtol(parts[3].c_str(), &e, 10);
- if (*e != '\0') {
+ sample_id = (int) strtol(parts[3].c_str(), &e, 10);
+ if (*e != '\0') {
cerr << "Error parsing '" << parts[3].c_str() << "' at line: " << line_num << ".\n";
return 0;
}
- strcpy(gtype, parts[4].c_str());
+ strcpy(gtype, parts[4].c_str());
- //
- // Overwrite the genotype in the PopMap.
- //
- Datum **d = pmap->locus(catalog_id);
- total++;
+ //
+ // Overwrite the genotype in the PopMap.
+ //
+ Datum **d = pmap->locus(catalog_id);
+ total++;
- if (d == NULL) {
- skipped++;
- continue;
- }
+ if (d == NULL) {
+ skipped++;
+ continue;
+ }
- if ((i = pmap->sample_index(sample_id)) < 0) {
- skipped++;
- continue;
- }
+ if ((i = pmap->sample_index(sample_id)) < 0) {
+ skipped++;
+ continue;
+ }
- if (d[i] == NULL) {
- skipped++;
- continue;
- }
+ if (d[i] == NULL) {
+ skipped++;
+ continue;
+ }
- for (uint k = 0; k < strlen(gtype); k++)
- gtype[k] = tolower(gtype[k]);
+ for (uint k = 0; k < strlen(gtype); k++)
+ gtype[k] = tolower(gtype[k]);
- if (strcmp(gtype, "--") == 0)
- strcpy(gtype, "-");
+ if (strcmp(gtype, "--") == 0)
+ strcpy(gtype, "-");
- if (d[i]->gtype != NULL)
- delete [] d[i]->gtype;
+ if (d[i]->gtype != NULL)
+ delete [] d[i]->gtype;
- d[i]->gtype = new char[strlen(gtype) + 1];
- strcpy(d[i]->gtype, gtype);
+ d[i]->gtype = new char[strlen(gtype) + 1];
+ strcpy(d[i]->gtype, gtype);
- d[i]->corrected = true;
+ d[i]->corrected = true;
- success++;
+ success++;
}
fh.close();
- cerr << "Successfully imported "
- << success << " manually corrected genotypes. Skipped "
- << skipped << " genotypes due to invalid catalog or sample IDs, "
- << total << " genotypes read from file.\n";
+ cerr << "Successfully imported "
+ << success << " manually corrected genotypes. Skipped "
+ << skipped << " genotypes due to invalid catalog or sample IDs, "
+ << total << " genotypes read from file.\n";
return 0;
}
@@ -1320,7 +1320,7 @@ manual_corrections(string cor_path, PopMap<CSLocus> *pmap)
int export_gen_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &parent_ids, map<int, string> &samples) {
//
// We wish to export, a set of generic genotypes, not specific to any mapping type.
- //
+ //
//
// Mark those genotypes that have been corrected in uppercase letters.
@@ -1330,21 +1330,21 @@ int export_gen_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
uint len;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- Datum **d = pmap->locus(loc->id);
+ Datum **d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (d[i]->corrected) {
- len = strlen(d[i]->gtype);
- for (uint k = 0; k < len; k++)
- d[i]->gtype[k] = toupper(d[i]->gtype[k]);
- }
- }
+ if (d[i]->corrected) {
+ len = strlen(d[i]->gtype);
+ for (uint k = 0; k < len; k++)
+ d[i]->gtype[k] = toupper(d[i]->gtype[k]);
+ }
+ }
}
//
@@ -1357,9 +1357,9 @@ int export_gen_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
int export_f2_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &parent_ids, map<int, string> &samples) {
//
- // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
+ // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
// which contains the information of all the loci for a single segregating population.
- //
+ //
// We are exporting an F2 population type:
// The result of selfing the F1 of a cross between two fully homozygous diploid parents.
//
@@ -1371,7 +1371,7 @@ int export_f2_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
// <abxcd> a, b, h, –
// <abxaa> a, –
// <aaxab> b, –
- // <abxcc> a, b, –
+ // <abxcc> a, b, –
// <ccxab> b, –
//
map<string, string> types;
@@ -1389,16 +1389,16 @@ int export_f2_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
//
switch(out_type) {
case joinmap:
- write_joinmap(catalog, pmap, types, samples, parent_ids);
- break;
+ write_joinmap(catalog, pmap, types, samples, parent_ids);
+ break;
case rqtl:
- write_rqtl(catalog, pmap, types, samples, parent_ids);
- break;
+ write_rqtl(catalog, pmap, types, samples, parent_ids);
+ break;
case onemap:
- write_onemap_mapmaker(catalog, pmap, types, samples, parent_ids);
- break;
+ write_onemap_mapmaker(catalog, pmap, types, samples, parent_ids);
+ break;
default:
- break;
+ break;
}
return 0;
@@ -1406,11 +1406,11 @@ int export_f2_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
int export_dh_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &parent_ids, map<int, string> &samples) {
//
- // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
+ // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
// which contains the information of all the loci for a single segregating population.
- //
+ //
// We are exporting a DH population type:
- // a doubled haploid population: the result of doubling the gametes of a single heterozygous
+ // a doubled haploid population: the result of doubling the gametes of a single heterozygous
// diploid individual.
//
// Segregation type codes for population type DH, from Joinmap manual:
@@ -1441,13 +1441,13 @@ int export_dh_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
//
switch(out_type) {
case joinmap:
- write_joinmap(catalog, pmap, types, samples, parent_ids);
- break;
+ write_joinmap(catalog, pmap, types, samples, parent_ids);
+ break;
case rqtl:
- write_rqtl(catalog, pmap, types, samples, parent_ids);
- break;
+ write_rqtl(catalog, pmap, types, samples, parent_ids);
+ break;
default:
- break;
+ break;
}
return 0;
@@ -1455,11 +1455,11 @@ int export_dh_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
int export_bc1_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &parent_ids, map<int, string> &samples) {
//
- // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
+ // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
// which contains the information of all the loci for a single segregating population.
- //
+ //
// We are exporting a BC1 population type:
- // a first generation backcross population: the result of crossing the F1 of a cross between
+ // a first generation backcross population: the result of crossing the F1 of a cross between
// two fully homozygous diploid parents to one of the parents.
//
// Segregation type codes for population type BC1, from Joinmap manual:
@@ -1491,16 +1491,16 @@ int export_bc1_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
//
switch(out_type) {
case joinmap:
- write_joinmap(catalog, pmap, types, samples, parent_ids);
- break;
+ write_joinmap(catalog, pmap, types, samples, parent_ids);
+ break;
case rqtl:
- write_rqtl(catalog, pmap, types, samples, parent_ids);
- break;
+ write_rqtl(catalog, pmap, types, samples, parent_ids);
+ break;
case onemap:
- write_onemap_mapmaker(catalog, pmap, types, samples, parent_ids);
- break;
+ write_onemap_mapmaker(catalog, pmap, types, samples, parent_ids);
+ break;
default:
- break;
+ break;
}
return 0;
@@ -1508,12 +1508,12 @@ int export_bc1_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
int export_cp_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &parent_ids, map<int, string> &samples) {
//
- // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
+ // We wish to export, according to the JoinMap manual, a locus genotype file (loc-file),
// which contains the information of all the loci for a single segregating population.
- //
+ //
// We are exporting a CP population type:
- // a population resulting from a cross between two heterogeneously
- // heterozygous and homozygous diploid parents, linkage phases originally
+ // a population resulting from a cross between two heterogeneously
+ // heterozygous and homozygous diploid parents, linkage phases originally
// (possibly) unknown.
//
// Segregation type codes for population type CP, from Joinmap manual:
@@ -1533,7 +1533,7 @@ int export_cp_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
// <abxcd> ac, ad, bc, bd, ––
// <efxeg> ee, ef, eg, fg, ––
// <hkxhk> hh, hk, kk, h-, k-, ––
- // <lmxll> ll, lm, ––
+ // <lmxll> ll, lm, ––
// <nnxnp> nn, np, ––
//
map<string, string> types;
@@ -1541,13 +1541,13 @@ int export_cp_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
switch(out_type) {
case joinmap:
- load_joinmap_cp_dictionary(types, dictionary);
- break;
+ load_joinmap_cp_dictionary(types, dictionary);
+ break;
case onemap:
- load_onemap_cp_dictionary(types, dictionary);
- break;
+ load_onemap_cp_dictionary(types, dictionary);
+ break;
default:
- break;
+ break;
}
//
@@ -1560,87 +1560,87 @@ int export_cp_map(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int>
//
switch(out_type) {
case joinmap:
- write_joinmap(catalog, pmap, types, samples, parent_ids);
- break;
+ write_joinmap(catalog, pmap, types, samples, parent_ids);
+ break;
case onemap:
- write_onemap(catalog, pmap, types, samples, parent_ids);
- break;
+ write_onemap(catalog, pmap, types, samples, parent_ids);
+ break;
default:
- break;
+ break;
}
return 0;
}
-int
-calc_segregation_distortion(map<string, map<string, double> > &seg_ratios, map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, set<int> &parent_ids)
+int
+calc_segregation_distortion(map<string, map<string, double> > &seg_ratios, map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap, set<int> &parent_ids)
{
map<string, string> types;
map<string, map<string, string> > dictionary;
switch(map_type) {
case dh:
- load_dh_dictionary(types, dictionary);
- break;
+ load_dh_dictionary(types, dictionary);
+ break;
case cp:
- load_cp_dictionary(types, dictionary);
- break;
+ load_cp_dictionary(types, dictionary);
+ break;
case bc1:
- load_mm_bc_dictionary(types, dictionary);
- break;
+ load_mm_bc_dictionary(types, dictionary);
+ break;
case f2:
- load_mm_f2_dictionary(types, dictionary);
- break;
+ load_mm_f2_dictionary(types, dictionary);
+ break;
case gen:
case none:
case unk:
- break;
+ break;
}
map<int, CSLocus *>::iterator it;
CSLocus *loc;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (seg_ratios.count(loc->marker) == 0) continue;
+ if (seg_ratios.count(loc->marker) == 0) continue;
- map<string, int> cnts;
+ map<string, int> cnts;
- double n = tally_translated_gtypes(loc->id, pmap, parent_ids, dictionary[loc->marker], cnts);
+ double n = tally_translated_gtypes(loc->id, pmap, parent_ids, dictionary[loc->marker], cnts);
- if (n == 0) continue;
+ if (n == 0) continue;
- // cerr << "ID: " << loc->id << "; marker: " << loc->marker << "\n";
+ // cerr << "ID: " << loc->id << "; marker: " << loc->marker << "\n";
- loc->chisq = chisq_test(seg_ratios, cnts, loc->marker, n);
+ loc->chisq = chisq_test(seg_ratios, cnts, loc->marker, n);
}
return 0;
}
double
-tally_translated_gtypes(int loc_id, PopMap<CSLocus> *pmap, set<int> &parent_ids,
- map<string, string> &dictionary, map<string, int> &cnts)
+tally_translated_gtypes(int loc_id, PopMap<CSLocus> *pmap, set<int> &parent_ids,
+ map<string, string> &dictionary, map<string, int> &cnts)
{
Datum **d = pmap->locus(loc_id);
double n = 0.0;
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (strcmp(d[i]->gtype, "--") == 0)
- continue;
+ if (strcmp(d[i]->gtype, "--") == 0)
+ continue;
- n++;
+ n++;
- if (cnts.count(dictionary[d[i]->gtype]) > 0)
- cnts[dictionary[d[i]->gtype]]++;
- else
- cnts[dictionary[d[i]->gtype]] = 1;
+ if (cnts.count(dictionary[d[i]->gtype]) > 0)
+ cnts[dictionary[d[i]->gtype]]++;
+ else
+ cnts[dictionary[d[i]->gtype]] = 1;
}
return n;
@@ -1653,19 +1653,19 @@ tally_generic_gtypes(int loc_id, PopMap<CSLocus> *pmap, set<int> &parent_ids, ma
double n = 0.0;
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (strcmp(d[i]->gtype, "--") == 0)
- continue;
+ if (strcmp(d[i]->gtype, "--") == 0)
+ continue;
- n++;
+ n++;
- if (cnts.count(d[i]->gtype) > 0)
- cnts[d[i]->gtype]++;
- else
- cnts[d[i]->gtype] = 1;
+ if (cnts.count(d[i]->gtype) > 0)
+ cnts[d[i]->gtype]++;
+ else
+ cnts[d[i]->gtype] = 1;
}
return n;
@@ -1686,11 +1686,11 @@ chisq_test(map<string, map<string, double> > &seg_ratios, map<string, int> &cnts
map<string, double>::iterator sit;
for (sit = seg_ratios[marker].begin(); sit != seg_ratios[marker].end(); sit++) {
- obs = cnts.count(sit->first) == 0 ? 0 : cnts[sit->first];
- exp = sit->second * n;
- // cerr << " category: " << sit->first << "; obs: " << obs << "; exp: " << exp << "\n";
+ obs = cnts.count(sit->first) == 0 ? 0 : cnts[sit->first];
+ exp = sit->second * n;
+ // cerr << " category: " << sit->first << "; obs: " << obs << "; exp: " << exp << "\n";
- chisq += ((obs - exp) * (obs - exp)) / exp;
+ chisq += ((obs - exp) * (obs - exp)) / exp;
}
// cerr << " df: " << df << "; Chisq value: " << chisq << "; pvalue: " << chisq_pvalue(df, chisq) << "\n";
@@ -1703,19 +1703,19 @@ chisq_test(map<string, map<string, double> > &seg_ratios, map<string, int> &cnts
double
chisq_pvalue(int df, double chisq)
{
- int i = 0;
+ int i = 0;
while (chisq > chisq_crit_values[df][i] &&
- i < chisq_crit_values_size) {
- i++;
+ i < chisq_crit_values_size) {
+ i++;
}
if (i == chisq_crit_values_size)
- return chisq_crit_values[0][chisq_crit_values_size - 1];
+ return chisq_crit_values[0][chisq_crit_values_size - 1];
return chisq_crit_values[0][i];
}
-int
+int
map_specific_genotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &parent_ids)
{
map<string, string> types;
@@ -1723,21 +1723,21 @@ map_specific_genotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<
switch(map_type) {
case dh:
- load_dh_dictionary(types, dictionary);
- break;
+ load_dh_dictionary(types, dictionary);
+ break;
case cp:
- load_cp_dictionary(types, dictionary);
- break;
+ load_cp_dictionary(types, dictionary);
+ break;
case bc1:
- load_bc_dictionary(types, dictionary);
- break;
+ load_bc_dictionary(types, dictionary);
+ break;
case f2:
- load_f2_dictionary(types, dictionary);
- break;
+ load_f2_dictionary(types, dictionary);
+ break;
case gen:
case none:
case unk:
- break;
+ break;
}
map<int, CSLocus *>::iterator it;
@@ -1746,97 +1746,97 @@ map_specific_genotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<
CSLocus *loc;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- loc->gcnt = 0;
+ loc = it->second;
+ loc->gcnt = 0;
- if (loc->marker.length() == 0) continue;
+ if (loc->marker.length() == 0) continue;
- if (types.count(loc->marker)) {
- loc->uncor_marker = loc->marker;
- loc->marker = types[loc->marker];
- marker = loc->marker;
- } else {
- marker = "";
- }
+ if (types.count(loc->marker)) {
+ loc->uncor_marker = loc->marker;
+ loc->marker = types[loc->marker];
+ marker = loc->marker;
+ } else {
+ marker = "";
+ }
- d = pmap->locus(loc->id);
+ d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (marker.length() == 0) {
- m = "--";
- } else {
- m = dictionary[marker].count(d[i]->gtype) ?
- dictionary[marker][d[i]->gtype] :
- dictionary[marker]["--"];
- }
+ if (marker.length() == 0) {
+ m = "--";
+ } else {
+ m = dictionary[marker].count(d[i]->gtype) ?
+ dictionary[marker][d[i]->gtype] :
+ dictionary[marker]["--"];
+ }
- strcpy(d[i]->gtype, m.c_str());
+ strcpy(d[i]->gtype, m.c_str());
- if (m != dictionary[marker]["--"])
- loc->gcnt++;
- }
+ if (m != dictionary[marker]["--"])
+ loc->gcnt++;
+ }
}
return 0;
}
-int
-translate_genotypes(map<string, string> &types, map<string, map<string, string> > &dictionary,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int, string> &samples,
- set<int> &parent_ids)
+int
+translate_genotypes(map<string, string> &types, map<string, map<string, string> > &dictionary,
+ map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int, string> &samples,
+ set<int> &parent_ids)
{
map<int, CSLocus *>::iterator it;
CSLocus *loc;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- string marker = types.count(loc->marker) ? types[loc->marker] : "";
- Datum **d = pmap->locus(loc->id);
+ string marker = types.count(loc->marker) ? types[loc->marker] : "";
+ Datum **d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- //cerr << "Examining progeny " << samples[pmap->rev_sample_index(i)] << "; marker: " << loc->marker << "\n";
+ //cerr << "Examining progeny " << samples[pmap->rev_sample_index(i)] << "; marker: " << loc->marker << "\n";
- string m;
+ string m;
- if (marker.length() == 0) {
- m = dictionary[marker]["--"];
- } else {
- m = dictionary[marker].count(d[i]->gtype) ?
- dictionary[marker][d[i]->gtype] :
- dictionary[marker]["--"];
- }
- d[i]->trans_gtype = new char[m.length() + 1];
+ if (marker.length() == 0) {
+ m = dictionary[marker]["--"];
+ } else {
+ m = dictionary[marker].count(d[i]->gtype) ?
+ dictionary[marker][d[i]->gtype] :
+ dictionary[marker]["--"];
+ }
+ d[i]->trans_gtype = new char[m.length() + 1];
- //
- // If the genotype was corrected, output it in uppercase letters.
- //
- if (d[i]->corrected) {
- for (uint k = 0; k < m.length(); k++)
- d[i]->trans_gtype[k] = toupper(m[k]);
- d[i]->trans_gtype[m.length()] = '\0';
- } else {
- strcpy(d[i]->trans_gtype, m.c_str());
- }
- if (m != dictionary[marker]["--"])
- loc->trans_gcnt++;
- //cerr << " allele: " << d[i]->trans_gtype << "; trans_gcnt: " << loc->trans_gcnt << "\n";
- }
+ //
+ // If the genotype was corrected, output it in uppercase letters.
+ //
+ if (d[i]->corrected) {
+ for (uint k = 0; k < m.length(); k++)
+ d[i]->trans_gtype[k] = toupper(m[k]);
+ d[i]->trans_gtype[m.length()] = '\0';
+ } else {
+ strcpy(d[i]->trans_gtype, m.c_str());
+ }
+ if (m != dictionary[marker]["--"])
+ loc->trans_gcnt++;
+ //cerr << " allele: " << d[i]->trans_gtype << "; trans_gcnt: " << loc->trans_gcnt << "\n";
+ }
}
return 0;
}
-int tally_progeny_haplotypes(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &parent_ids,
- int &total, double &max, string &freq_str) {
+int tally_progeny_haplotypes(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &parent_ids,
+ int &total, double &max, string &freq_str) {
char gtype[id_len];
map<string, double> freq;
Datum **d = pmap->locus(locus->id);
@@ -1847,38 +1847,38 @@ int tally_progeny_haplotypes(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &pa
//cerr << "Examining marker: " << locus->id << "\n";
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (d[i] == NULL) continue;
-
- //cerr << " Sample: " << i << "; Haplotype: " << d[i]->obshap[0] << "; Genotype: " << d[i]->gtype << "\n";
- if (strcmp(d[i]->gtype, "--") != 0) {
- //
- // Automated corrections will uppercase genotypes, convert them back to lowercase
- // in order to tally them properly.
- //
- int j = 0;
- while (d[i]->gtype[j] != '\0') {
- gtype[j] = tolower(d[i]->gtype[j]);
- j++;
- }
- gtype[j] = '\0';
- freq[gtype]++;
- total++;
- }
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (d[i] == NULL) continue;
+
+ //cerr << " Sample: " << i << "; Haplotype: " << d[i]->obshap[0] << "; Genotype: " << d[i]->gtype << "\n";
+ if (strcmp(d[i]->gtype, "--") != 0) {
+ //
+ // Automated corrections will uppercase genotypes, convert them back to lowercase
+ // in order to tally them properly.
+ //
+ int j = 0;
+ while (d[i]->gtype[j] != '\0') {
+ gtype[j] = tolower(d[i]->gtype[j]);
+ j++;
+ }
+ gtype[j] = '\0';
+ freq[gtype]++;
+ total++;
+ }
}
if (total == 0)
- return 0;
+ return 0;
double frac;
stringstream s;
char f[id_len];
map<string, double>::iterator it;
for (it = freq.begin(); it != freq.end(); it++) {
- frac = (double) it->second / (double) total * 100;
- if (frac > max) max = frac;
- sprintf(f, "(%0.1f%%);", frac);
- s << it->first << ":" << it->second << f;
+ frac = (double) it->second / (double) total * 100;
+ if (frac > max) max = frac;
+ sprintf(f, "(%0.1f%%);", frac);
+ s << it->first << ":" << it->second << f;
}
freq_str = s.str().substr(0, s.str().length() - 1);
@@ -1889,7 +1889,7 @@ int tally_progeny_haplotypes(CSLocus *locus, PopMap<CSLocus> *pmap, set<int> &pa
int write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &parent_ids) {
if (map_type == none)
- return 0;
+ return 0;
stringstream pop_name;
pop_name << "batch_" << batch_id << ".markers.tsv";
@@ -1901,12 +1901,12 @@ int write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &par
if (fh.fail()) {
cerr << "Error opening markers SQL file '" << file << "'\n";
- exit(1);
+ exit(1);
}
- fh << "# SQL ID" << "\t"
- << "Batch ID" << "\t"
- << "Catalog Locus ID" << "\t"
+ fh << "# SQL ID" << "\t"
+ << "Batch ID" << "\t"
+ << "Catalog Locus ID" << "\t"
<< "Marker Type" << "\t"
<< "Total Genotypes" << "\t"
<< "Max" << "\t"
@@ -1923,36 +1923,36 @@ int write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &par
stringstream gtype_map;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
-
- if (loc->marker.length() == 0) continue;
-
- double max = 0.0;
- int total = 0;
- string freq, map;
- tally_progeny_haplotypes(loc, pmap, parent_ids, total, max, freq);
-
- sprintf(max_str, "%0.2f", max);
-
- //
- // Record the haplotype to genotype map.
- //
- gtype_map.str("");
- for (j = loc->gmap.begin(); j != loc->gmap.end(); j++)
- gtype_map << j->first << ":" << j->second << ";";
- map = gtype_map.str().substr(0, gtype_map.str().length() - 1);
-
- fh << 0 << "\t"
- << batch_id << "\t"
- << loc->id << "\t"
- << loc->marker << "\t"
- << total << "\t"
- << max_str << "\t"
- << freq << "\t"
- << loc->chisq << "\t"
- << loc->lnl << "\t"
+ loc = it->second;
+
+ if (loc->marker.length() == 0) continue;
+
+ double max = 0.0;
+ int total = 0;
+ string freq, map;
+ tally_progeny_haplotypes(loc, pmap, parent_ids, total, max, freq);
+
+ sprintf(max_str, "%0.2f", max);
+
+ //
+ // Record the haplotype to genotype map.
+ //
+ gtype_map.str("");
+ for (j = loc->gmap.begin(); j != loc->gmap.end(); j++)
+ gtype_map << j->first << ":" << j->second << ";";
+ map = gtype_map.str().substr(0, gtype_map.str().length() - 1);
+
+ fh << 0 << "\t"
+ << batch_id << "\t"
+ << loc->id << "\t"
+ << loc->marker << "\t"
+ << total << "\t"
+ << max_str << "\t"
+ << freq << "\t"
+ << loc->chisq << "\t"
+ << loc->lnl << "\t"
<< map << "\t"
- << (loc->uncor_marker.length() == 0 ? loc->marker : loc->uncor_marker) << "\n";
+ << (loc->uncor_marker.length() == 0 ? loc->marker : loc->uncor_marker) << "\n";
}
fh.close();
@@ -1967,36 +1967,36 @@ int write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, set<int> &par
if (fh.fail()) {
cerr << "Error opening genotypes SQL file '" << file << "'\n";
- exit(1);
+ exit(1);
}
- fh << "# SQL ID" << "\t"
- << "Batch ID" << "\t"
+ fh << "# SQL ID" << "\t"
+ << "Batch ID" << "\t"
<< "Catalog Locus ID" << "\t"
<< "Sample ID" << "\t"
<< "Genotype" << "\n";
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (loc->gcnt < progeny_limit)
- continue;
+ if (loc->gcnt < progeny_limit)
+ continue;
- Datum **d = pmap->locus(loc->id);
+ Datum **d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- fh << 0 << "\t"
- << batch_id << "\t"
- << loc->id << "\t"
- << pmap->rev_sample_index(i) << "\t";
+ fh << 0 << "\t"
+ << batch_id << "\t"
+ << loc->id << "\t"
+ << pmap->rev_sample_index(i) << "\t";
- if (d[i] == NULL)
- map_type == cp ? fh << "--\n" : fh << "-\n";
- else
- fh << d[i]->gtype << "\n";
- }
+ if (d[i] == NULL)
+ map_type == cp ? fh << "--\n" : fh << "-\n";
+ else
+ fh << d[i]->gtype << "\n";
+ }
}
fh.close();
@@ -2014,7 +2014,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
if (fh.fail()) {
cerr << "Error opening genomic output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -2025,10 +2025,10 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
int num_loci = 0;
for (cit = catalog.begin(); cit != catalog.end(); cit++) {
- loc = cit->second;
- if (loc->hcnt < progeny_limit) continue;
+ loc = cit->second;
+ if (loc->hcnt < progeny_limit) continue;
- num_loci += loc->len - renz_len[enz];
+ num_loci += loc->len - renz_len[enz];
}
cerr << "Writing " << num_loci << " nucleotide positions to genomic file, '" << file << "'\n";
@@ -2048,70 +2048,70 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
char *p;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint i = 0; i < it->second.size(); i++) {
- loc = it->second[i];
-
- if (loc->hcnt < progeny_limit) continue;
-
- Datum **d = pmap->locus(loc->id);
- set<int> snp_locs;
- string obshap;
-
- for (uint i = 0; i < loc->snps.size(); i++)
- snp_locs.insert(loc->snps[i]->col);
-
- uint start = 0;
- uint end = loc->len;
- //
- // Check for the existence of the restriction enzyme cut site, mask off
- // its output.
- //
- for (uint n = 0; n < rcnt; n++)
- if (strncmp(loc->con, renz[enz][n], rlen) == 0)
- start += renz_len[enz];
- if (start == 0) {
- p = loc->con + (loc->len - rlen);
- for (uint n = rcnt; n < rcnt + rcnt; n++)
- if (strncmp(p, renz[enz][n], rlen) == 0)
- end -= renz_len[enz];
- }
-
- uint k = 0;
- for (uint n = start; n < end; n++) {
- fh << loc->id << "\t" << loc->loc.chr << "\t" << loc->loc.bp + n;
-
- if (snp_locs.count(n) == 0) {
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- a = encode_gtype(loc->con[n]);
- fh << "\t" << encoded_gtypes[a][a];
- }
- } else {
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
-
- if (d[j] == NULL)
- fh << "0";
- else
- switch (d[j]->obshap.size()) {
- case 1:
- a = encode_gtype(d[j]->obshap[0][k]);
- fh << encoded_gtypes[a][a];
- break;
- case 2:
- a = encode_gtype(d[j]->obshap[0][k]);
- b = encode_gtype(d[j]->obshap[1][k]);
- fh << encoded_gtypes[a][b];
- break;
- default:
- fh << "0";
- break;
- }
- }
- k++;
- }
- fh << "\n";
- }
- }
+ for (uint i = 0; i < it->second.size(); i++) {
+ loc = it->second[i];
+
+ if (loc->hcnt < progeny_limit) continue;
+
+ Datum **d = pmap->locus(loc->id);
+ set<int> snp_locs;
+ string obshap;
+
+ for (uint i = 0; i < loc->snps.size(); i++)
+ snp_locs.insert(loc->snps[i]->col);
+
+ uint start = 0;
+ uint end = loc->len;
+ //
+ // Check for the existence of the restriction enzyme cut site, mask off
+ // its output.
+ //
+ for (uint n = 0; n < rcnt; n++)
+ if (strncmp(loc->con, renz[enz][n], rlen) == 0)
+ start += renz_len[enz];
+ if (start == 0) {
+ p = loc->con + (loc->len - rlen);
+ for (uint n = rcnt; n < rcnt + rcnt; n++)
+ if (strncmp(p, renz[enz][n], rlen) == 0)
+ end -= renz_len[enz];
+ }
+
+ uint k = 0;
+ for (uint n = start; n < end; n++) {
+ fh << loc->id << "\t" << loc->loc.chr << "\t" << loc->loc.bp + n;
+
+ if (snp_locs.count(n) == 0) {
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ a = encode_gtype(loc->con[n]);
+ fh << "\t" << encoded_gtypes[a][a];
+ }
+ } else {
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ fh << "\t";
+
+ if (d[j] == NULL)
+ fh << "0";
+ else
+ switch (d[j]->obshap.size()) {
+ case 1:
+ a = encode_gtype(d[j]->obshap[0][k]);
+ fh << encoded_gtypes[a][a];
+ break;
+ case 2:
+ a = encode_gtype(d[j]->obshap[0][k]);
+ b = encode_gtype(d[j]->obshap[1][k]);
+ fh << encoded_gtypes[a][b];
+ break;
+ default:
+ fh << "0";
+ break;
+ }
+ }
+ k++;
+ }
+ fh << "\n";
+ }
+ }
}
fh.close();
@@ -2124,9 +2124,9 @@ int write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
stringstream pop_name;
pop_name << "batch_" << batch_id;
if (write_gtypes)
- pop_name << ".genotypes_" << progeny_limit << ".tsv";
- else
- pop_name << ".haplotypes_" << progeny_limit << ".tsv";
+ pop_name << ".genotypes_" << progeny_limit << ".tsv";
+ else
+ pop_name << ".haplotypes_" << progeny_limit << ".tsv";
string file = in_path + pop_name.str();
@@ -2134,7 +2134,7 @@ int write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
if (fh.fail()) {
cerr << "Error opening generic output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -2145,11 +2145,11 @@ int write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
int num_loci = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (write_gtypes == false && loc->hcnt < progeny_limit) continue;
- if (write_gtypes == true && loc->gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (write_gtypes == false && loc->hcnt < progeny_limit) continue;
+ if (write_gtypes == true && loc->gcnt < progeny_limit) continue;
- num_loci++;
+ num_loci++;
}
cerr << "Writing " << num_loci << " loci to " << (write_gtypes ? "genotype" : "observed haplotype") << " file, '" << file << "'\n";
@@ -2158,19 +2158,19 @@ int write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
//
fh << "# Catalog ID\t";
if (expand_id)
- fh << "\t";
+ fh << "\t";
if (write_gtypes)
- fh << "Marker\t";
+ fh << "Marker\t";
fh << "Cnt\t"
<< "Seg Dist\t";
map<int, string>::iterator s;
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (write_gtypes && parent_ids.count(pmap->rev_sample_index(i)))
- continue;
- fh << samples[pmap->rev_sample_index(i)];
- if (i < pmap->sample_cnt() - 1)
- fh << "\t";
+ if (write_gtypes && parent_ids.count(pmap->rev_sample_index(i)))
+ continue;
+ fh << samples[pmap->rev_sample_index(i)];
+ if (i < pmap->sample_cnt() - 1)
+ fh << "\t";
}
fh << "\n";
@@ -2178,55 +2178,55 @@ int write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
// Output each locus.
//
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (write_gtypes == false && loc->hcnt < progeny_limit) continue;
- if (write_gtypes == true && loc->gcnt < progeny_limit) continue;
+ if (write_gtypes == false && loc->hcnt < progeny_limit) continue;
+ if (write_gtypes == true && loc->gcnt < progeny_limit) continue;
- stringstream id;
- loc->annotation.length() > 0 ?
+ stringstream id;
+ loc->annotation.length() > 0 ?
id << loc->id << "|" << loc->annotation : id << loc->id;
- fh << id.str();
+ fh << id.str();
if (expand_id) {
if (loc->annotation.length() > 0)
id << "\t" << loc->id << "\t" << loc->annotation;
- else if (strlen(loc->loc.chr) > 0)
- id << "\t" << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp;
- else
+ else if (strlen(loc->loc.chr) > 0)
+ id << "\t" << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp;
+ else
id << "\t" << loc->id << "\t";
}
- if (write_gtypes)
- fh << "\t" << loc->marker;
-
- write_gtypes ? fh << "\t" << loc->gcnt : fh << "\t" << loc->hcnt;
- fh << "\t" << loc->chisq;
-
- Datum **d = pmap->locus(loc->id);
- string obshap;
-
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (write_gtypes && parent_ids.count(pmap->rev_sample_index(i)))
- continue;
- fh << "\t";
-
- if (d[i] == NULL)
- fh << "-";
- else
- if (write_gtypes) {
- fh << d[i]->gtype;
- } else {
- obshap = "";
- for (uint j = 0; j < d[i]->obshap.size(); j++)
- obshap += string(d[i]->obshap[j]) + "/";
- obshap = obshap.substr(0, obshap.length()-1);
- fh << obshap;
- }
- }
+ if (write_gtypes)
+ fh << "\t" << loc->marker;
+
+ write_gtypes ? fh << "\t" << loc->gcnt : fh << "\t" << loc->hcnt;
+ fh << "\t" << loc->chisq;
+
+ Datum **d = pmap->locus(loc->id);
+ string obshap;
+
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (write_gtypes && parent_ids.count(pmap->rev_sample_index(i)))
+ continue;
+ fh << "\t";
+
+ if (d[i] == NULL)
+ fh << "-";
+ else
+ if (write_gtypes) {
+ fh << d[i]->gtype;
+ } else {
+ obshap = "";
+ for (uint j = 0; j < d[i]->obshap.size(); j++)
+ obshap += string(d[i]->obshap[j]) + "/";
+ obshap = obshap.substr(0, obshap.length()-1);
+ fh << obshap;
+ }
+ }
- fh << "\n";
+ fh << "\n";
}
fh.close();
@@ -2234,8 +2234,8 @@ int write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
return 0;
}
-int
-write_joinmap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, string> &types, map<int, string> &samples, set<int> &parent_ids)
+int
+write_joinmap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, string> &types, map<int, string> &samples, set<int> &parent_ids)
{
stringstream pop_name;
pop_name << "batch_" << batch_id << ".genotypes_" << progeny_limit;
@@ -2245,7 +2245,7 @@ write_joinmap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, s
if (fh.fail()) {
cerr << "Error opening joinmap output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -2256,10 +2256,10 @@ write_joinmap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, s
int num_loci = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (loc->trans_gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (loc->trans_gcnt < progeny_limit) continue;
- num_loci++;
+ num_loci++;
}
cerr << "Writing " << num_loci << " loci to JoinMap file, '" << file << "'\n";
@@ -2281,57 +2281,57 @@ write_joinmap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, s
// Output each locus.
//
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (loc->trans_gcnt < progeny_limit) continue;
+ if (loc->trans_gcnt < progeny_limit) continue;
- stringstream id;
- loc->annotation.length() > 0 ?
+ stringstream id;
+ loc->annotation.length() > 0 ?
id << loc->id << "|" << loc->annotation : id << loc->id;
- fh << id.str() << "\t";
+ fh << id.str() << "\t";
if (expand_id) {
- id.str("");
+ id.str("");
if (loc->annotation.length() > 0)
id << loc->id << "\t" << loc->annotation;
- else if (strlen(loc->loc.chr) > 0)
- id << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp;
- else
+ else if (strlen(loc->loc.chr) > 0)
+ id << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp;
+ else
id << loc->id << "\t";
fh << id.str() << "\t";
}
- if (types[loc->marker] == "lmx--")
- fh << "<lmxll>";
- else if (types[loc->marker] == "--xnp")
- fh << "<nnxnp>";
- else
- fh << "<" << types[loc->marker] << ">";
+ if (types[loc->marker] == "lmx--")
+ fh << "<lmxll>";
+ else if (types[loc->marker] == "--xnp")
+ fh << "<nnxnp>";
+ else
+ fh << "<" << types[loc->marker] << ">";
- Datum **d = pmap->locus(loc->id);
+ Datum **d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- fh << "\t";
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ fh << "\t";
- if (d[i] == NULL)
- map_type == cp ? fh << "--" : fh << "-";
- else
- fh << d[i]->trans_gtype;
- }
+ if (d[i] == NULL)
+ map_type == cp ? fh << "--" : fh << "-";
+ else
+ fh << d[i]->trans_gtype;
+ }
- fh << "\n";
+ fh << "\n";
}
fh << "\nindividual names:\n";
map<int, string>::iterator s;
for (s = samples.begin(); s != samples.end(); s++) {
- if (parent_ids.count(s->first)) continue;
- fh << s->second << "\n";
+ if (parent_ids.count(s->first)) continue;
+ fh << s->second << "\n";
}
fh.close();
@@ -2339,7 +2339,7 @@ write_joinmap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, s
return 0;
}
-int
+int
write_onemap_mapmaker(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, string> &types, map<int, string> &samples, set<int> &parent_ids)
{
stringstream pop_name;
@@ -2350,7 +2350,7 @@ write_onemap_mapmaker(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<s
if (fh.fail()) {
cerr << "Error opening joinmap output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -2361,10 +2361,10 @@ write_onemap_mapmaker(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<s
int num_loci = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (loc->trans_gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (loc->trans_gcnt < progeny_limit) continue;
- num_loci++;
+ num_loci++;
}
cerr << "Writing " << num_loci << " loci to OneMap file, '" << file << "'\n";
@@ -2372,38 +2372,38 @@ write_onemap_mapmaker(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<s
// Output map type.
//
if (map_type == f2 )
- fh << "data type f2 intercross\n";
+ fh << "data type f2 intercross\n";
else if (map_type == bc1)
- fh << "data type f2 backcross\n";
+ fh << "data type f2 backcross\n";
//
- // Output the header: number of individuals, number of markers, number of
+ // Output the header: number of individuals, number of markers, number of
// quantitative traits (none).
//
fh << pmap->sample_cnt() - parent_ids.size() << " " << num_loci << " " << "0\n\n";
-
+
//
// Output each locus.
//
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (loc->trans_gcnt < progeny_limit) continue;
+ if (loc->trans_gcnt < progeny_limit) continue;
- fh << "*" << loc->id;
+ fh << "*" << loc->id;
- Datum **d = pmap->locus(loc->id);
+ Datum **d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- fh << " ";
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ fh << " ";
- if (d[i] == NULL)
- fh << "-";
- else
- fh << d[i]->trans_gtype;
- }
- fh << "\n";
+ if (d[i] == NULL)
+ fh << "-";
+ else
+ fh << d[i]->trans_gtype;
+ }
+ fh << "\n";
}
fh.close();
@@ -2411,7 +2411,7 @@ write_onemap_mapmaker(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<s
return 0;
}
-int
+int
write_onemap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, string> &types, map<int, string> &samples, set<int> &parent_ids)
{
stringstream pop_name;
@@ -2422,7 +2422,7 @@ write_onemap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, st
if (fh.fail()) {
cerr << "Error opening joinmap output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -2433,10 +2433,10 @@ write_onemap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, st
int num_loci = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (loc->trans_gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (loc->trans_gcnt < progeny_limit) continue;
- num_loci++;
+ num_loci++;
}
cerr << "Writing " << num_loci << " loci to OneMap file, '" << file << "'\n";
@@ -2454,33 +2454,33 @@ write_onemap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, st
// Output the header: number of individuals followed by number of markers.
//
fh << pmap->sample_cnt() - parent_ids.size() << "\t" << num_loci << "\n";
-
+
//
// Output each locus.
//
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (loc->trans_gcnt < progeny_limit) continue;
+ if (loc->trans_gcnt < progeny_limit) continue;
- fh << "*" << loc->id << " "
- << marker_types[types[loc->marker]] << "\t";
+ fh << "*" << loc->id << " "
+ << marker_types[types[loc->marker]] << "\t";
- Datum **d = pmap->locus(loc->id);
+ Datum **d = pmap->locus(loc->id);
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- if (d[i] == NULL)
- fh << "-";
- else
- fh << d[i]->trans_gtype;
+ if (d[i] == NULL)
+ fh << "-";
+ else
+ fh << d[i]->trans_gtype;
- if (i < pmap->sample_cnt() - 1)
- fh << ",";
- }
+ if (i < pmap->sample_cnt() - 1)
+ fh << ",";
+ }
- fh << "\n";
+ fh << "\n";
}
fh.close();
@@ -2488,8 +2488,8 @@ write_onemap(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, st
return 0;
}
-int
-write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, string> &types, map<int, string> &samples, set<int> &parent_ids)
+int
+write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, string> &types, map<int, string> &samples, set<int> &parent_ids)
{
stringstream pop_name;
pop_name << "batch_" << batch_id << ".genotypes_" << progeny_limit;
@@ -2499,7 +2499,7 @@ write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, stri
if (fh.fail()) {
cerr << "Error opening R/QTL output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -2510,10 +2510,10 @@ write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, stri
int num_loci = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (loc->trans_gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (loc->trans_gcnt < progeny_limit) continue;
- num_loci++;
+ num_loci++;
}
cerr << "Writing " << num_loci << " loci to R/QTL file, '" << file << "'\n";
@@ -2532,15 +2532,15 @@ write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, stri
<< "# Num Samples " << pmap->sample_cnt() - parent_ids.size() << "\n";
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (loc->gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (loc->gcnt < progeny_limit) continue;
- fh << ",";
+ fh << ",";
- stringstream id;
- loc->annotation.length() > 0 ?
+ stringstream id;
+ loc->annotation.length() > 0 ?
id << loc->id << "|" << loc->annotation : id << loc->id;
- fh << id.str();
+ fh << id.str();
}
fh << "\n";
@@ -2548,29 +2548,29 @@ write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, stri
// Output the chromosome (if available) for each marker and then the location
//
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (loc->gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (loc->gcnt < progeny_limit) continue;
- fh << ",";
+ fh << ",";
- string chr;
+ string chr;
chr = strlen(loc->loc.chr) > 0 ? loc->loc.chr : "1";
- fh << chr;
+ fh << chr;
}
fh << "\n";
int i = 1;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (loc->gcnt < progeny_limit) continue;
+ loc = it->second;
+ if (loc->gcnt < progeny_limit) continue;
- fh << ",";
+ fh << ",";
int bp = loc->loc.bp > 0 ? loc->loc.bp : i;
- fh << bp;
- i++;
+ fh << bp;
+ i++;
}
fh << "\n";
@@ -2579,23 +2579,23 @@ write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, stri
//
Datum *d;
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (parent_ids.count(pmap->rev_sample_index(i))) continue;
+ if (parent_ids.count(pmap->rev_sample_index(i))) continue;
- fh << samples[pmap->rev_sample_index(i)];
+ fh << samples[pmap->rev_sample_index(i)];
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- //if (loc->gcnt < progeny_limit) continue;
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ loc = it->second;
+ //if (loc->gcnt < progeny_limit) continue;
- d = pmap->datum(loc->id, pmap->rev_sample_index(i));
- fh << ",";
+ d = pmap->datum(loc->id, pmap->rev_sample_index(i));
+ fh << ",";
- if (d == NULL)
- map_type == cp ? fh << "--" : fh << "-";
- else
- fh << d->trans_gtype;
- }
- fh << "\n";
+ if (d == NULL)
+ map_type == cp ? fh << "--" : fh << "-";
+ else
+ fh << d->trans_gtype;
+ }
+ fh << "\n";
}
fh.close();
@@ -2615,31 +2615,31 @@ write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, stri
// // record whether those genotypes are heterozygous or homozygous.
// //
// foreach $key (keys %{$progeny}) {
-// my $alleles;
+// my $alleles;
-// print STDERR "Examining progeny $key\n" if ($debug);
-// //
-// // Discard progeny with more than one locus matched to this catalog tag.
-// //
-// @keys = keys %{$progeny->{$key}};
-// next if (scalar(@keys) > 1);
+// print STDERR "Examining progeny $key\n" if ($debug);
+// //
+// // Discard progeny with more than one locus matched to this catalog tag.
+// //
+// @keys = keys %{$progeny->{$key}};
+// next if (scalar(@keys) > 1);
-// $alleles = join("|", sort @{$progeny->{$key}->{$keys[0]}});
+// $alleles = join("|", sort @{$progeny->{$key}->{$keys[0]}});
-// if (!defined($allcnt{$alleles})) {
-// $allcnt{$alleles} = scalar(@{$progeny->{$key}->{$keys[0]}});
-// }
-// //print STDERR "Adding genotype $alleles\n";
+// if (!defined($allcnt{$alleles})) {
+// $allcnt{$alleles} = scalar(@{$progeny->{$key}->{$keys[0]}});
+// }
+// //print STDERR "Adding genotype $alleles\n";
-// $gtypes{$alleles}++;
+// $gtypes{$alleles}++;
-// foreach $allele (@{$progeny->{$key}->{$keys[0]}}) {
-// $uniqall{$allele}++;
-// }
+// foreach $allele (@{$progeny->{$key}->{$keys[0]}}) {
+// $uniqall{$allele}++;
+// }
// }
-
+
// //
-// // Examine the first parent alleles (the only alleles we have, since
+// // Examine the first parent alleles (the only alleles we have, since
// // we are imputing the second parent.
// //
// my @parents = keys %{$parents};
@@ -2648,31 +2648,31 @@ write_rqtl(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<string, stri
// $m = substr($marker, 0, 2);
// foreach $type (split(//, $m)) {
-// //print STDERR " Adding $type to genotypes\n" if ($debug);
+// //print STDERR " Adding $type to genotypes\n" if ($debug);
// $legal_genotypes{$type}++;
// }
// my @types = sort keys %legal_genotypes;
// if ($marker eq "lmxll") {
-// @keys = sort {$gtypes{$b} <=> $gtypes{$a}} keys %gtypes;
-// //
-// // Discard heterozygous alleles and find the highest frequency homozygote,
-// // this is the "l" in the "lmxll" marker.
-// //
-// while ($allcnt{$keys[0]} == 2) {
-// shift @keys;
-// }
-// $map->{$keys[0]} = shift @types;
-// print STDERR " Assinging '$keys[0]' to first parent genotype '", $map->{$keys[0]}, "'\n" if ($debug);
-
-// foreach $uall (sort {$uniqall{$b} <=> $uniqall{$a}} keys %uniqall) {
-// if ($uall ne $keys[0]) {
-// $allele = $uall;
-// last;
-// }
-// }
-// $map->{$allele} = shift @types;
-// print STDERR " Assinging '$allele' to first parent genotype '", $map->{$allele}, "'\n" if ($debug);
+// @keys = sort {$gtypes{$b} <=> $gtypes{$a}} keys %gtypes;
+// //
+// // Discard heterozygous alleles and find the highest frequency homozygote,
+// // this is the "l" in the "lmxll" marker.
+// //
+// while ($allcnt{$keys[0]} == 2) {
+// shift @keys;
+// }
+// $map->{$keys[0]} = shift @types;
+// print STDERR " Assinging '$keys[0]' to first parent genotype '", $map->{$keys[0]}, "'\n" if ($debug);
+
+// foreach $uall (sort {$uniqall{$b} <=> $uniqall{$a}} keys %uniqall) {
+// if ($uall ne $keys[0]) {
+// $allele = $uall;
+// last;
+// }
+// }
+// $map->{$allele} = shift @types;
+// print STDERR " Assinging '$allele' to first parent genotype '", $map->{$allele}, "'\n" if ($debug);
// }
// }
@@ -2683,28 +2683,28 @@ int load_marker_list(string path, set<int> &list) {
if (fh.fail()) {
cerr << "Error opening white/black list file '" << path << "'\n";
- exit(1);
+ exit(1);
}
int marker;
char *e;
while (fh.good()) {
- fh.getline(line, id_len);
+ fh.getline(line, id_len);
- if (strlen(line) == 0) continue;
+ if (strlen(line) == 0) continue;
- marker = (int) strtol(line, &e, 10);
+ marker = (int) strtol(line, &e, 10);
- if (*e == '\0')
- list.insert(marker);
+ if (*e == '\0')
+ list.insert(marker);
}
fh.close();
if (list.size() == 0) {
- cerr << "Unable to load any markers from '" << path << "'\n";
- help();
+ cerr << "Unable to load any markers from '" << path << "'\n";
+ help();
}
return 0;
@@ -2716,166 +2716,166 @@ bool hap_compare(pair<string, int> a, pair<string, int> b) {
int parse_command_line(int argc, char* argv[]) {
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
{"corr", no_argument, NULL, 'c'},
{"sql", no_argument, NULL, 's'},
- {"num_threads", required_argument, NULL, 'p'},
- {"batch_id", required_argument, NULL, 'b'},
- {"in_path", required_argument, NULL, 'P'},
- {"map_type", required_argument, NULL, 't'},
- {"out_type", required_argument, NULL, 'o'},
- {"progeny", required_argument, NULL, 'r'},
- {"min_depth", required_argument, NULL, 'm'},
- {"min_hom_seqs", required_argument, NULL, 'H'},
- {"min_het_seqs", required_argument, NULL, 'N'},
- {"max_het_seqs", required_argument, NULL, 'X'},
- {"renz", required_argument, NULL, 'e'},
- {"whitelist", required_argument, NULL, 'W'},
- {"blacklist", required_argument, NULL, 'B'},
- {"man_corr", required_argument, NULL, 'C'},
- {"lnl_lim", required_argument, NULL, 'L'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvcsib:p:t:o:r:P:m:e:H:N:X:W:B:C:L:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'P':
- in_path = optarg;
- break;
- case 'b':
- batch_id = is_integer(optarg);
- if (batch_id < 0) {
- cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 't':
- if (strcasecmp(optarg, "cp") == 0)
+ {"num_threads", required_argument, NULL, 'p'},
+ {"batch_id", required_argument, NULL, 'b'},
+ {"in_path", required_argument, NULL, 'P'},
+ {"map_type", required_argument, NULL, 't'},
+ {"out_type", required_argument, NULL, 'o'},
+ {"progeny", required_argument, NULL, 'r'},
+ {"min_depth", required_argument, NULL, 'm'},
+ {"min_hom_seqs", required_argument, NULL, 'H'},
+ {"min_het_seqs", required_argument, NULL, 'N'},
+ {"max_het_seqs", required_argument, NULL, 'X'},
+ {"renz", required_argument, NULL, 'e'},
+ {"whitelist", required_argument, NULL, 'W'},
+ {"blacklist", required_argument, NULL, 'B'},
+ {"man_corr", required_argument, NULL, 'C'},
+ {"lnl_lim", required_argument, NULL, 'L'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvcsib:p:t:o:r:P:m:e:H:N:X:W:B:C:L:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'P':
+ in_path = optarg;
+ break;
+ case 'b':
+ batch_id = is_integer(optarg);
+ if (batch_id < 0) {
+ cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 't':
+ if (strcasecmp(optarg, "cp") == 0)
map_type = cp;
- else if (strcasecmp(optarg, "bc1") == 0)
- map_type = bc1;
- else if (strcasecmp(optarg, "f2") == 0)
- map_type = f2;
- else if (strcasecmp(optarg, "dh") == 0)
- map_type = dh;
- else if (strcasecmp(optarg, "gen") == 0)
- map_type = gen;
- else
- map_type = unk;
- break;
- case 'o':
- if (strcasecmp(optarg, "joinmap") == 0)
+ else if (strcasecmp(optarg, "bc1") == 0)
+ map_type = bc1;
+ else if (strcasecmp(optarg, "f2") == 0)
+ map_type = f2;
+ else if (strcasecmp(optarg, "dh") == 0)
+ map_type = dh;
+ else if (strcasecmp(optarg, "gen") == 0)
+ map_type = gen;
+ else
+ map_type = unk;
+ break;
+ case 'o':
+ if (strcasecmp(optarg, "joinmap") == 0)
out_type = joinmap;
- else if (strcasecmp(optarg, "rqtl") == 0)
- out_type = rqtl;
- else if (strcasecmp(optarg, "onemap") == 0)
- out_type = onemap;
- else if (strcasecmp(optarg, "genomic") == 0)
- out_type = genomic;
- break;
- case 'r':
- progeny_limit = atoi(optarg);
- break;
- case 'c':
- corrections = true;
- break;
- case 'L':
- lnl_limit = is_double(optarg);
- filter_lnl = true;
- break;
- case 'i':
- expand_id = true;
- break;
- case 's':
- sql_out = true;
- break;
- case 'W':
- wl_file = optarg;
- break;
- case 'B':
- bl_file = optarg;
- break;
- case 'C':
- man_corrections = true;
- cor_path = optarg;
- break;
- case 'm':
- min_stack_depth = is_integer(optarg);
- break;
- case 'H':
- min_hom_seqs = is_integer(optarg);
- break;
- case 'N':
- min_het_seqs = is_double(optarg);
- break;
- case 'X':
- max_het_seqs = is_double(optarg);
- break;
- case 'e':
- enz = optarg;
- break;
+ else if (strcasecmp(optarg, "rqtl") == 0)
+ out_type = rqtl;
+ else if (strcasecmp(optarg, "onemap") == 0)
+ out_type = onemap;
+ else if (strcasecmp(optarg, "genomic") == 0)
+ out_type = genomic;
+ break;
+ case 'r':
+ progeny_limit = atoi(optarg);
+ break;
+ case 'c':
+ corrections = true;
+ break;
+ case 'L':
+ lnl_limit = is_double(optarg);
+ filter_lnl = true;
+ break;
+ case 'i':
+ expand_id = true;
+ break;
+ case 's':
+ sql_out = true;
+ break;
+ case 'W':
+ wl_file = optarg;
+ break;
+ case 'B':
+ bl_file = optarg;
+ break;
+ case 'C':
+ man_corrections = true;
+ cor_path = optarg;
+ break;
+ case 'm':
+ min_stack_depth = is_integer(optarg);
+ break;
+ case 'H':
+ min_hom_seqs = is_integer(optarg);
+ break;
+ case 'N':
+ min_het_seqs = is_double(optarg);
+ break;
+ case 'X':
+ max_het_seqs = is_double(optarg);
+ break;
+ case 'e':
+ enz = optarg;
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
- default:
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+ default:
+ help();
+ abort();
+ }
}
if (in_path.length() == 0) {
- cerr << "You must specify a path to the directory containing Stacks output files.\n";
- help();
+ cerr << "You must specify a path to the directory containing Stacks output files.\n";
+ help();
}
- if (in_path.at(in_path.length() - 1) != '/')
- in_path += "/";
+ if (in_path.at(in_path.length() - 1) != '/')
+ in_path += "/";
if (batch_id < 0) {
- cerr << "You must specify a batch ID.\n";
- help();
+ cerr << "You must specify a batch ID.\n";
+ help();
}
if (map_type != cp &&
- map_type != dh &&
- map_type != bc1 &&
- map_type != f2 &&
- map_type != gen &&
- map_type != none) {
+ map_type != dh &&
+ map_type != bc1 &&
+ map_type != f2 &&
+ map_type != gen &&
+ map_type != none) {
cerr << "You must specify a valid map type. 'CP', 'DH', 'F2', 'BC1' and 'GEN' are the currently supported map types.\n";
help();
}
if (map_type != none && min_stack_depth > 0)
- cerr << "Warning: using a minimum stack depth when building genetic markers is not recommended.\n";
+ cerr << "Warning: using a minimum stack depth when building genetic markers is not recommended.\n";
if (out_type == genomic && enz.length() == 0) {
- cerr << "You must specify the restriction enzyme used with 'genomic' output.\n";
- help();
+ cerr << "You must specify the restriction enzyme used with 'genomic' output.\n";
+ help();
}
if (out_type == genomic && renz.count(enz) == 0) {
- cerr << "Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
- help();
+ cerr << "Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
+ help();
}
return 0;
@@ -2890,27 +2890,27 @@ void version() {
void help() {
std::cerr << "genotypes " << VERSION << "\n"
<< "genotypes -b batch_id -P path [-r min] [-m min] [-t map_type -o type] [-B blacklist] [-W whitelist] [-c] [-s] [-e renz] [-v] [-h]" << "\n"
- << " b: Batch ID to examine when exporting from the catalog.\n"
- << " r: minimum number of progeny required to print a marker.\n"
- << " c: make automated corrections to the data.\n"
- << " P: path to the Stacks output files.\n"
- << " t: map type to write. 'CP', 'DH', 'F2', 'BC1' and 'GEN' are the currently supported map types.\n"
- << " o: output file type to write, 'joinmap', 'onemap', 'rqtl', and 'genomic' are currently supported.\n"
- << " m: specify a minimum stack depth required before exporting a locus in a particular individual.\n"
- << " s: output a file to import results into an SQL database.\n"
- << " B: specify a file containing Blacklisted markers to be excluded from the export.\n"
- << " W: specify a file containign Whitelisted markers to include in the export.\n"
- << " e: restriction enzyme, required if generating 'genomic' output.\n"
- << " v: print program version." << "\n"
- << " h: display this help messsage." << "\n"
- << " Filtering options:\n"
- << " --lnl_lim [num]: filter loci with log likelihood values below this threshold.\n"
- << " Automated corrections options:\n"
- << " --min_hom_seqs: minimum number of reads required at a stack to call a homozygous genotype (default 5).\n"
- << " --min_het_seqs: below this minor allele frequency a stack is called a homozygote, above it (but below --max_het_seqs) it is called unknown (default 0.05).\n"
- << " --max_het_seqs: minimum frequency of minor allele to call a heterozygote (default 0.1).\n"
- << " Manual corrections options:\n"
- << " --cor_path <path>: path to file containing manual genotype corrections from a Stacks SQL database to incorporate into output.\n";
+ << " b: Batch ID to examine when exporting from the catalog.\n"
+ << " r: minimum number of progeny required to print a marker.\n"
+ << " c: make automated corrections to the data.\n"
+ << " P: path to the Stacks output files.\n"
+ << " t: map type to write. 'CP', 'DH', 'F2', 'BC1' and 'GEN' are the currently supported map types.\n"
+ << " o: output file type to write, 'joinmap', 'onemap', 'rqtl', and 'genomic' are currently supported.\n"
+ << " m: specify a minimum stack depth required before exporting a locus in a particular individual.\n"
+ << " s: output a file to import results into an SQL database.\n"
+ << " B: specify a file containing Blacklisted markers to be excluded from the export.\n"
+ << " W: specify a file containign Whitelisted markers to include in the export.\n"
+ << " e: restriction enzyme, required if generating 'genomic' output.\n"
+ << " v: print program version." << "\n"
+ << " h: display this help messsage." << "\n"
+ << " Filtering options:\n"
+ << " --lnl_lim [num]: filter loci with log likelihood values below this threshold.\n"
+ << " Automated corrections options:\n"
+ << " --min_hom_seqs: minimum number of reads required at a stack to call a homozygous genotype (default 5).\n"
+ << " --min_het_seqs: below this minor allele frequency a stack is called a homozygote, above it (but below --max_het_seqs) it is called unknown (default 0.05).\n"
+ << " --max_het_seqs: minimum frequency of minor allele to call a heterozygote (default 0.1).\n"
+ << " Manual corrections options:\n"
+ << " --cor_path <path>: path to file containing manual genotype corrections from a Stacks SQL database to incorporate into output.\n";
exit(0);
}
diff --git a/src/genotypes.h b/src/genotypes.h
index 8c5ea05..7a8476b 100644
--- a/src/genotypes.h
+++ b/src/genotypes.h
@@ -98,7 +98,7 @@ int check_homozygosity(vector<char *> &, int, char, char, string &);
int manual_corrections(string, PopMap<CSLocus> *);
int correct_cp_markers_missing_alleles(set<int> &, map<int, CSLocus *> &, PopMap<CSLocus> *);
-int calc_segregation_distortion(map<string, map<string, double> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, set<int> &);
+int calc_segregation_distortion(map<string, map<string, double> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, set<int> &);
double tally_generic_gtypes(int, PopMap<CSLocus> *, set<int> &, map<string, int> &);
double tally_translated_gtypes(int, PopMap<CSLocus> *, set<int> &, map<string, string> &, map<string, int> &);
double chisq_test(map<string, map<string, double> > &, map<string, int> &, string, double);
diff --git a/src/gzFasta.h b/src/gzFasta.h
index 59a522f..c289e7c 100644
--- a/src/gzFasta.h
+++ b/src/gzFasta.h
@@ -32,28 +32,28 @@ class GzFasta: public Input {
string buf;
public:
- GzFasta(const char *path) : Input() {
- this->gz_fh = gzopen(path, "rb");
- if (!this->gz_fh) {
- cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
+ GzFasta(const char *path) : Input() {
+ this->gz_fh = gzopen(path, "rb");
+ if (!this->gz_fh) {
+ cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
exit(EXIT_FAILURE);
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(this->gz_fh, libz_buffer_size);
- #endif
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(this->gz_fh, libz_buffer_size);
+ #endif
};
- GzFasta(string path) : Input() {
- this->gz_fh = gzopen(path.c_str(), "rb");
- if (!this->gz_fh) {
- cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
+ GzFasta(string path) : Input() {
+ this->gz_fh = gzopen(path.c_str(), "rb");
+ if (!this->gz_fh) {
+ cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
exit(EXIT_FAILURE);
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(this->gz_fh, libz_buffer_size);
- #endif
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(this->gz_fh, libz_buffer_size);
+ #endif
};
~GzFasta() {
- gzclose(this->gz_fh);
+ gzclose(this->gz_fh);
};
Seq *next_seq();
int next_seq(Seq &);
@@ -66,11 +66,11 @@ Seq *GzFasta::next_seq() {
// record.
//
while (this->line[0] != '>' && !gzeof(this->gz_fh)) {
- gzgets(this->gz_fh, this->line, max_len);
+ gzgets(this->gz_fh, this->line, max_len);
}
if (gzeof(this->gz_fh)) {
- return NULL;
+ return NULL;
}
//
@@ -94,22 +94,22 @@ Seq *GzFasta::next_seq() {
gzgets(this->gz_fh, this->line, max_len);
while (this->line[0] != '>' && !gzeof(this->gz_fh)) {
- len = strlen(this->line);
- if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
- if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
+ if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
- this->buf += this->line;
- this->line[0] = '\0';
- gzgets(this->gz_fh, this->line, max_len);
+ this->buf += this->line;
+ this->line[0] = '\0';
+ gzgets(this->gz_fh, this->line, max_len);
}
if (gzeof(this->gz_fh)) {
- len = strlen(this->line);
- if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
- if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
+ if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
- this->buf += this->line;
- this->line[0] = '\0';
+ this->buf += this->line;
+ this->line[0] = '\0';
}
s->seq = new char[this->buf.length() + 1];
@@ -126,11 +126,11 @@ int GzFasta::next_seq(Seq &s) {
// record.
//
while (this->line[0] != '>' && !gzeof(this->gz_fh)) {
- gzgets(this->gz_fh, this->line, max_len);
+ gzgets(this->gz_fh, this->line, max_len);
}
if (gzeof(this->gz_fh)) {
- return 0;
+ return 0;
}
//
@@ -152,22 +152,22 @@ int GzFasta::next_seq(Seq &s) {
gzgets(this->gz_fh, this->line, max_len);
while (this->line[0] != '>' && !gzeof(this->gz_fh)) {
- len = strlen(this->line);
- if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
- if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
+ if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
- this->buf += this->line;
- this->line[0] = '\0';
- gzgets(this->gz_fh, this->line, max_len);
+ this->buf += this->line;
+ this->line[0] = '\0';
+ gzgets(this->gz_fh, this->line, max_len);
}
if (gzeof(this->gz_fh)) {
- len = strlen(this->line);
- if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
- if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
+ if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
- this->buf += this->line;
- this->line[0] = '\0';
+ this->buf += this->line;
+ this->line[0] = '\0';
}
strcpy(s.seq, this->buf.c_str());
diff --git a/src/gzFastq.h b/src/gzFastq.h
index b5292ac..7c9b497 100644
--- a/src/gzFastq.h
+++ b/src/gzFastq.h
@@ -31,28 +31,28 @@ class GzFastq: public Input {
gzFile gz_fh;
public:
- GzFastq(string path) : Input() {
- this->gz_fh = gzopen(path.c_str(), "rb");
- if (!this->gz_fh) {
- cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
- exit(EXIT_FAILURE);
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(this->gz_fh, libz_buffer_size);
- #endif
+ GzFastq(string path) : Input() {
+ this->gz_fh = gzopen(path.c_str(), "rb");
+ if (!this->gz_fh) {
+ cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
+ exit(EXIT_FAILURE);
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(this->gz_fh, libz_buffer_size);
+ #endif
};
- GzFastq(const char *path) : Input() {
- this->gz_fh = gzopen(path, "rb");
- if (!this->gz_fh) {
- cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
- exit(EXIT_FAILURE);
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(this->gz_fh, libz_buffer_size);
- #endif
+ GzFastq(const char *path) : Input() {
+ this->gz_fh = gzopen(path, "rb");
+ if (!this->gz_fh) {
+ cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
+ exit(EXIT_FAILURE);
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(this->gz_fh, libz_buffer_size);
+ #endif
};
~GzFastq() {
- gzclose(this->gz_fh);
+ gzclose(this->gz_fh);
};
Seq *next_seq();
int next_seq(Seq &s);
@@ -68,11 +68,11 @@ Seq *GzFastq::next_seq() {
//
this->line[0] = '\0';
do {
- res = gzgets(this->gz_fh, this->line, max_len);
+ res = gzgets(this->gz_fh, this->line, max_len);
} while (this->line[0] != '@' && res != NULL);
if (res == NULL) {
- return NULL;
+ return NULL;
}
//
@@ -95,7 +95,7 @@ Seq *GzFastq::next_seq() {
gzgets(this->gz_fh, this->line, max_len);
if (gzeof(this->gz_fh)) {
- return NULL;
+ return NULL;
}
len = strlen(this->line);
@@ -112,7 +112,7 @@ Seq *GzFastq::next_seq() {
res = gzgets(this->gz_fh, this->line, max_len);
if (this->line[0] != '+' || res == NULL) {
- return NULL;
+ return NULL;
}
//
@@ -122,7 +122,7 @@ Seq *GzFastq::next_seq() {
res = gzgets(this->gz_fh, this->line, max_len);
if (res == NULL && strlen(this->line) == 0) {
- return NULL;
+ return NULL;
}
len = strlen(this->line);
@@ -134,7 +134,7 @@ Seq *GzFastq::next_seq() {
//
// Clear the line buffer so it is set up for the next record. If a '@'
- // appears in the quality scores read, it will break parsing next time
+ // appears in the quality scores read, it will break parsing next time
// it is called.
//
this->line[0] = '\0';
@@ -152,11 +152,11 @@ int GzFastq::next_seq(Seq &s) {
//
this->line[0] = '\0';
do {
- res = gzgets(this->gz_fh, this->line, max_len);
+ res = gzgets(this->gz_fh, this->line, max_len);
} while (this->line[0] != '@' && res != NULL);
if (res == NULL) {
- return 0;
+ return 0;
}
//
@@ -178,7 +178,7 @@ int GzFastq::next_seq(Seq &s) {
res = gzgets(this->gz_fh, this->line, max_len);
if (res == NULL) {
- return 0;
+ return 0;
}
len = strlen(this->line);
@@ -194,7 +194,7 @@ int GzFastq::next_seq(Seq &s) {
res = gzgets(this->gz_fh, this->line, max_len);
if (this->line[0] != '+' || res == NULL) {
- return 0;
+ return 0;
}
//
@@ -204,7 +204,7 @@ int GzFastq::next_seq(Seq &s) {
res = gzgets(this->gz_fh, this->line, max_len);
if (res == NULL && strlen(this->line) == 0) {
- return 0;
+ return 0;
}
len = strlen(this->line);
@@ -215,7 +215,7 @@ int GzFastq::next_seq(Seq &s) {
//
// Clear the line buffer so it is set up for the next record. If a '@'
- // appears in the quality scores read, it will break parsing next time
+ // appears in the quality scores read, it will break parsing next time
// it is called.
//
this->line[0] = '\0';
diff --git a/src/hstacks.cc b/src/hstacks.cc
index 5d9924d..e8d725d 100644
--- a/src/hstacks.cc
+++ b/src/hstacks.cc
@@ -68,12 +68,12 @@ int main (int argc, char* argv[]) {
map<int, HLocus *> sample;
map<int, HLocus *>::iterator it;
- size_t pos_1 = (*in_file).find_last_of("/");
- size_t pos_2 = (*in_file).find_last_of(".");
- string sample_id = (*in_file).substr(pos_1 + 1, (pos_2 - pos_1 - 1));
+ size_t pos_1 = (*in_file).find_last_of("/");
+ size_t pos_2 = (*in_file).find_last_of(".");
+ string sample_id = (*in_file).substr(pos_1 + 1, (pos_2 - pos_1 - 1));
- bool compressed = false;
- load_loci(*in_file, sample, false, false, compressed);
+ bool compressed = false;
+ load_loci(*in_file, sample, false, false, compressed);
//
// Give each locus a unique ID among all samples
@@ -146,12 +146,12 @@ int calc_kmer_distance(map<int, HLocus *> &loci, int stack_dist) {
// our map to a vector of integer keys.
//
vector<int> keys;
- for (it = loci.begin(); it != loci.end(); it++)
- keys.push_back(it->first);
+ for (it = loci.begin(); it != loci.end(); it++)
+ keys.push_back(it->first);
#pragma omp parallel private(i, j, tag_1, tag_2, allele)
- {
- #pragma omp for schedule(dynamic)
+ {
+ #pragma omp for schedule(dynamic)
for (i = 0; i < (int) keys.size(); i++) {
tag_1 = loci[keys[i]];
@@ -201,7 +201,7 @@ int calc_kmer_distance(map<int, HLocus *> &loci, int stack_dist) {
allele_cnts[*all_it]++;
for (cnt_it = allele_cnts.begin(); cnt_it != allele_cnts.end(); cnt_it++) {
- //cerr << " allele " << cnt_it->first << " has " << cnt_it->second << " hits\n";
+ //cerr << " allele " << cnt_it->first << " has " << cnt_it->second << " hits\n";
if (cnt_it->second < min_hits) continue;
@@ -212,8 +212,8 @@ int calc_kmer_distance(map<int, HLocus *> &loci, int stack_dist) {
d = dist(allele->second.c_str(), tag_2, cnt_it->first);
if (d < 0)
- cerr <<
- "Unknown error calculating distance between " <<
+ cerr <<
+ "Unknown error calculating distance between " <<
tag_1->id << " and " << tag_2->id << "; query allele: " << allele->first << "\n";
//cerr << " Distance: " << d << " CTAG_DIST: " << ctag_dist << "\n";
@@ -306,43 +306,43 @@ int calc_distance(map<int, HLocus *> &loci, int utag_dist) {
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
vector<int> keys;
- for (it = loci.begin(); it != loci.end(); it++)
+ for (it = loci.begin(); it != loci.end(); it++)
keys.push_back(it->first);
#pragma omp parallel private(i, j, tag_1, tag_2)
- {
- #pragma omp for schedule(dynamic)
- for (i = 0; i < (int) keys.size(); i++) {
+ {
+ #pragma omp for schedule(dynamic)
+ for (i = 0; i < (int) keys.size(); i++) {
- tag_1 = loci[keys[i]];
+ tag_1 = loci[keys[i]];
- int d;
+ int d;
- for (j = 0; j < (int) keys.size(); j++) {
- tag_2 = loci[keys[j]];
+ for (j = 0; j < (int) keys.size(); j++) {
+ tag_2 = loci[keys[j]];
- // Don't compare tag_1 against itself.
- if (tag_1 == tag_2)
- continue;
+ // Don't compare tag_1 against itself.
+ if (tag_1 == tag_2)
+ continue;
- d = dist(tag_1, tag_2);
+ d = dist(tag_1, tag_2);
- //
- // Store the distance between these two sequences if it is
- // below the maximum distance.
- //
- if (d == utag_dist) {
+ //
+ // Store the distance between these two sequences if it is
+ // below the maximum distance.
+ //
+ if (d == utag_dist) {
if (tag_1->depth < stack_depth_min ||
tag_2->depth < stack_depth_min)
continue;
- tag_1->add_match(tag_2->uniq_id, d);
- }
- }
+ tag_1->add_match(tag_2->uniq_id, d);
+ }
+ }
- // Sort the vector of distances.
- sort(tag_1->matches.begin(), tag_1->matches.end(), compare_mdist);
- }
+ // Sort the vector of distances.
+ sort(tag_1->matches.begin(), tag_1->matches.end(), compare_mdist);
+ }
}
return 0;
@@ -358,9 +358,9 @@ int dist(HLocus *tag_1, HLocus *tag_2) {
// between the two sequences. Don't count wildcard 'N'
// nucleotides.
while (p < end) {
- dist += ((*p == *q) || (*q == 'N' || *p == 'N')) ? 0 : 1;
- p++;
- q++;
+ dist += ((*p == *q) || (*q == 'N' || *p == 'N')) ? 0 : 1;
+ p++;
+ q++;
}
return dist;
@@ -370,12 +370,12 @@ bool compare_mdist(Match *a, Match *b) {
return (a->dist < b->dist);
}
-int call_consensus(map<int, HLocus *> &loci, set<int> &merge_list,
+int call_consensus(map<int, HLocus *> &loci, set<int> &merge_list,
string &consensus, vector<SNP *> &snps, vector<string> &alleles) {
//
// Create a two-dimensional array, each row containing one read. For
// each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
+ // that tag into our array as many times as it originally occurred.
//
HLocus *tag;
set<int>::iterator j;
@@ -465,19 +465,19 @@ int call_alleles(vector<char *> &reads, vector<SNP *> &snps, vector<string> &all
vector<SNP *>::iterator snp;
if (snps.size() == 0)
- return 1;
+ return 1;
for (row = 0; row < height; row++) {
- allele.clear();
+ allele.clear();
- for (snp = snps.begin(); snp != snps.end(); snp++) {
- base = reads[row];
- base = base + (*snp)->col;
+ for (snp = snps.begin(); snp != snps.end(); snp++) {
+ base = reads[row];
+ base = base + (*snp)->col;
allele += *base;
- }
+ }
- if (allele.size() == snps.size())
- alleles.push_back(allele);
+ if (allele.size() == snps.size())
+ alleles.push_back(allele);
else
return 0;
}
@@ -515,25 +515,25 @@ int write_homologous_loci(map<int, HLocus *> &samples) {
int id = 1;
for (i = samples.begin(); i != samples.end(); i++) {
- tag_1 = i->second;
+ tag_1 = i->second;
- //
- // This tag may already have been merged by an earlier operation.
- //
- if (write_map.find(tag_1->uniq_id) != write_map.end())
- continue;
+ //
+ // This tag may already have been merged by an earlier operation.
+ //
+ if (write_map.find(tag_1->uniq_id) != write_map.end())
+ continue;
- set<int> unique_merge_list;
+ set<int> unique_merge_list;
set<string> unique_alleles;
- set<int>::iterator it;
+ set<int>::iterator it;
- trace_stack_graph(tag_1, samples, unique_merge_list);
+ trace_stack_graph(tag_1, samples, unique_merge_list);
//
// Call the consensus for this locus and identify SNPs and associated alleles.
//
string consensus;
- vector<SNP *> snps;
+ vector<SNP *> snps;
vector<string> alleles;
call_consensus(samples, unique_merge_list, consensus, snps, alleles);
@@ -542,7 +542,7 @@ int write_homologous_loci(map<int, HLocus *> &samples) {
// Output the consensus tag for a locus in this sample.
//
tag_file <<
- "0" << "\t" <<
+ "0" << "\t" <<
batch_id << "\t" <<
id << "\t" <<
tag_1->loc.chr << "\t" <<
@@ -550,7 +550,7 @@ int write_homologous_loci(map<int, HLocus *> &samples) {
"consensus" << "\t" <<
0 << "\t" <<
"" << "\t" <<
- consensus << "\t" <<
+ consensus << "\t" <<
0 << "\t" << // These flags are unused in hstacks, but important in ustacks
0 << "\t" <<
0 << "\n";
@@ -559,27 +559,27 @@ int write_homologous_loci(map<int, HLocus *> &samples) {
// Output the SNPs and alleles
//
string allele;
- vector<SNP *>::iterator s;
+ vector<SNP *>::iterator s;
set<string>::iterator u;
- for (s = snps.begin(); s != snps.end(); s++)
- snp_file <<
+ for (s = snps.begin(); s != snps.end(); s++)
+ snp_file <<
"0" << "\t" <<
batch_id << "\t" <<
- id << "\t" <<
- (*s)->col << "\t" <<
- (*s)->lratio << "\t" <<
- (*s)->rank_1 << "\t" <<
+ id << "\t" <<
+ (*s)->col << "\t" <<
+ (*s)->lratio << "\t" <<
+ (*s)->rank_1 << "\t" <<
(*s)->rank_2 << "\n";
- for (uint a = 0; a < alleles.size(); a++)
+ for (uint a = 0; a < alleles.size(); a++)
unique_alleles.insert(alleles[a]);
for (u = unique_alleles.begin(); u != unique_alleles.end(); u++)
- all_file <<
+ all_file <<
"0" << "\t" <<
batch_id << "\t" <<
- id << "\t" <<
+ id << "\t" <<
*u << "\t" <<
0 << "\t" <<
0 << "\n";
@@ -589,11 +589,11 @@ int write_homologous_loci(map<int, HLocus *> &samples) {
int sub_id = 0;
int a = 0;
- for (it = unique_merge_list.begin(); it != unique_merge_list.end(); it++) {
- tag_2 = samples[(*it)];
+ for (it = unique_merge_list.begin(); it != unique_merge_list.end(); it++) {
+ tag_2 = samples[(*it)];
- // Record the nodes that have been merged in this round.
- write_map.insert(tag_2->uniq_id);
+ // Record the nodes that have been merged in this round.
+ write_map.insert(tag_2->uniq_id);
//
// For each tag we are outputting, output the depth of coverage for each
@@ -604,25 +604,25 @@ int write_homologous_loci(map<int, HLocus *> &samples) {
char *p, *end;
end = tag_2->con + strlen(tag_2->con);
for (p = tag_2->con; p < end; p++)
- nuc_file
- << tag_2->sample_id << "_" << tag_2->id << "\t"
- << *p << "\t"
+ nuc_file
+ << tag_2->sample_id << "_" << tag_2->id << "\t"
+ << *p << "\t"
<< tag_2->depth << "\n";
}
//
// Output the consensus sequenes for all homologous loci.
//
- tag_file <<
- "0" << "\t" <<
+ tag_file <<
+ "0" << "\t" <<
batch_id << "\t" <<
id << "\t" <<
tag_2->loc.chr << "\t" <<
tag_2->loc.bp << "\t" <<
"primary" << "\t" <<
sub_id << "\t" <<
- tag_2->sample_id << "_" << tag_2->id << "\t" <<
- tag_2->con << "\t" <<
+ tag_2->sample_id << "_" << tag_2->id << "\t" <<
+ tag_2->con << "\t" <<
"" << "\t" << // These flags are unused in hstacks, but important in ustacks
"" << "\t" <<
"" << "\n";
@@ -630,18 +630,18 @@ int write_homologous_loci(map<int, HLocus *> &samples) {
allele = (alleles.size() == 0) ? "consensus" : alleles[a];
mat_file <<
- "0" << "\t" <<
- batch_id << "\t" <<
- id << "\t" <<
- tag_2->sample_id << "\t" <<
- tag_2->uniq_id << "\t" <<
- allele << "\n";
+ "0" << "\t" <<
+ batch_id << "\t" <<
+ id << "\t" <<
+ tag_2->sample_id << "\t" <<
+ tag_2->uniq_id << "\t" <<
+ allele << "\n";
sub_id++;
a++;
- }
+ }
- id++;
+ id++;
}
tag_file.close();
@@ -665,20 +665,20 @@ int trace_stack_graph(HLocus *tag_1, map<int, HLocus *> &loci, set<int> &unique_
merge_list.push(tag_1->uniq_id);
while (!merge_list.empty()) {
- tag_2 = loci[merge_list.front()];
- merge_list.pop();
-
- for (k = tag_2->matches.begin(); k != tag_2->matches.end(); k++) {
- ret = unique_merge_list.insert((*k)->cat_id);
-
- //
- // If this Tag has not already been added to the merge list (i.e. we were able
- // to insert it in to our unique_merge_list, which is a set), add it for consideration
- // later in the loop.
- //
- if (ret.second == true)
- merge_list.push((*k)->cat_id);
- }
+ tag_2 = loci[merge_list.front()];
+ merge_list.pop();
+
+ for (k = tag_2->matches.begin(); k != tag_2->matches.end(); k++) {
+ ret = unique_merge_list.insert((*k)->cat_id);
+
+ //
+ // If this Tag has not already been added to the merge list (i.e. we were able
+ // to insert it in to our unique_merge_list, which is a set), add it for consideration
+ // later in the loop.
+ //
+ if (ret.second == true)
+ merge_list.push((*k)->cat_id);
+ }
}
return 0;
@@ -696,14 +696,14 @@ int build_file_list(string in_path, vector<string> &sql_files) {
}
while ((dirp = readdir(dp)) != NULL) {
- d = string(dirp->d_name);
+ d = string(dirp->d_name);
- if (d.find("tags.tsv") != string::npos &&
+ if (d.find("tags.tsv") != string::npos &&
d.find("batch") == string::npos) {
- size_t pos = d.find(".tags.tsv");
- d = in_path + d.substr(0, pos);
- sql_files.push_back(d);
- }
+ size_t pos = d.find(".tags.tsv");
+ d = in_path + d.substr(0, pos);
+ sql_files.push_back(d);
+ }
}
closedir(dp);
@@ -713,7 +713,7 @@ int build_file_list(string in_path, vector<string> &sql_files) {
return 0;
}
-HLocus::~HLocus()
+HLocus::~HLocus()
{
vector<Match *>::iterator it;
@@ -721,8 +721,8 @@ HLocus::~HLocus()
delete *it;
}
-int
-HLocus::add_match(int id, int distance)
+int
+HLocus::add_match(int id, int distance)
{
Match *m = new Match;
m->cat_id = id;
@@ -733,8 +733,8 @@ HLocus::add_match(int id, int distance)
return 0;
}
-int
-HLocus::populate_alleles()
+int
+HLocus::populate_alleles()
{
this->strings.clear();
@@ -766,7 +766,7 @@ HLocus::populate_alleles()
char *p = this->con;
int i = 0;
while (*p != '\0') {
- if (*p == 'N')
+ if (*p == 'N')
col.push_back(i);
i++;
p++;
@@ -777,7 +777,7 @@ HLocus::populate_alleles()
if (n_cnt == 0) return 0;
//
- // If there are too many Ns in this stack, do not include it in the
+ // If there are too many Ns in this stack, do not include it in the
// search.
//
if (n_cnt > n_limit) {
@@ -824,81 +824,81 @@ HLocus::populate_alleles()
int parse_command_line(int argc, char* argv[]) {
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"stack_dist", required_argument, NULL, 'n'},
- {"depth_min", required_argument, NULL, 'm'},
- {"inpath", required_argument, NULL, 'p'},
- {"outpath", required_argument, NULL, 'o'},
- {"n_limit", required_argument, NULL, 'N'},
- {"batch_id", required_argument, NULL, 'b'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvi:p:o:b:e:m:n:N:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'v':
- version();
- break;
- case 'i':
- in_path = optarg;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'b':
- batch_id = atoi(optarg);
- break;
- case 'N':
- n_limit = atoi(optarg);
- break;
- case 'm':
- stack_depth_min = atoi(optarg);
- break;
- case 'n':
- stack_dist = atoi(optarg);
- break;
- case 'p':
- num_threads = atoi(optarg);
- break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ {"stack_dist", required_argument, NULL, 'n'},
+ {"depth_min", required_argument, NULL, 'm'},
+ {"inpath", required_argument, NULL, 'p'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"n_limit", required_argument, NULL, 'N'},
+ {"batch_id", required_argument, NULL, 'b'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvi:p:o:b:e:m:n:N:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'v':
+ version();
+ break;
+ case 'i':
+ in_path = optarg;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'b':
+ batch_id = atoi(optarg);
+ break;
+ case 'N':
+ n_limit = atoi(optarg);
+ break;
+ case 'm':
+ stack_depth_min = atoi(optarg);
+ break;
+ case 'n':
+ stack_dist = atoi(optarg);
+ break;
+ case 'p':
+ num_threads = atoi(optarg);
+ break;
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (in_path.length() == 0) {
- cerr << "You must specify a path to a set of input files.\n";
- help();
+ cerr << "You must specify a path to a set of input files.\n";
+ help();
}
- if (in_path.at(in_path.length() - 1) != '/')
- in_path += "/";
+ if (in_path.at(in_path.length() - 1) != '/')
+ in_path += "/";
- if (out_path.length() == 0)
- out_path = ".";
+ if (out_path.length() == 0)
+ out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
return 0;
}
@@ -912,14 +912,14 @@ void version() {
void help() {
std::cerr << "hstacks " << VERSION << "\n"
<< "hstacks -i path [-o path] [-b batch_id] [-n mismatches] [-m min] [-p min_threads] [-N limit] [-h]" << "\n"
- << " i: path to the set of SQL files from which to load loci." << "\n"
- << " o: output path to write results." << "\n"
- << " b: SQL Batch ID to insert into the output to identify a group of samples." << "\n"
+ << " i: path to the set of SQL files from which to load loci." << "\n"
+ << " o: output path to write results." << "\n"
+ << " b: SQL Batch ID to insert into the output to identify a group of samples." << "\n"
<< " m: minimum stack depth required for a locus to be included in the search." << "\n"
<< " n: number of mismatches to allow between stacks." << "\n"
<< " N: number of 'N' characters to allow in a stack (default: 4)." << "\n"
<< " p: enable parallel execution with num_threads threads.\n"
- << " h: display this help messsage." << "\n\n";
+ << " h: display this help messsage." << "\n\n";
exit(0);
}
diff --git a/src/input.cc b/src/input.cc
index d58b6a6..6252a25 100644
--- a/src/input.cc
+++ b/src/input.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -21,44 +21,77 @@
//
// input.cc -- routines to read various formats of data into the XXX data structure.
//
-// Julian Catchen
-// jcatchen at uoregon.edu
-// University of Oregon
-//
-// $Id$
-//
#include "input.h"
-Seq::Seq() {
+Seq::Seq() {
this->id = NULL;
this->seq = NULL;
this->qual = NULL;
this->loc_str = NULL;
+ this->aln_type = pri_aln;
+ this->pct_aln = 1.0;
+}
+
+Seq::Seq(const Seq& other)
+ : loc(other.loc) {
+ if (other.id != NULL) {
+ id = new char[strlen(other.id)+1];
+ strcpy(id, other.id);
+ } else {
+ id = NULL;
+ }
+ if (other.seq != NULL) {
+ seq = new char[strlen(other.seq)+1];
+ strcpy(seq, other.seq);
+ } else {
+ seq = NULL;
+ }
+ if (other.qual != NULL) {
+ qual = new char[strlen(other.qual)+1];
+ strcpy(qual, other.qual);
+ } else {
+ qual = NULL;
+ }
+ if (other.loc_str != NULL) {
+ loc_str = new char[strlen(other.loc_str)+1];
+ strcpy(loc_str, other.loc_str);
+ } else {
+ loc_str = NULL;
+ }
+
+ pct_aln = other.pct_aln;
+ aln_type = other.aln_type;
}
-Seq::Seq(const char *id, const char *seq) {
+Seq::Seq(const char *id, const char *seq) {
this->id = new char[strlen(id) + 1];
this->seq = new char[strlen(seq) + 1];
this->qual = NULL;
this->loc_str = NULL;
- strcpy(this->id, id);
+ strcpy(this->id, id);
strcpy(this->seq, seq);
+
+ this->aln_type = pri_aln;
+ this->pct_aln = 1.0;
}
-Seq::Seq(const char *id, const char *seq, const char *qual) {
+Seq::Seq(const char *id, const char *seq, const char *qual) {
this->id = new char[strlen(id) + 1];
this->seq = new char[strlen(seq) + 1];
this->qual = new char[strlen(qual) + 1];
this->loc_str = NULL;
- strcpy(this->id, id);
- strcpy(this->seq, seq);
- strcpy(this->qual, qual);
+ strcpy(this->id, id);
+ strcpy(this->seq, seq);
+ strcpy(this->qual, qual);
+
+ this->aln_type = pri_aln;
+ this->pct_aln = 1.0;
}
-Seq::Seq(const char *id, const char *seq, const char *qual, const char *chr, uint bp, strand_type strand) {
+Seq::Seq(const char *id, const char *seq, const char *qual, const char *chr, uint bp, strand_type strand) {
this->id = new char[strlen(id) + 1];
this->qual = new char[strlen(qual) + 1];
this->loc_str = new char[strlen(chr) + 15];
@@ -70,14 +103,74 @@ Seq::Seq(const char *id, const char *seq, const char *qual, const char *chr, uin
sprintf(this->loc_str, "%s|%d|%c", chr, bp, strand == strand_plus ? '+' : '-');
//
- // Reverse complement sequences from the negative strand
+ // Reverse complement sequences from the negative strand
//
if (strand == strand_plus) {
- this->seq = new char[strlen(seq) + 1];
- strcpy(this->seq, seq);
+ this->seq = new char[strlen(seq) + 1];
+ strcpy(this->seq, seq);
} else {
- this->seq = rev_comp(seq);
+ this->seq = rev_comp(seq);
}
+
+ this->aln_type = pri_aln;
+ this->pct_aln = 1.0;
+}
+
+Seq::Seq(const char *id, const char *seq, const char *qual, const char *chr, uint bp, strand_type strand, alnt aln_type, double pct_aln) {
+ this->id = new char[strlen(id) + 1];
+ this->qual = new char[strlen(qual) + 1];
+ this->loc_str = new char[strlen(chr) + 15];
+
+ strcpy(this->id, id);
+ strcpy(this->qual, qual);
+ this->loc.set(chr, bp, strand);
+
+ sprintf(this->loc_str, "%s|%d|%c", chr, bp, strand == strand_plus ? '+' : '-');
+
+ //
+ // Reverse complement sequences from the negative strand
+ //
+ if (strand == strand_plus) {
+ this->seq = new char[strlen(seq) + 1];
+ strcpy(this->seq, seq);
+ } else {
+ this->seq = rev_comp(seq);
+ }
+
+ this->aln_type = aln_type;
+ this->pct_aln = pct_aln;
+}
+
+void swap(Seq& s1, Seq& s2) {
+ char *ptr;
+ alnt a;
+ double p;
+
+ ptr = s1.id;
+ s1.id = s2.id;
+ s2.id = ptr;
+
+ ptr = s1.seq;
+ s1.seq = s2.seq;
+ s2.seq = ptr;
+
+ ptr = s1.qual;
+ s1.qual = s2.qual;
+ s2.qual = ptr;
+
+ ptr = s1.loc_str;
+ s1.loc_str = s2.loc_str;
+ s2.loc_str = ptr;
+
+ a = s1.aln_type;
+ s1.aln_type = s2.aln_type;
+ s2.aln_type = a;
+
+ p = s1.pct_aln;
+ s1.pct_aln = s2.pct_aln;
+ s2.pct_aln = p;
+
+ swap(s1.loc, s2.loc);
}
Input::Input() {
@@ -93,7 +186,7 @@ Input::Input(const char *path) {
//
this->fh.open(path, ifstream::in);
- if (this->fh.fail())
+ if (this->fh.fail())
cerr << "Error opening input file '" << path << "'\n";
}
@@ -102,8 +195,8 @@ Input::~Input() {
this->fh.close();
}
-int
-parse_tsv(const char *line, vector<string> &parts)
+int
+parse_tsv(const char *line, vector<string> &parts)
{
const char *p, *q;
string part;
@@ -112,14 +205,14 @@ parse_tsv(const char *line, vector<string> &parts)
p = line;
do {
- for (q = p; *q != '\t' && *q != '\0'; q++);
- if (q - p == 0)
- part = "";
- else
- part.assign(p, (q - p));
- parts.push_back(part);
-
- p = q + 1;
+ for (q = p; *q != '\t' && *q != '\0'; q++);
+ if (q - p == 0)
+ part = "";
+ else
+ part.assign(p, (q - p));
+ parts.push_back(part);
+
+ p = q + 1;
} while (*q != '\0');
//for (size_t i = 0; i < parts.size(); i++)
@@ -129,8 +222,8 @@ parse_tsv(const char *line, vector<string> &parts)
return 0;
}
-int
-parse_ssv(const char *line, vector<string> &parts)
+int
+parse_ssv(const char *line, vector<string> &parts)
{
const char *p, *q;
string part;
@@ -139,14 +232,14 @@ parse_ssv(const char *line, vector<string> &parts)
p = line;
do {
- for (q = p; *q != ' ' && *q != '\0'; q++);
- if (q - p == 0)
- part = "";
- else
- part.assign(p, (q - p));
- parts.push_back(string(part));
-
- p = q + 1;
+ for (q = p; *q != ' ' && *q != '\0'; q++);
+ if (q - p == 0)
+ part = "";
+ else
+ part.assign(p, (q - p));
+ parts.push_back(string(part));
+
+ p = q + 1;
} while (*q != '\0');
return 0;
@@ -163,8 +256,8 @@ int read_line(ifstream &fh, char **line, int *size) {
// Make sure we read the entire line.
//
do {
- fh.clear();
- fh.getline(buf, max_len);
+ fh.clear();
+ fh.getline(buf, max_len);
blen = strlen(buf);
if (blen + llen <= (*size) - 1) {
@@ -179,7 +272,7 @@ int read_line(ifstream &fh, char **line, int *size) {
} while (fh.fail() && !fh.bad() && !fh.eof());
if (fh.eof() || fh.bad())
- return 0;
+ return 0;
return 1;
}
@@ -197,14 +290,14 @@ int read_gzip_line(gzFile &fh, char **line, int *size) {
// Make sure we read the entire line.
//
do {
- if (gzgets(fh, buf, max_len) == NULL) break;
+ if (gzgets(fh, buf, max_len) == NULL) break;
blen = strlen(buf);
- if (blen > 0 && buf[blen - 1] == '\n') {
- eol = true;
- buf[blen - 1] = '\0';
- }
+ if (blen > 0 && buf[blen - 1] == '\n') {
+ eol = true;
+ buf[blen - 1] = '\0';
+ }
if (blen + llen <= (*size) - 1) {
strcat(*line, buf);
@@ -218,7 +311,7 @@ int read_gzip_line(gzFile &fh, char **line, int *size) {
} while (!gzeof(fh) && !eol);
if (gzeof(fh))
- return 0;
+ return 0;
return 1;
}
@@ -229,18 +322,18 @@ is_comment(const char *line)
const char *p = line;
while (*p != '\0')
- switch(*p) {
- case '#':
- return true;
- break;
- case ' ':
- case '\t':
- p++;
- break;
- default:
- return false;
- break;
- }
+ switch(*p) {
+ case '#':
+ return true;
+ break;
+ case ' ':
+ case '\t':
+ p++;
+ break;
+ default:
+ return false;
+ break;
+ }
return false;
}
diff --git a/src/input.h b/src/input.h
index 92476c4..6a09284 100644
--- a/src/input.h
+++ b/src/input.h
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -41,7 +41,7 @@ using std::endl;
#include "utils.h"
#include "stacks.h"
-typedef unsigned int uint;
+enum alnt {pri_aln, sec_aln, sup_aln};
class Seq {
public:
@@ -49,20 +49,33 @@ class Seq {
char *seq;
char *qual;
- // Location information for a mapped sequence
+ //
+ // Information for an aligned sequence.
+ //
+ alnt aln_type;
+ double pct_aln;
char *loc_str;
PhyLoc loc;
- Seq( void );
+ Seq();
+ Seq(const Seq& other);
Seq(const char *, const char *);
Seq(const char *, const char *, const char *);
Seq(const char *, const char *, const char *, const char *, uint, strand_type);
- ~Seq( void ) {
- delete[] id;
- delete[] seq;
- delete[] qual;
- delete[] loc_str;
+ Seq(const char *, const char *, const char *, const char *, uint, strand_type, alnt, double);
+ ~Seq( void ) {
+ if (id != NULL)
+ delete[] id;
+ if (seq != NULL)
+ delete[] seq;
+ if (qual != NULL)
+ delete[] qual;
+ if (loc_str != NULL)
+ delete[] loc_str;
}
+ friend void swap(Seq&, Seq&);
+ Seq& operator=(Seq&& other) {swap(*this, other); return *this;}
+ Seq& operator=(const Seq& other) = delete;
};
//
diff --git a/src/kmer_filter.cc b/src/kmer_filter.cc
index d9bdfcd..c2971eb 100644
--- a/src/kmer_filter.cc
+++ b/src/kmer_filter.cc
@@ -19,7 +19,7 @@
//
//
-// kmer_filter --
+// kmer_filter --
//
#include "kmer_filter.h"
@@ -59,31 +59,31 @@ int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
if (min_lim == 0)
- min_lim = (int) round((double) kmer_len * 0.80);
-
+ min_lim = (int) round((double) kmer_len * 0.80);
+
cerr << "Using a kmer size of " << kmer_len << "\n";
if (filter_rare_k) {
- cerr << "Filtering out reads by identifying rare kmers: On.\n"
- << " A kmer is considered rare when its coverage is at " << min_k_pct * 100 << "% or below the median kmer coverage for the read.\n"
- << " A read is dropped when it contains " << min_lim << " or more rare kmers in a row.\n";
+ cerr << "Filtering out reads by identifying rare kmers: On.\n"
+ << " A kmer is considered rare when its coverage is at " << min_k_pct * 100 << "% or below the median kmer coverage for the read.\n"
+ << " A read is dropped when it contains " << min_lim << " or more rare kmers in a row.\n";
} else
- cerr << "Filtering out reads by identifying rare kmers: Off.\n";
+ cerr << "Filtering out reads by identifying rare kmers: Off.\n";
if (filter_abundant_k) {
- cerr << "Filtering out reads by identifying abundant kmers: On.\n"
- << " Kmer is considered abundant when it occurs " << max_k_freq << " or more times.\n";
- if (max_lim == 0)
- cerr << " A read is dropped when it contains " << max_k_pct * 100 << "% or more abundant kmers.\n";
- else
- cerr << " A read is dropped when it contains " << max_lim << " or more abundant kmers.\n";
+ cerr << "Filtering out reads by identifying abundant kmers: On.\n"
+ << " Kmer is considered abundant when it occurs " << max_k_freq << " or more times.\n";
+ if (max_lim == 0)
+ cerr << " A read is dropped when it contains " << max_k_pct * 100 << "% or more abundant kmers.\n";
+ else
+ cerr << " A read is dropped when it contains " << max_lim << " or more abundant kmers.\n";
} else
- cerr << "Filtering out reads by identifying abundant kmers: Off.\n";
+ cerr << "Filtering out reads by identifying abundant kmers: Off.\n";
- if (normalize)
- cerr << "Normalizing read depth: On.\n"
- << " Read depth limit: " << normalize_lim << "x\n";
+ if (normalize)
+ cerr << "Normalizing read depth: On.\n"
+ << " Read depth limit: " << normalize_lim << "x\n";
else
- cerr << "Normalizing read depth: Off.\n";
+ cerr << "Normalizing read depth: Off.\n";
vector<pair<string, string> > files, pair_files;
map<string, map<string, long> > counters;
@@ -97,149 +97,149 @@ int main (int argc, char* argv[]) {
cerr << "Found " << pair_files.size() << " paired input file(s).\n";
if (filter_rare_k || filter_abundant_k || kmer_distr || write_k_freq) {
- cerr << "Generating kmer distribution...\n";
-
- if (read_k_freq)
- read_kmer_freq(k_freq_path, kmers, kmers_keys);
- else
- populate_kmers(pair_files, files, kmers, kmers_keys);
-
- // double kmer_med, kmer_mad;
- // calc_kmer_median(kmers, kmer_med, kmer_mad);
- // cerr << "Median kmer frequency: " << kmer_med << "; median absolute deviation: " << kmer_mad << "\n";
-
- if (kmer_distr) {
- generate_kmer_dist(kmers);
- if (write_k_freq == false)
- exit(0);
- }
-
- if (write_k_freq) {
- write_kmer_freq(k_freq_path, kmers);
- exit(0);
- }
-
- cerr << "Filtering reads by kmer frequency...\n";
-
- for (uint i = 0; i < pair_files.size(); i += 2) {
- cerr << "Processing paired file " << i+1 << " of " << (pair_files.size() / 2) << " [" << pair_files[i].second << "]\n";
-
- counters[pair_files[i].second]["total"] = 0;
- counters[pair_files[i].second]["retained"] = 0;
- counters[pair_files[i].second]["rare_k"] = 0;
- counters[pair_files[i].second]["abundant_k"] = 0;
-
- process_paired_reads(pair_files[i].first,
- pair_files[i].second,
- pair_files[i+1].first,
- pair_files[i+1].second,
- kmers,
- counters[pair_files[i].second]);
-
- cerr << " "
- << counters[pair_files[i].second]["total"] << " total reads; "
- << "-" << counters[pair_files[i].second]["rare_k"] << " rare k-mer reads; "
- << "-" << counters[pair_files[i].second]["abundant_k"] << " abundant k-mer reads; "
- << counters[pair_files[i].second]["retained"] << " retained reads.\n";
- }
-
- for (uint i = 0; i < files.size(); i++) {
- cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
-
- counters[files[i].second]["total"] = 0;
- counters[files[i].second]["retained"] = 0;
- counters[files[i].second]["rare_k"] = 0;
- counters[files[i].second]["abundant_k"] = 0;
-
- process_reads(files[i].first,
- files[i].second,
- kmers,
- counters[files[i].second]);
-
- cerr << " "
- << counters[files[i].second]["total"] << " total reads; "
- << "-" << counters[files[i].second]["rare_k"] << " rare k-mer reads; "
- << "-" << counters[files[i].second]["abundant_k"] << " abundant k-mer reads; "
- << counters[files[i].second]["retained"] << " retained reads.\n";
- }
-
- free_kmer_hash(kmers, kmers_keys);
-
- print_results(counters);
+ cerr << "Generating kmer distribution...\n";
+
+ if (read_k_freq)
+ read_kmer_freq(k_freq_path, kmers, kmers_keys);
+ else
+ populate_kmers(pair_files, files, kmers, kmers_keys);
+
+ // double kmer_med, kmer_mad;
+ // calc_kmer_median(kmers, kmer_med, kmer_mad);
+ // cerr << "Median kmer frequency: " << kmer_med << "; median absolute deviation: " << kmer_mad << "\n";
+
+ if (kmer_distr) {
+ generate_kmer_dist(kmers);
+ if (write_k_freq == false)
+ exit(0);
+ }
+
+ if (write_k_freq) {
+ write_kmer_freq(k_freq_path, kmers);
+ exit(0);
+ }
+
+ cerr << "Filtering reads by kmer frequency...\n";
+
+ for (uint i = 0; i < pair_files.size(); i += 2) {
+ cerr << "Processing paired file " << i+1 << " of " << (pair_files.size() / 2) << " [" << pair_files[i].second << "]\n";
+
+ counters[pair_files[i].second]["total"] = 0;
+ counters[pair_files[i].second]["retained"] = 0;
+ counters[pair_files[i].second]["rare_k"] = 0;
+ counters[pair_files[i].second]["abundant_k"] = 0;
+
+ process_paired_reads(pair_files[i].first,
+ pair_files[i].second,
+ pair_files[i+1].first,
+ pair_files[i+1].second,
+ kmers,
+ counters[pair_files[i].second]);
+
+ cerr << " "
+ << counters[pair_files[i].second]["total"] << " total reads; "
+ << "-" << counters[pair_files[i].second]["rare_k"] << " rare k-mer reads; "
+ << "-" << counters[pair_files[i].second]["abundant_k"] << " abundant k-mer reads; "
+ << counters[pair_files[i].second]["retained"] << " retained reads.\n";
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
+
+ counters[files[i].second]["total"] = 0;
+ counters[files[i].second]["retained"] = 0;
+ counters[files[i].second]["rare_k"] = 0;
+ counters[files[i].second]["abundant_k"] = 0;
+
+ process_reads(files[i].first,
+ files[i].second,
+ kmers,
+ counters[files[i].second]);
+
+ cerr << " "
+ << counters[files[i].second]["total"] << " total reads; "
+ << "-" << counters[files[i].second]["rare_k"] << " rare k-mer reads; "
+ << "-" << counters[files[i].second]["abundant_k"] << " abundant k-mer reads; "
+ << counters[files[i].second]["retained"] << " retained reads.\n";
+ }
+
+ free_kmer_hash(kmers, kmers_keys);
+
+ print_results(counters);
}
if (normalize) {
- cerr << "Normalizing read depth...\n";
-
- //
- // Add the remainder files from the previous step to the queue.
- //
- if (filter_rare_k || filter_abundant_k) {
- string file;
- int pos;
- for (uint i = 0; i < pair_files.size(); i += 2) {
- file = pair_files[i].second;
- pos = file.find_last_of(".");
- if (file.substr(pos - 2, 2) == ".1")
- pos -= 2;
- file = file.substr(0, pos) + ".rem.fil";
- file += out_file_type == FileT::fastq ? ".fq" : ".fa";
- cerr << "Adding remainder file generated in previous step to queue, '" << file << "\n";
- files.push_back(make_pair(pair_files[i].first, file));
- }
- }
-
- for (uint i = 0; i < pair_files.size(); i += 2) {
- cerr << "Processing paired files " << i+1 << " of " << (pair_files.size() / 2)
- << " [" << pair_files[i].second << " / " << pair_files[i+1].second << "]\n";
-
- counters[pair_files[i].second]["total"] = 0;
- counters[pair_files[i].second]["retained"] = 0;
- counters[pair_files[i].second]["overep"] = 0;
-
- normalize_paired_reads(pair_files[i].first,
- pair_files[i].second,
- pair_files[i+1].first,
- pair_files[i+1].second,
- kmers, kmers_keys,
- counters[pair_files[i].second]);
-
- cerr << " "
- << counters[pair_files[i].second]["total"] << " total reads; "
- << "-" << counters[pair_files[i].second]["overep"] << " over-represented reads; "
- << counters[pair_files[i].second]["retained"] << " retained reads.\n";
- }
-
- for (uint i = 0; i < files.size(); i++) {
- cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
-
- counters[files[i].second]["total"] = 0;
- counters[files[i].second]["retained"] = 0;
- counters[files[i].second]["overep"] = 0;
-
- normalize_reads(files[i].first,
- files[i].second,
- kmers, kmers_keys,
- counters[files[i].second]);
-
- cerr << " "
- << counters[files[i].second]["total"] << " total reads; "
- << "-" << counters[files[i].second]["overep"] << " over-represented reads; "
- << counters[files[i].second]["retained"] << " retained reads.\n";
- }
-
- free_kmer_hash(kmers, kmers_keys);
+ cerr << "Normalizing read depth...\n";
+
+ //
+ // Add the remainder files from the previous step to the queue.
+ //
+ if (filter_rare_k || filter_abundant_k) {
+ string file;
+ int pos;
+ for (uint i = 0; i < pair_files.size(); i += 2) {
+ file = pair_files[i].second;
+ pos = file.find_last_of(".");
+ if (file.substr(pos - 2, 2) == ".1")
+ pos -= 2;
+ file = file.substr(0, pos) + ".rem.fil";
+ file += out_file_type == FileT::fastq ? ".fq" : ".fa";
+ cerr << "Adding remainder file generated in previous step to queue, '" << file << "\n";
+ files.push_back(make_pair(pair_files[i].first, file));
+ }
+ }
+
+ for (uint i = 0; i < pair_files.size(); i += 2) {
+ cerr << "Processing paired files " << i+1 << " of " << (pair_files.size() / 2)
+ << " [" << pair_files[i].second << " / " << pair_files[i+1].second << "]\n";
+
+ counters[pair_files[i].second]["total"] = 0;
+ counters[pair_files[i].second]["retained"] = 0;
+ counters[pair_files[i].second]["overep"] = 0;
+
+ normalize_paired_reads(pair_files[i].first,
+ pair_files[i].second,
+ pair_files[i+1].first,
+ pair_files[i+1].second,
+ kmers, kmers_keys,
+ counters[pair_files[i].second]);
+
+ cerr << " "
+ << counters[pair_files[i].second]["total"] << " total reads; "
+ << "-" << counters[pair_files[i].second]["overep"] << " over-represented reads; "
+ << counters[pair_files[i].second]["retained"] << " retained reads.\n";
+ }
+
+ for (uint i = 0; i < files.size(); i++) {
+ cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
+
+ counters[files[i].second]["total"] = 0;
+ counters[files[i].second]["retained"] = 0;
+ counters[files[i].second]["overep"] = 0;
+
+ normalize_reads(files[i].first,
+ files[i].second,
+ kmers, kmers_keys,
+ counters[files[i].second]);
+
+ cerr << " "
+ << counters[files[i].second]["total"] << " total reads; "
+ << "-" << counters[files[i].second]["overep"] << " over-represented reads; "
+ << counters[files[i].second]["retained"] << " retained reads.\n";
+ }
+
+ free_kmer_hash(kmers, kmers_keys);
}
return 0;
}
-int process_paired_reads(string in_path_1,
- string in_file_1,
- string in_path_2,
- string in_file_2,
- SeqKmerHash &kmers,
- map<string, long> &counter) {
+int process_paired_reads(string in_path_1,
+ string in_file_1,
+ string in_path_2,
+ string in_file_2,
+ SeqKmerHash &kmers,
+ map<string, long> &counter) {
Input *fh_1, *fh_2;
ofstream *discard_fh_1, *discard_fh_2;
int pos;
@@ -272,16 +272,16 @@ int process_paired_reads(string in_path_1,
path = out_path + in_file_1.substr(0, pos) + ".fil" + in_file_1.substr(pos);
ofstream *ofh_1 = new ofstream(path.c_str(), ifstream::out);
if (ofh_1->fail()) {
- cerr << "Error opening filtered output file '" << path << "'\n";
- exit(1);
+ cerr << "Error opening filtered output file '" << path << "'\n";
+ exit(1);
}
pos = in_file_2.find_last_of(".");
path = out_path + in_file_2.substr(0, pos) + ".fil" + in_file_2.substr(pos);
ofstream *ofh_2 = new ofstream(path.c_str(), ifstream::out);
if (ofh_2->fail()) {
- cerr << "Error opening filtered paired output file '" << path << "'\n";
- exit(1);
+ cerr << "Error opening filtered paired output file '" << path << "'\n";
+ exit(1);
}
pos = in_file_2.find_last_of(".");
@@ -289,46 +289,46 @@ int process_paired_reads(string in_path_1,
// Pull the ".2" suffix off the paired file name to make the remainder file name.
//
if (in_file_2.substr(pos - 2, 2) == ".2")
- pos -= 2;
+ pos -= 2;
path = out_path + in_file_2.substr(0, pos) + ".rem.fil";
path += out_file_type == FileT::fastq ? ".fq" : ".fa";
ofstream *rem_fh = new ofstream(path.c_str(), ifstream::out);
if (rem_fh->fail()) {
- cerr << "Error opening filtered remainder output file '" << path << "'\n";
- exit(1);
+ cerr << "Error opening filtered remainder output file '" << path << "'\n";
+ exit(1);
}
//
// Open a file for recording discarded reads
//
if (discards) {
- pos = in_file_1.find_last_of(".");
- path = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
- discard_fh_1 = new ofstream(path.c_str(), ifstream::out);
-
- if (discard_fh_1->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
- }
- pos = in_file_2.find_last_of(".");
- path = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
- discard_fh_2 = new ofstream(path.c_str(), ifstream::out);
-
- if (discard_fh_2->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
- }
+ pos = in_file_1.find_last_of(".");
+ path = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
+ discard_fh_1 = new ofstream(path.c_str(), ifstream::out);
+
+ if (discard_fh_1->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
+ pos = in_file_2.find_last_of(".");
+ path = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
+ discard_fh_2 = new ofstream(path.c_str(), ifstream::out);
+
+ if (discard_fh_2->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s_1 = fh_1->next_seq();
Seq *s_2 = fh_2->next_seq();
if (s_1 == NULL || s_2 == NULL) {
- cerr << "Unable to allocate Seq object.\n";
- exit(1);
+ cerr << "Unable to allocate Seq object.\n";
+ exit(1);
}
int rare_k, abundant_k, num_kmers, max_kmer_lim;
@@ -337,93 +337,93 @@ int process_paired_reads(string in_path_1,
long i = 1;
do {
- if (i % 10000 == 0) cerr << " Processing short read pair " << i << " \r";
- counter["total"] += 2;
- stringstream msg_1, msg_2;
-
- retain_1 = true;
- retain_2 = true;
- num_kmers = strlen(s_1->seq) - kmer_len + 1;
- max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
-
- //
- // Drop the first sequence if it has too many rare or abundant kmers.
- //
- kmer_lookup(kmers, s_1->seq, kmer, num_kmers, rare_k, abundant_k);
-
- if (filter_rare_k && rare_k > 0) {
- counter["rare_k"]++;
- retain_1 = false;
- msg_1 << "rare_k_" << rare_k;
- }
-
- if (retain_1 && filter_abundant_k && abundant_k > max_kmer_lim) {
- counter["abundant_k"]++;
- retain_1 = false;
- msg_1 << "abundant_k_" << abundant_k;
- }
-
- rare_k = 0;
- abundant_k = 0;
- num_kmers = strlen(s_2->seq) - kmer_len + 1;
- max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
-
- //
- // Drop the second sequence if it has too many rare or abundant kmers.
- //
- kmer_lookup(kmers, s_2->seq, kmer, num_kmers, rare_k, abundant_k);
-
- if (filter_rare_k && rare_k > 0) {
- counter["rare_k"]++;
- retain_2 = false;
- msg_2 << "rare_k_" << rare_k;
- }
-
- if (retain_2 && filter_abundant_k && abundant_k > max_kmer_lim) {
- counter["abundant_k"]++;
- retain_2 = false;
- msg_2 << "abundant_k_" << abundant_k;
- }
-
- if (retain_1 && retain_2) {
- counter["retained"] += 2;
- out_file_type == FileT::fastq ?
- write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
- out_file_type == FileT::fastq ?
- write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
- }
-
- if (retain_1 && !retain_2) {
- counter["retained"]++;
- out_file_type == FileT::fastq ?
- write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
- }
-
- if (!retain_1 && retain_2) {
- counter["retained"]++;
- out_file_type == FileT::fastq ?
- write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
- }
-
- if (discards && !retain_1)
- out_file_type == FileT::fastq ?
- write_fastq(discard_fh_1, s_1, msg_1.str()) : write_fasta(discard_fh_1, s_1, msg_1.str());
- if (discards && !retain_2)
- out_file_type == FileT::fastq ?
- write_fastq(discard_fh_2, s_2, msg_2.str()) : write_fasta(discard_fh_2, s_2, msg_2.str());
-
- delete s_1;
- delete s_2;
-
- i++;
+ if (i % 10000 == 0) cerr << " Processing short read pair " << i << " \r";
+ counter["total"] += 2;
+ stringstream msg_1, msg_2;
+
+ retain_1 = true;
+ retain_2 = true;
+ num_kmers = strlen(s_1->seq) - kmer_len + 1;
+ max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
+
+ //
+ // Drop the first sequence if it has too many rare or abundant kmers.
+ //
+ kmer_lookup(kmers, s_1->seq, kmer, num_kmers, rare_k, abundant_k);
+
+ if (filter_rare_k && rare_k > 0) {
+ counter["rare_k"]++;
+ retain_1 = false;
+ msg_1 << "rare_k_" << rare_k;
+ }
+
+ if (retain_1 && filter_abundant_k && abundant_k > max_kmer_lim) {
+ counter["abundant_k"]++;
+ retain_1 = false;
+ msg_1 << "abundant_k_" << abundant_k;
+ }
+
+ rare_k = 0;
+ abundant_k = 0;
+ num_kmers = strlen(s_2->seq) - kmer_len + 1;
+ max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
+
+ //
+ // Drop the second sequence if it has too many rare or abundant kmers.
+ //
+ kmer_lookup(kmers, s_2->seq, kmer, num_kmers, rare_k, abundant_k);
+
+ if (filter_rare_k && rare_k > 0) {
+ counter["rare_k"]++;
+ retain_2 = false;
+ msg_2 << "rare_k_" << rare_k;
+ }
+
+ if (retain_2 && filter_abundant_k && abundant_k > max_kmer_lim) {
+ counter["abundant_k"]++;
+ retain_2 = false;
+ msg_2 << "abundant_k_" << abundant_k;
+ }
+
+ if (retain_1 && retain_2) {
+ counter["retained"] += 2;
+ out_file_type == FileT::fastq ?
+ write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
+ out_file_type == FileT::fastq ?
+ write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
+ }
+
+ if (retain_1 && !retain_2) {
+ counter["retained"]++;
+ out_file_type == FileT::fastq ?
+ write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
+ }
+
+ if (!retain_1 && retain_2) {
+ counter["retained"]++;
+ out_file_type == FileT::fastq ?
+ write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
+ }
+
+ if (discards && !retain_1)
+ out_file_type == FileT::fastq ?
+ write_fastq(discard_fh_1, s_1, msg_1.str()) : write_fasta(discard_fh_1, s_1, msg_1.str());
+ if (discards && !retain_2)
+ out_file_type == FileT::fastq ?
+ write_fastq(discard_fh_2, s_2, msg_2.str()) : write_fasta(discard_fh_2, s_2, msg_2.str());
+
+ delete s_1;
+ delete s_2;
+
+ i++;
} while ((s_1 = fh_1->next_seq()) != NULL &&
- (s_2 = fh_2->next_seq()) != NULL);
+ (s_2 = fh_2->next_seq()) != NULL);
delete [] kmer;
if (discards) {
- delete discard_fh_1;
- delete discard_fh_2;
+ delete discard_fh_1;
+ delete discard_fh_2;
}
//
@@ -438,10 +438,10 @@ int process_paired_reads(string in_path_1,
return 0;
}
-int process_reads(string in_path,
- string in_file,
- SeqKmerHash &kmers,
- map<string, long> &counter) {
+int process_reads(string in_path,
+ string in_file,
+ SeqKmerHash &kmers,
+ map<string, long> &counter) {
Input *fh;
ofstream *discard_fh;
int pos;
@@ -464,81 +464,81 @@ int process_reads(string in_path,
//
pos = in_file.find_last_of(".");
path = out_path + in_file.substr(0, pos) + ".fil" + in_file.substr(pos);
- ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);
+ ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);
if (out_fh->fail()) {
- cerr << "Error opening output file '" << path << "'\n";
- exit(1);
+ cerr << "Error opening output file '" << path << "'\n";
+ exit(1);
}
//
// Open a file for recording discarded reads
//
if (discards) {
- pos = in_file.find_last_of(".");
- path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
- discard_fh = new ofstream(path.c_str(), ifstream::out);
-
- if (discard_fh->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
- }
+ pos = in_file.find_last_of(".");
+ path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
+ discard_fh = new ofstream(path.c_str(), ifstream::out);
+
+ if (discard_fh->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s = fh->next_seq();
if (s == NULL) {
- cerr << "Unable to allocate Seq object.\n";
- exit(1);
+ cerr << "Unable to allocate Seq object.\n";
+ exit(1);
}
-
+
int rare_k, abundant_k, num_kmers, max_kmer_lim;
bool retain;
char *kmer = new char[kmer_len + 1];
long i = 1;
do {
- if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
- counter["total"]++;
- stringstream msg;
-
- //
- // Drop this sequence if it has too many rare or abundant kmers.
- //
- retain = true;
- num_kmers = strlen(s->seq) - kmer_len + 1;
- max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
-
- kmer_lookup(kmers, s->seq, kmer, num_kmers, rare_k, abundant_k);
-
- if (filter_rare_k && rare_k > 0) {
- counter["rare_k"]++;
- retain = false;
- msg << "rare_k_" << rare_k;
- }
-
- if (retain && filter_abundant_k && abundant_k > max_kmer_lim) {
- counter["abundant_k"]++;
- retain = false;
- msg << "abundant_k_" << abundant_k;
- }
-
- if (retain) {
- counter["retained"]++;
- out_file_type == FileT::fastq ?
- write_fastq(out_fh, s) : write_fasta(out_fh, s);
- }
-
- if (discards && !retain)
- out_file_type == FileT::fastq ?
- write_fastq(discard_fh, s, msg.str()) : write_fasta(discard_fh, s, msg.str());
-
- delete s;
-
- i++;
+ if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
+ counter["total"]++;
+ stringstream msg;
+
+ //
+ // Drop this sequence if it has too many rare or abundant kmers.
+ //
+ retain = true;
+ num_kmers = strlen(s->seq) - kmer_len + 1;
+ max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
+
+ kmer_lookup(kmers, s->seq, kmer, num_kmers, rare_k, abundant_k);
+
+ if (filter_rare_k && rare_k > 0) {
+ counter["rare_k"]++;
+ retain = false;
+ msg << "rare_k_" << rare_k;
+ }
+
+ if (retain && filter_abundant_k && abundant_k > max_kmer_lim) {
+ counter["abundant_k"]++;
+ retain = false;
+ msg << "abundant_k_" << abundant_k;
+ }
+
+ if (retain) {
+ counter["retained"]++;
+ out_file_type == FileT::fastq ?
+ write_fastq(out_fh, s) : write_fasta(out_fh, s);
+ }
+
+ if (discards && !retain)
+ out_file_type == FileT::fastq ?
+ write_fastq(discard_fh, s, msg.str()) : write_fasta(discard_fh, s, msg.str());
+
+ delete s;
+
+ i++;
} while ((s = fh->next_seq()) != NULL);
delete [] kmer;
@@ -554,13 +554,13 @@ int process_reads(string in_path,
return 0;
}
-int
-normalize_paired_reads(string in_path_1,
- string in_file_1,
- string in_path_2,
- string in_file_2,
- SeqKmerHash &kmers, vector<char *> &kmer_keys,
- map<string, long> &counter)
+int
+normalize_paired_reads(string in_path_1,
+ string in_file_1,
+ string in_path_2,
+ string in_file_2,
+ SeqKmerHash &kmers, vector<char *> &kmer_keys,
+ map<string, long> &counter)
{
Input *fh_1, *fh_2;
ofstream *discard_fh_1, *discard_fh_2;
@@ -569,156 +569,156 @@ normalize_paired_reads(string in_path_1,
int pos;
if (filter_abundant_k || filter_rare_k) {
- //
- // If we already filtered the data, open the files we created in the output
- // directory to normalize.
- //
- pos = in_file_1.find_last_of(".");
- path_1 = out_path + in_file_1.substr(0, pos) + ".fil" + in_file_1.substr(pos);
-
- pos = in_file_2.find_last_of(".");
- path_2 = out_path + in_file_2.substr(0, pos) + ".fil" + in_file_2.substr(pos);
-
- if (in_file_type == FileT::fastq) {
- fh_1 = new Fastq(path_1);
- fh_2 = new Fastq(path_2);
- } else if (in_file_type == FileT::gzfastq) {
- fh_1 = new Fastq(path_1);
- fh_2 = new Fastq(path_2);
- } else if (in_file_type == FileT::fasta) {
- fh_1 = new Fasta(path_1);
- fh_2 = new Fasta(path_2);
- } else if (in_file_type == FileT::gzfasta) {
- fh_1 = new Fasta(path_1);
- fh_2 = new Fasta(path_2);
- }
+ //
+ // If we already filtered the data, open the files we created in the output
+ // directory to normalize.
+ //
+ pos = in_file_1.find_last_of(".");
+ path_1 = out_path + in_file_1.substr(0, pos) + ".fil" + in_file_1.substr(pos);
+
+ pos = in_file_2.find_last_of(".");
+ path_2 = out_path + in_file_2.substr(0, pos) + ".fil" + in_file_2.substr(pos);
+
+ if (in_file_type == FileT::fastq) {
+ fh_1 = new Fastq(path_1);
+ fh_2 = new Fastq(path_2);
+ } else if (in_file_type == FileT::gzfastq) {
+ fh_1 = new Fastq(path_1);
+ fh_2 = new Fastq(path_2);
+ } else if (in_file_type == FileT::fasta) {
+ fh_1 = new Fasta(path_1);
+ fh_2 = new Fasta(path_2);
+ } else if (in_file_type == FileT::gzfasta) {
+ fh_1 = new Fasta(path_1);
+ fh_2 = new Fasta(path_2);
+ }
} else {
- //
- // Otherwise, open unmodified files.
- //
- path_1 = in_path_1 + in_file_1;
- path_2 = in_path_2 + in_file_2;
-
- if (in_file_type == FileT::fastq) {
- fh_1 = new Fastq(path_1);
- fh_2 = new Fastq(path_2);
- } else if (in_file_type == FileT::gzfastq) {
- fh_1 = new GzFastq(path_1 + ".gz");
- fh_2 = new GzFastq(path_2 + ".gz");
- } else if (in_file_type == FileT::fasta) {
- fh_1 = new Fasta(path_1);
- fh_2 = new Fasta(path_2);
- } else if (in_file_type == FileT::gzfasta) {
- fh_1 = new GzFasta(path_1 + ".gz");
- fh_2 = new GzFasta(path_2 + ".gz");
- } else if (in_file_type == FileT::bustard) {
- fh_1 = new Bustard(path_1);
- fh_2 = new Bustard(path_2);
- }
+ //
+ // Otherwise, open unmodified files.
+ //
+ path_1 = in_path_1 + in_file_1;
+ path_2 = in_path_2 + in_file_2;
+
+ if (in_file_type == FileT::fastq) {
+ fh_1 = new Fastq(path_1);
+ fh_2 = new Fastq(path_2);
+ } else if (in_file_type == FileT::gzfastq) {
+ fh_1 = new GzFastq(path_1 + ".gz");
+ fh_2 = new GzFastq(path_2 + ".gz");
+ } else if (in_file_type == FileT::fasta) {
+ fh_1 = new Fasta(path_1);
+ fh_2 = new Fasta(path_2);
+ } else if (in_file_type == FileT::gzfasta) {
+ fh_1 = new GzFasta(path_1 + ".gz");
+ fh_2 = new GzFasta(path_2 + ".gz");
+ } else if (in_file_type == FileT::bustard) {
+ fh_1 = new Bustard(path_1);
+ fh_2 = new Bustard(path_2);
+ }
}
//
// Open the output files.
//
if (filter_abundant_k || filter_rare_k) {
- pos = in_file_1.find_last_of(".");
- path_1 = out_path + in_file_1.substr(0, pos) + ".fil.norm" + in_file_1.substr(pos);
- ofh_1 = new ofstream(path_1.c_str(), ifstream::out);
-
- if (ofh_1->fail()) {
- cerr << "Error opening normalized output file '" << path_1 << "'\n";
- exit(1);
- }
-
- pos = in_file_2.find_last_of(".");
- path_2 = out_path + in_file_2.substr(0, pos) + ".fil.norm" + in_file_2.substr(pos);
- ofh_2 = new ofstream(path_2.c_str(), ifstream::out);
-
- if (ofh_2->fail()) {
- cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
- exit(1);
- }
-
- if (in_file_2.substr(pos - 2, 2) == ".2")
- pos -= 2;
- path_2 = out_path + in_file_2.substr(0, pos) + ".fil.norm.rem";
- path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
- rem_fh = new ofstream(path_2.c_str(), ifstream::out);
-
- if (rem_fh->fail()) {
- cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
- exit(1);
- }
+ pos = in_file_1.find_last_of(".");
+ path_1 = out_path + in_file_1.substr(0, pos) + ".fil.norm" + in_file_1.substr(pos);
+ ofh_1 = new ofstream(path_1.c_str(), ifstream::out);
+
+ if (ofh_1->fail()) {
+ cerr << "Error opening normalized output file '" << path_1 << "'\n";
+ exit(1);
+ }
+
+ pos = in_file_2.find_last_of(".");
+ path_2 = out_path + in_file_2.substr(0, pos) + ".fil.norm" + in_file_2.substr(pos);
+ ofh_2 = new ofstream(path_2.c_str(), ifstream::out);
+
+ if (ofh_2->fail()) {
+ cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
+ exit(1);
+ }
+
+ if (in_file_2.substr(pos - 2, 2) == ".2")
+ pos -= 2;
+ path_2 = out_path + in_file_2.substr(0, pos) + ".fil.norm.rem";
+ path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
+ rem_fh = new ofstream(path_2.c_str(), ifstream::out);
+
+ if (rem_fh->fail()) {
+ cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
+ exit(1);
+ }
} else {
- pos = in_file_1.find_last_of(".");
- path_1 = out_path + in_file_1.substr(0, pos) + ".norm" + in_file_1.substr(pos);
- ofh_1 = new ofstream(path_1.c_str(), ifstream::out);
-
- if (ofh_1->fail()) {
- cerr << "Error opening normalized output file '" << path_1 << "'\n";
- exit(1);
- }
-
- pos = in_file_2.find_last_of(".");
- path_2 = out_path + in_file_2.substr(0, pos) + ".norm" + in_file_2.substr(pos);
- ofh_2 = new ofstream(path_2.c_str(), ifstream::out);
-
- if (ofh_2->fail()) {
- cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
- exit(1);
- }
-
- if (in_file_2.substr(pos - 2, 2) == ".2")
- pos -= 2;
- path_2 = out_path + in_file_2.substr(0, pos) + ".norm.rem";
- path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
- rem_fh = new ofstream(path_2.c_str(), ifstream::out);
-
- if (rem_fh->fail()) {
- cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
- exit(1);
- }
+ pos = in_file_1.find_last_of(".");
+ path_1 = out_path + in_file_1.substr(0, pos) + ".norm" + in_file_1.substr(pos);
+ ofh_1 = new ofstream(path_1.c_str(), ifstream::out);
+
+ if (ofh_1->fail()) {
+ cerr << "Error opening normalized output file '" << path_1 << "'\n";
+ exit(1);
+ }
+
+ pos = in_file_2.find_last_of(".");
+ path_2 = out_path + in_file_2.substr(0, pos) + ".norm" + in_file_2.substr(pos);
+ ofh_2 = new ofstream(path_2.c_str(), ifstream::out);
+
+ if (ofh_2->fail()) {
+ cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
+ exit(1);
+ }
+
+ if (in_file_2.substr(pos - 2, 2) == ".2")
+ pos -= 2;
+ path_2 = out_path + in_file_2.substr(0, pos) + ".norm.rem";
+ path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
+ rem_fh = new ofstream(path_2.c_str(), ifstream::out);
+
+ if (rem_fh->fail()) {
+ cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
+ exit(1);
+ }
}
//
// Open a file for recording discarded reads
//
if (discards) {
- pos = in_file_1.find_last_of(".");
- if (filter_abundant_k || filter_rare_k)
- path_1 = out_path + in_file_1.substr(0, pos) + ".fil.discards" + in_file_1.substr(pos);
- else
- path_1 = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
- discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
-
- if (discard_fh_1->fail()) {
- cerr << "Error opening discard output file '" << path_1 << "'\n";
- exit(1);
- }
-
- pos = in_file_2.find_last_of(".");
- if (filter_abundant_k || filter_rare_k)
- path_2 = out_path + in_file_2.substr(0, pos) + ".fil.discards" + in_file_2.substr(pos);
- else
- path_2 = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
- discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
-
- if (discard_fh_2->fail()) {
- cerr << "Error opening discard output file '" << path_1 << "'\n";
- exit(1);
- }
+ pos = in_file_1.find_last_of(".");
+ if (filter_abundant_k || filter_rare_k)
+ path_1 = out_path + in_file_1.substr(0, pos) + ".fil.discards" + in_file_1.substr(pos);
+ else
+ path_1 = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
+ discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
+
+ if (discard_fh_1->fail()) {
+ cerr << "Error opening discard output file '" << path_1 << "'\n";
+ exit(1);
+ }
+
+ pos = in_file_2.find_last_of(".");
+ if (filter_abundant_k || filter_rare_k)
+ path_2 = out_path + in_file_2.substr(0, pos) + ".fil.discards" + in_file_2.substr(pos);
+ else
+ path_2 = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
+ discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
+
+ if (discard_fh_2->fail()) {
+ cerr << "Error opening discard output file '" << path_1 << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s_1 = fh_1->next_seq();
Seq *s_2 = fh_2->next_seq();
if (s_1 == NULL || s_2 == NULL) {
- cerr << "Unable to allocate Seq object.\n";
- exit(1);
+ cerr << "Unable to allocate Seq object.\n";
+ exit(1);
}
int num_kmers;
@@ -727,68 +727,68 @@ normalize_paired_reads(string in_path_1,
long i = 1;
do {
- if (i % 10000 == 0) cerr << " Processing short read pair " << i << " \r";
- counter["total"] += 2;
-
- retain_1 = true;
- retain_2 = true;
- num_kmers = strlen(s_1->seq) - kmer_len + 1;
-
- //
- // Drop the first sequence if it has too many rare or abundant kmers.
- //
- retain_1 = normalize_kmer_lookup(kmers, s_1->seq, kmer, num_kmers, kmer_keys);
-
- num_kmers = strlen(s_2->seq) - kmer_len + 1;
-
- //
- // Drop the second sequence if it has too many rare or abundant kmers.
- //
- retain_2 = normalize_kmer_lookup(kmers, s_2->seq, kmer, num_kmers, kmer_keys);
-
- if (retain_1 && retain_2) {
- counter["retained"] += 2;
- out_file_type == FileT::fastq ?
- write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
- out_file_type == FileT::fastq ?
- write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
- } else {
- counter["overep"] +=2;
- }
-
- if (retain_1 && !retain_2) {
- counter["retained"]++;
- counter["overep"]++;
- out_file_type == FileT::fastq ?
- write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
- }
-
- if (!retain_1 && retain_2) {
- counter["retained"]++;
- counter["overep"]++;
- out_file_type == FileT::fastq ?
- write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
- }
-
- if (discards && !retain_1)
- out_file_type == FileT::fastq ?
- write_fastq(discard_fh_1, s_1) : write_fasta(discard_fh_1, s_1);
- if (discards && !retain_2)
- out_file_type == FileT::fastq ?
- write_fastq(discard_fh_2, s_2) : write_fasta(discard_fh_2, s_2);
-
- delete s_1;
- delete s_2;
-
- i++;
+ if (i % 10000 == 0) cerr << " Processing short read pair " << i << " \r";
+ counter["total"] += 2;
+
+ retain_1 = true;
+ retain_2 = true;
+ num_kmers = strlen(s_1->seq) - kmer_len + 1;
+
+ //
+ // Drop the first sequence if it has too many rare or abundant kmers.
+ //
+ retain_1 = normalize_kmer_lookup(kmers, s_1->seq, kmer, num_kmers, kmer_keys);
+
+ num_kmers = strlen(s_2->seq) - kmer_len + 1;
+
+ //
+ // Drop the second sequence if it has too many rare or abundant kmers.
+ //
+ retain_2 = normalize_kmer_lookup(kmers, s_2->seq, kmer, num_kmers, kmer_keys);
+
+ if (retain_1 && retain_2) {
+ counter["retained"] += 2;
+ out_file_type == FileT::fastq ?
+ write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
+ out_file_type == FileT::fastq ?
+ write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
+ } else {
+ counter["overep"] +=2;
+ }
+
+ if (retain_1 && !retain_2) {
+ counter["retained"]++;
+ counter["overep"]++;
+ out_file_type == FileT::fastq ?
+ write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
+ }
+
+ if (!retain_1 && retain_2) {
+ counter["retained"]++;
+ counter["overep"]++;
+ out_file_type == FileT::fastq ?
+ write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
+ }
+
+ if (discards && !retain_1)
+ out_file_type == FileT::fastq ?
+ write_fastq(discard_fh_1, s_1) : write_fasta(discard_fh_1, s_1);
+ if (discards && !retain_2)
+ out_file_type == FileT::fastq ?
+ write_fastq(discard_fh_2, s_2) : write_fasta(discard_fh_2, s_2);
+
+ delete s_1;
+ delete s_2;
+
+ i++;
} while ((s_1 = fh_1->next_seq()) != NULL &&
- (s_2 = fh_2->next_seq()) != NULL);
+ (s_2 = fh_2->next_seq()) != NULL);
delete [] kmer;
if (discards) {
- delete discard_fh_1;
- delete discard_fh_2;
+ delete discard_fh_1;
+ delete discard_fh_2;
}
//
@@ -803,11 +803,11 @@ normalize_paired_reads(string in_path_1,
return 0;
}
-int
-normalize_reads(string in_path,
- string in_file,
- SeqKmerHash &kmers, vector<char *> &kmer_keys,
- map<string, long> &counter)
+int
+normalize_reads(string in_path,
+ string in_file,
+ SeqKmerHash &kmers, vector<char *> &kmer_keys,
+ map<string, long> &counter)
{
Input *fh;
ofstream *discard_fh;
@@ -816,111 +816,111 @@ normalize_reads(string in_path,
int pos = in_file.find_last_of(".");
if (filter_abundant_k || filter_rare_k) {
- if (in_file.substr(pos - 4, 4) == ".fil")
- path = out_path + in_file;
- else
- path = out_path + in_file.substr(0, pos) + ".fil" + in_file.substr(pos);
-
- if (in_file_type == FileT::fastq)
- fh = new Fastq(path);
- else if (in_file_type == FileT::gzfastq)
- fh = new Fastq(path);
- else if (in_file_type == FileT::fasta)
- fh = new Fasta(path);
- else if (in_file_type == FileT::gzfasta)
- fh = new Fasta(path);
- else if (in_file_type == FileT::bustard)
- fh = new Bustard(path);
+ if (in_file.substr(pos - 4, 4) == ".fil")
+ path = out_path + in_file;
+ else
+ path = out_path + in_file.substr(0, pos) + ".fil" + in_file.substr(pos);
+
+ if (in_file_type == FileT::fastq)
+ fh = new Fastq(path);
+ else if (in_file_type == FileT::gzfastq)
+ fh = new Fastq(path);
+ else if (in_file_type == FileT::fasta)
+ fh = new Fasta(path);
+ else if (in_file_type == FileT::gzfasta)
+ fh = new Fasta(path);
+ else if (in_file_type == FileT::bustard)
+ fh = new Bustard(path);
} else {
- path = in_path + in_file;
-
- if (in_file_type == FileT::fastq)
- fh = new Fastq(path);
- else if (in_file_type == FileT::gzfastq)
- fh = new GzFastq(path + ".gz");
- else if (in_file_type == FileT::fasta)
- fh = new Fasta(path);
- else if (in_file_type == FileT::gzfasta)
- fh = new GzFasta(path + ".gz");
- else if (in_file_type == FileT::bustard)
- fh = new Bustard(path);
+ path = in_path + in_file;
+
+ if (in_file_type == FileT::fastq)
+ fh = new Fastq(path);
+ else if (in_file_type == FileT::gzfastq)
+ fh = new GzFastq(path + ".gz");
+ else if (in_file_type == FileT::fasta)
+ fh = new Fasta(path);
+ else if (in_file_type == FileT::gzfasta)
+ fh = new GzFasta(path + ".gz");
+ else if (in_file_type == FileT::bustard)
+ fh = new Bustard(path);
}
//
// Open the output file.
//
// if (filter_abundant_k || filter_rare_k) {
- // path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
+ // path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
// } else {
- // path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
+ // path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
// }
path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
- ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);
+ ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);
if (out_fh->fail()) {
- cerr << "Error opening normalized output file '" << path << "'\n";
- exit(1);
+ cerr << "Error opening normalized output file '" << path << "'\n";
+ exit(1);
}
//
// Open a file for recording discarded reads
//
if (discards) {
- if (filter_abundant_k || filter_rare_k)
- path = out_path + in_file.substr(0, pos) + ".fil.discards" + in_file.substr(pos);
- else
- path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
- discard_fh = new ofstream(path.c_str(), ifstream::out);
-
- if (discard_fh->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
- }
+ if (filter_abundant_k || filter_rare_k)
+ path = out_path + in_file.substr(0, pos) + ".fil.discards" + in_file.substr(pos);
+ else
+ path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
+ discard_fh = new ofstream(path.c_str(), ifstream::out);
+
+ if (discard_fh->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s = fh->next_seq();
if (s == NULL) {
- cerr << "Unable to allocate Seq object.\n";
- exit(1);
+ cerr << "Unable to allocate Seq object.\n";
+ exit(1);
}
-
+
int num_kmers;
bool retain;
char *kmer = new char[kmer_len + 1];
long i = 1;
do {
- if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
- counter["total"]++;
-
- //
- // Drop this sequence if it has too many rare or abundant kmers.
- //
- retain = true;
- num_kmers = strlen(s->seq) - kmer_len + 1;
-
- retain = normalize_kmer_lookup(kmers, s->seq, kmer, num_kmers, kmer_keys);
-
- if (retain) {
- counter["retained"]++;
- out_file_type == FileT::fastq ?
- write_fastq(out_fh, s) : write_fasta(out_fh, s);
- } else {
- counter["overep"]++;
- }
-
- if (discards && !retain)
- out_file_type == FileT::fastq ?
- write_fastq(discard_fh, s) : write_fasta(discard_fh, s);
-
- delete s;
-
- i++;
+ if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
+ counter["total"]++;
+
+ //
+ // Drop this sequence if it has too many rare or abundant kmers.
+ //
+ retain = true;
+ num_kmers = strlen(s->seq) - kmer_len + 1;
+
+ retain = normalize_kmer_lookup(kmers, s->seq, kmer, num_kmers, kmer_keys);
+
+ if (retain) {
+ counter["retained"]++;
+ out_file_type == FileT::fastq ?
+ write_fastq(out_fh, s) : write_fasta(out_fh, s);
+ } else {
+ counter["overep"]++;
+ }
+
+ if (discards && !retain)
+ out_file_type == FileT::fastq ?
+ write_fastq(discard_fh, s) : write_fasta(discard_fh, s);
+
+ delete s;
+
+ i++;
} while ((s = fh->next_seq()) != NULL);
delete [] kmer;
@@ -936,11 +936,11 @@ normalize_reads(string in_path,
return 0;
}
-int
-populate_kmers(vector<pair<string, string> > &pair_files,
- vector<pair<string, string> > &files,
- SeqKmerHash &kmers,
- vector<char *> &kmers_keys)
+int
+populate_kmers(vector<pair<string, string> > &pair_files,
+ vector<pair<string, string> > &files,
+ SeqKmerHash &kmers,
+ vector<char *> &kmers_keys)
{
//
// Break each read down into k-mers and create a hash map of those k-mers
@@ -949,15 +949,15 @@ populate_kmers(vector<pair<string, string> > &pair_files,
uint j = 1;
uint cnt = files.size() + pair_files.size();
for (uint i = 0; i < files.size(); i++) {
- cerr << "Generating kmers from file " << j << " of " << cnt << " [" << files[i].second << "]\n";
- process_file_kmers(files[i].first + files[i].second, kmers, kmers_keys);
- j++;
+ cerr << "Generating kmers from file " << j << " of " << cnt << " [" << files[i].second << "]\n";
+ process_file_kmers(files[i].first + files[i].second, kmers, kmers_keys);
+ j++;
}
for (uint i = 0; i < pair_files.size(); i++) {
- cerr << "Generating kmers from file " << j << " of " << cnt << " [" << pair_files[i].second << "]\n";
- process_file_kmers(pair_files[i].first + pair_files[i].second, kmers, kmers_keys);
- j++;
+ cerr << "Generating kmers from file " << j << " of " << cnt << " [" << pair_files[i].second << "]\n";
+ process_file_kmers(pair_files[i].first + pair_files[i].second, kmers, kmers_keys);
+ j++;
}
cerr << kmers.size() << " unique k-mers recorded.\n";
@@ -965,15 +965,15 @@ populate_kmers(vector<pair<string, string> > &pair_files,
return 0;
}
-int
-read_kmer_freq(string in_path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
+int
+read_kmer_freq(string in_path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
{
cerr << "Reading kmer frequencies from '" << in_path.c_str() << "'...\n";
ifstream fh(in_path.c_str(), ifstream::in);
if (fh.fail()) {
- cerr << "Error opening rare kmer frequency input file '" << in_path << "'\n";
- exit(1);
+ cerr << "Error opening rare kmer frequency input file '" << in_path << "'\n";
+ exit(1);
}
char *hash_key;
@@ -986,55 +986,55 @@ read_kmer_freq(string in_path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_k
long i = 1;
while (fh.good()) {
- if (i % 10000 == 0) cerr << " Processing kmer " << i << " \r";
-
- fh.getline(line, max_len);
-
- len = strlen(line);
- if (len == 0) continue;
-
- //
- // Check that there is no carraige return in the buffer.
- //
- if (line[len - 1] == '\r') line[len - 1] = '\0';
-
- //
- // Ignore comments
- //
- if (line[0] == '#') continue;
-
- //
- // Parse the kmer and the number of times it occurred
- // <kmer> <tab> <integer>
- //
- parse_tsv(line, parts);
-
- if (parts.size() != 2) {
- cerr << "kmer frequencies are not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
- exit(0);
- }
-
- strcpy(kmer, parts[1].c_str());
- cnt = is_integer(kmer);
- if (cnt < 0) {
- cerr << "Non integer found in second column.\n";
- exit(0);
- }
-
- strcpy(kmer, parts[0].c_str());
- exists = kmer_map.count(kmer) == 0 ? false : true;
-
- if (exists) {
- cerr << "Warning: kmer '" << kmer << "' already exists in the kmer hash map.\n";
- hash_key = kmer;
- kmer_map[hash_key] += cnt;
- } else {
- hash_key = new char [strlen(kmer) + 1];
- strcpy(hash_key, kmer);
- kmer_map_keys.push_back(hash_key);
- kmer_map[hash_key] = cnt;
- }
- i++;
+ if (i % 10000 == 0) cerr << " Processing kmer " << i << " \r";
+
+ fh.getline(line, max_len);
+
+ len = strlen(line);
+ if (len == 0) continue;
+
+ //
+ // Check that there is no carraige return in the buffer.
+ //
+ if (line[len - 1] == '\r') line[len - 1] = '\0';
+
+ //
+ // Ignore comments
+ //
+ if (line[0] == '#') continue;
+
+ //
+ // Parse the kmer and the number of times it occurred
+ // <kmer> <tab> <integer>
+ //
+ parse_tsv(line, parts);
+
+ if (parts.size() != 2) {
+ cerr << "kmer frequencies are not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
+ exit(0);
+ }
+
+ strcpy(kmer, parts[1].c_str());
+ cnt = is_integer(kmer);
+ if (cnt < 0) {
+ cerr << "Non integer found in second column.\n";
+ exit(0);
+ }
+
+ strcpy(kmer, parts[0].c_str());
+ exists = kmer_map.count(kmer) == 0 ? false : true;
+
+ if (exists) {
+ cerr << "Warning: kmer '" << kmer << "' already exists in the kmer hash map.\n";
+ hash_key = kmer;
+ kmer_map[hash_key] += cnt;
+ } else {
+ hash_key = new char [strlen(kmer) + 1];
+ strcpy(hash_key, kmer);
+ kmer_map_keys.push_back(hash_key);
+ kmer_map[hash_key] = cnt;
+ }
+ i++;
}
fh.close();
@@ -1054,8 +1054,8 @@ write_kmer_freq(string path, SeqKmerHash &kmer_map)
ofstream out_fh(path.c_str(), ifstream::out);
if (out_fh.fail()) {
- cerr << "Error opening rare kmer output file '" << path << "'\n";
- exit(1);
+ cerr << "Error opening rare kmer output file '" << path << "'\n";
+ exit(1);
}
SeqKmerHash::iterator i;
@@ -1063,7 +1063,7 @@ write_kmer_freq(string path, SeqKmerHash &kmer_map)
out_fh << "# Kmer\tCount\n";
for (i = kmer_map.begin(); i != kmer_map.end(); i++) {
- out_fh << i->first << "\t" << i->second << "\n";
+ out_fh << i->first << "\t" << i->second << "\n";
}
out_fh.close();
@@ -1098,8 +1098,8 @@ process_file_kmers(string path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_
//
Seq *s = fh->next_seq();
if (s == NULL) {
- cerr << "Unable to allocate Seq object.\n";
- exit(1);
+ cerr << "Unable to allocate Seq object.\n";
+ exit(1);
}
int num_kmers;
@@ -1107,34 +1107,34 @@ process_file_kmers(string path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_
long i = 1;
do {
- if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
+ if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
- num_kmers = strlen(s->seq) - kmer_len + 1;
+ num_kmers = strlen(s->seq) - kmer_len + 1;
- //
- // Generate and hash the kmers for this raw read
- //
- kmer[kmer_len] = '\0';
+ //
+ // Generate and hash the kmers for this raw read
+ //
+ kmer[kmer_len] = '\0';
- for (j = 0; j < num_kmers; j++) {
- strncpy(kmer, s->seq + j, kmer_len);
+ for (j = 0; j < num_kmers; j++) {
+ strncpy(kmer, s->seq + j, kmer_len);
- exists = kmer_map.count(kmer) == 0 ? false : true;
+ exists = kmer_map.count(kmer) == 0 ? false : true;
- if (exists) {
- hash_key = kmer;
- } else {
- hash_key = new char [kmer_len + 1];
- strcpy(hash_key, kmer);
- kmer_map_keys.push_back(hash_key);
- }
+ if (exists) {
+ hash_key = kmer;
+ } else {
+ hash_key = new char [kmer_len + 1];
+ strcpy(hash_key, kmer);
+ kmer_map_keys.push_back(hash_key);
+ }
- kmer_map[hash_key]++;
- }
+ kmer_map[hash_key]++;
+ }
- delete s;
+ delete s;
- i++;
+ i++;
} while ((s = fh->next_seq()) != NULL);
delete [] kmer;
@@ -1147,7 +1147,7 @@ process_file_kmers(string path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_
return 0;
}
-int
+int
generate_kmer_dist(SeqKmerHash &kmer_map)
{
SeqKmerHash::iterator i;
@@ -1156,18 +1156,18 @@ generate_kmer_dist(SeqKmerHash &kmer_map)
cerr << "Generating kmer distribution...\n";
for (i = kmer_map.begin(); i != kmer_map.end(); i++)
- bins[i->second]++;
+ bins[i->second]++;
map<uint, uint>::iterator j;
vector<pair<uint, uint> > sorted_kmers;
for (j = bins.begin(); j != bins.end(); j++)
- sorted_kmers.push_back(make_pair(j->first, j->second));
+ sorted_kmers.push_back(make_pair(j->first, j->second));
cout << "KmerFrequency\tCount\n";
for (unsigned long k = 0; k < sorted_kmers.size(); k++)
- cout << sorted_kmers[k].first << "\t" << sorted_kmers[k].second << "\n";
+ cout << sorted_kmers[k].first << "\t" << sorted_kmers[k].second << "\n";
return 0;
}
@@ -1185,13 +1185,13 @@ calc_kmer_median(SeqKmerHash &kmers, double &kmer_med, double &kmer_mad)
SeqKmerHash::iterator i;
for (i = kmers.begin(); i != kmers.end(); i++)
- freqs.push_back(i->second);
+ freqs.push_back(i->second);
sort(freqs.begin(), freqs.end());
- kmer_med = num_kmers % 2 == 0 ?
- (double) (freqs[num_kmers / 2 - 1] + freqs[num_kmers / 2]) / 2.0 :
- (double) freqs[num_kmers / 2 - 1];
+ kmer_med = num_kmers % 2 == 0 ?
+ (double) (freqs[num_kmers / 2 - 1] + freqs[num_kmers / 2]) / 2.0 :
+ (double) freqs[num_kmers / 2 - 1];
//
// Calculate the median absolute deviation.
@@ -1199,27 +1199,27 @@ calc_kmer_median(SeqKmerHash &kmers, double &kmer_med, double &kmer_mad)
residuals.reserve(num_kmers);
for (int j = 0; j < num_kmers; j++)
- residuals.push_back(abs(freqs[j] - (int) kmer_med));
+ residuals.push_back(abs(freqs[j] - (int) kmer_med));
sort(residuals.begin(), residuals.end());
- kmer_mad = num_kmers % 2 == 0 ?
- (double) (residuals[num_kmers / 2 - 1] + residuals[num_kmers / 2]) / 2.0 :
- (double) residuals[num_kmers / 2 - 1];
+ kmer_mad = num_kmers % 2 == 0 ?
+ (double) (residuals[num_kmers / 2 - 1] + residuals[num_kmers / 2]) / 2.0 :
+ (double) residuals[num_kmers / 2 - 1];
return 0;
}
-int
+int
kmer_map_cmp(pair<char *, long> a, pair<char *, long> b)
{
return (a.second < b.second);
}
-inline bool
-normalize_kmer_lookup(SeqKmerHash &kmer_map,
- char *read, char *kmer,
- int num_kmers,
- vector<char *> &kmer_keys)
+inline bool
+normalize_kmer_lookup(SeqKmerHash &kmer_map,
+ char *read, char *kmer,
+ int num_kmers,
+ vector<char *> &kmer_keys)
{
kmer[kmer_len] = '\0';
int cnt = 0;
@@ -1233,25 +1233,25 @@ normalize_kmer_lookup(SeqKmerHash &kmer_map,
// cout << "# " << read << "\n";
for (int j = 0; j < num_kmers; j++) {
- strncpy(kmer, read + j, kmer_len);
+ strncpy(kmer, read + j, kmer_len);
- cnt = kmer_map.count(kmer) > 0 ? kmer_map[kmer] : 0;
- sorted_cnts.push_back(cnt);
+ cnt = kmer_map.count(kmer) > 0 ? kmer_map[kmer] : 0;
+ sorted_cnts.push_back(cnt);
- // cout << kmer << "\t" << j << "\t" << cnt << "\n";
+ // cout << kmer << "\t" << j << "\t" << cnt << "\n";
}
//
// Calculate the median kmer frequency along the read.
//
sort(sorted_cnts.begin(), sorted_cnts.end());
- double median = num_kmers % 2 == 0 ?
- (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
- (double) sorted_cnts[num_kmers / 2 - 1];
+ double median = num_kmers % 2 == 0 ?
+ (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
+ (double) sorted_cnts[num_kmers / 2 - 1];
// cout << "# median: " << median << "\n";
if (median > normalize_lim)
- retain = false;
+ retain = false;
//
// Generate and hash the kmers for this raw read
@@ -1261,29 +1261,29 @@ normalize_kmer_lookup(SeqKmerHash &kmer_map,
kmer[kmer_len] = '\0';
for (int j = 0; j < num_kmers; j++) {
- strncpy(kmer, read + j, kmer_len);
+ strncpy(kmer, read + j, kmer_len);
- exists = kmer_map.count(kmer) == 0 ? false : true;
+ exists = kmer_map.count(kmer) == 0 ? false : true;
- if (exists) {
- hash_key = kmer;
- } else {
- hash_key = new char [kmer_len + 1];
- strcpy(hash_key, kmer);
- kmer_keys.push_back(hash_key);
- }
+ if (exists) {
+ hash_key = kmer;
+ } else {
+ hash_key = new char [kmer_len + 1];
+ strcpy(hash_key, kmer);
+ kmer_keys.push_back(hash_key);
+ }
- kmer_map[hash_key]++;
+ kmer_map[hash_key]++;
}
return retain;
}
-inline int
-kmer_lookup(SeqKmerHash &kmer_map,
- char *read, char *kmer,
- int num_kmers,
- int &rare_k, int &abundant_k)
+inline int
+kmer_lookup(SeqKmerHash &kmer_map,
+ char *read, char *kmer,
+ int num_kmers,
+ int &rare_k, int &abundant_k)
{
//
// Generate kmers from this read, lookup kmer frequency in dataset.
@@ -1300,65 +1300,65 @@ kmer_lookup(SeqKmerHash &kmer_map,
// cout << "# " << read << "\n";
for (int j = 0; j < num_kmers; j++) {
- strncpy(kmer, read + j, kmer_len);
+ strncpy(kmer, read + j, kmer_len);
- cnt = kmer_map[kmer];
- cnts.push_back(cnt);
- sorted_cnts.push_back(cnt);
+ cnt = kmer_map[kmer];
+ cnts.push_back(cnt);
+ sorted_cnts.push_back(cnt);
- // cout << kmer << "\t" << j << "\t" << cnt << "\n";
+ // cout << kmer << "\t" << j << "\t" << cnt << "\n";
- if (cnt >= max_k_freq) abundant_k++;
+ if (cnt >= max_k_freq) abundant_k++;
}
//
// Calculate the median kmer frequency along the read.
//
sort(sorted_cnts.begin(), sorted_cnts.end());
- double median = num_kmers % 2 == 0 ?
- (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
- (double) sorted_cnts[num_kmers / 2 - 1];
+ double median = num_kmers % 2 == 0 ?
+ (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
+ (double) sorted_cnts[num_kmers / 2 - 1];
// cout << "# median: " << median << "\n";
double bound = round(median * min_k_pct);
// cout << "# kmer cov bound: " << bound << "\n";
//
- // Look for runs of rare kmers.
+ // Look for runs of rare kmers.
//
- // We will slide a window across the read, f represents the front of the window, b
- // represents the back. Each time a kmer is below the bound we will increment run_cnt,
- // which represents the number of kmers in the window below the bound. If 2/3 of the
+ // We will slide a window across the read, f represents the front of the window, b
+ // represents the back. Each time a kmer is below the bound we will increment run_cnt,
+ // which represents the number of kmers in the window below the bound. If 2/3 of the
// kmers in the window go below the bound, assume a sequencing error has occurred.
- //
+ //
int run_cnt = 0;
int b = 0;
for (int f = 0; f < num_kmers; f++) {
- if (f >= kmer_len) {
- b++;
- if (cnts[b] <= bound)
- run_cnt--;
- }
- if (cnts[f] <= bound) {
- run_cnt++;
- if (run_cnt >= min_lim) {
- rare_k++;
- // cout << "# Rejecting read, position: " << f << "; run_cnt: " << run_cnt << "\n";
- return 0;
- }
- }
- // cout << "# b: " << b << "; f: " << f << "; run_cnt: " << run_cnt << "; counts[front]: " << cnts[f] << "; bound: " << bound << "\n";
+ if (f >= kmer_len) {
+ b++;
+ if (cnts[b] <= bound)
+ run_cnt--;
+ }
+ if (cnts[f] <= bound) {
+ run_cnt++;
+ if (run_cnt >= min_lim) {
+ rare_k++;
+ // cout << "# Rejecting read, position: " << f << "; run_cnt: " << run_cnt << "\n";
+ return 0;
+ }
+ }
+ // cout << "# b: " << b << "; f: " << f << "; run_cnt: " << run_cnt << "; counts[front]: " << cnts[f] << "; bound: " << bound << "\n";
}
// cout << "\n\n";
return 0;
}
-// inline int
-// kmer_lookup(SeqKmerHash &kmer_map,
-// char *read, char *kmer,
-// int num_kmers,
-// int &rare_k, int &abundant_k, bool &complex)
+// inline int
+// kmer_lookup(SeqKmerHash &kmer_map,
+// char *read, char *kmer,
+// int num_kmers,
+// int &rare_k, int &abundant_k, bool &complex)
// {
// //
// // Generate kmers from this read, lookup kmer frequency in dataset.
@@ -1376,26 +1376,26 @@ kmer_lookup(SeqKmerHash &kmer_map,
// // cout << "# " << read << "\n";
// for (int j = 0; j < num_kmers; j++) {
-// strncpy(kmer, read + j, kmer_len);
-
-// cnt = kmer_map[kmer];
-
-// if (cnt >= 100000)
-// cnts.push_back(100000);
-// else if (cnt >= 10000)
-// cnts.push_back(10000);
-// else if (cnt >= 1000)
-// cnts.push_back(1000);
-// else if (cnt >= 100)
-// cnts.push_back(100);
-// else if (cnt >= 10)
-// cnts.push_back(10);
-// else
-// cnts.push_back(1);
-
-// // cout << kmer << "\t" << j << "\t" << cnt << "\n";
-
-// if (cnt >= max_k_freq) abundant_k++;
+// strncpy(kmer, read + j, kmer_len);
+
+// cnt = kmer_map[kmer];
+
+// if (cnt >= 100000)
+// cnts.push_back(100000);
+// else if (cnt >= 10000)
+// cnts.push_back(10000);
+// else if (cnt >= 1000)
+// cnts.push_back(1000);
+// else if (cnt >= 100)
+// cnts.push_back(100);
+// else if (cnt >= 10)
+// cnts.push_back(10);
+// else
+// cnts.push_back(1);
+
+// // cout << kmer << "\t" << j << "\t" << cnt << "\n";
+
+// if (cnt >= max_k_freq) abundant_k++;
// }
// // //
@@ -1405,78 +1405,78 @@ kmer_lookup(SeqKmerHash &kmer_map,
// // int cov = cnts[0];
// // cout << "\nDetermining transitions:\n" << kmer << "\t" << "0" << "\t" << cnts[0] << "\n";
// // for (int j = 1; j < num_kmers; j++)
-// // if (cnts[j] != cov) {
-// // cov = cnts[j];
-// // t++;
-// // cout << kmer << "\t" << j << "\t" << cnts[j] << ": Transition." << "\n";
-// // } else {
-// // cout << kmer << "\t" << j << "\t" << cnts[j] << "\n";
-// // }
+// // if (cnts[j] != cov) {
+// // cov = cnts[j];
+// // t++;
+// // cout << kmer << "\t" << j << "\t" << cnts[j] << ": Transition." << "\n";
+// // } else {
+// // cout << kmer << "\t" << j << "\t" << cnts[j] << "\n";
+// // }
// // cerr << t << " total cnts.\n";
// //
-// // Look for runs of kmers at various orders of magnitude.
+// // Look for runs of kmers at various orders of magnitude.
// //
-// // We will slide a window across the read, f represents the front of the window, b
-// // represents the back. Each time a kmer is below the bound we will increment run_cnt,
-// // which represents the number of kmers in the window below the bound. If 2/3 of the
+// // We will slide a window across the read, f represents the front of the window, b
+// // represents the back. Each time a kmer is below the bound we will increment run_cnt,
+// // which represents the number of kmers in the window below the bound. If 2/3 of the
// // kmers in the window go below the bound, assume a sequencing error has occurred.
// // Run counters:
// // 1 10 100 1k 10k 100k
-// // runs[0] runs[1] runs[2] runs[3] runs[4] runs[5]
+// // runs[0] runs[1] runs[2] runs[3] runs[4] runs[5]
// int runs[6] = {0};
// int prev_cnt, run_cnt, tot_trans;
// int f = 0;
// while (f < num_kmers) {
-// tot_trans = 0;
-// run_cnt = 1;
-// prev_cnt = cnts[f];
-// f++;
-
-// while (f < num_kmers && cnts[f] == prev_cnt) {
-// // cout << "# window front: " << f << "; run_cnt: " << run_cnt << "; prev_cnt: " << prev_cnt << "\n";
-// f++;
-// run_cnt++;
-// }
-
-// if (run_cnt >= rare_k_lim) {
-// // cout << "# found transition run, position: " << f-1 << "; run_cnt: " << run_cnt << "\n";
-// switch(prev_cnt) {
-// case 1:
-// runs[0]++;
-// break;
-// case 10:
-// runs[1]++;
-// break;
-// case 100:
-// runs[2]++;
-// break;
-// case 1000:
-// runs[3]++;
-// break;
-// case 10000:
-// runs[4]++;
-// break;
-// case 100000:
-// runs[5]++;
-// break;
-// }
-// }
-
-// for (int j = 0; j < 6; j++)
-// if (runs[j] > 0) tot_trans++;
-
-// // cout << "# Total transitions: " << tot_trans << "\n";
-
-// if (tot_trans >= transition_lim) {
-// // cout << "# Rejecting read.\n";
-// rare_k++;
-// return 0;
-// }
+// tot_trans = 0;
+// run_cnt = 1;
+// prev_cnt = cnts[f];
+// f++;
+
+// while (f < num_kmers && cnts[f] == prev_cnt) {
+// // cout << "# window front: " << f << "; run_cnt: " << run_cnt << "; prev_cnt: " << prev_cnt << "\n";
+// f++;
+// run_cnt++;
+// }
+
+// if (run_cnt >= rare_k_lim) {
+// // cout << "# found transition run, position: " << f-1 << "; run_cnt: " << run_cnt << "\n";
+// switch(prev_cnt) {
+// case 1:
+// runs[0]++;
+// break;
+// case 10:
+// runs[1]++;
+// break;
+// case 100:
+// runs[2]++;
+// break;
+// case 1000:
+// runs[3]++;
+// break;
+// case 10000:
+// runs[4]++;
+// break;
+// case 100000:
+// runs[5]++;
+// break;
+// }
+// }
+
+// for (int j = 0; j < 6; j++)
+// if (runs[j] > 0) tot_trans++;
+
+// // cout << "# Total transitions: " << tot_trans << "\n";
+
+// if (tot_trans >= transition_lim) {
+// // cout << "# Rejecting read.\n";
+// rare_k++;
+// return 0;
+// }
// }
// // cout << "\n\n";
@@ -1484,11 +1484,11 @@ kmer_lookup(SeqKmerHash &kmer_map,
// return 0;
// }
-int
+int
free_kmer_hash(SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
{
for (uint i = 0; i < kmer_map_keys.size(); i++) {
- delete [] kmer_map_keys[i];
+ delete [] kmer_map_keys[i];
}
kmer_map_keys.clear();
@@ -1504,24 +1504,24 @@ int print_results(map<string, map<string, long> > &counters) {
ofstream log(log_path.c_str());
if (log.fail()) {
- cerr << "Unable to open log file '" << log_path << "'\n";
- return 0;
+ cerr << "Unable to open log file '" << log_path << "'\n";
+ return 0;
}
cerr << "Outputing details to log: '" << log_path << "'\n\n";
log << "File\t"
- << "Retained Reads\t"
- << "Rare K\t"
- << "Abundant K\t"
- << "Total\n";
+ << "Retained Reads\t"
+ << "Rare K\t"
+ << "Abundant K\t"
+ << "Total\n";
for (it = counters.begin(); it != counters.end(); it++) {
- log << it->first << "\t"
- << it->second["retained"] << "\t"
- << it->second["rare_k"] << "\t"
- << it->second["abundant_k"] << "\t"
- << it->second["total"] << "\n";
+ log << it->first << "\t"
+ << it->second["retained"] << "\t"
+ << it->second["rare_k"] << "\t"
+ << it->second["abundant_k"] << "\t"
+ << it->second["total"] << "\n";
}
map<string, long> c;
@@ -1531,20 +1531,20 @@ int print_results(map<string, map<string, long> > &counters) {
// Total up the individual counters
//
for (it = counters.begin(); it != counters.end(); it++) {
- c["total"] += it->second["total"];
- c["retained"] += it->second["retained"];
- c["rare_k"] += it->second["rare_k"];
- c["abundant_k"] += it->second["abundant_k"];
+ c["total"] += it->second["total"];
+ c["retained"] += it->second["retained"];
+ c["rare_k"] += it->second["rare_k"];
+ c["abundant_k"] += it->second["abundant_k"];
}
- cerr <<
- c["total"] << " total sequences;\n"
- << " " << c["rare_k"] << " rare k-mer reads;\n"
- << " " << c["abundant_k"] << " abundant k-mer reads;\n"
- << c["retained"] << " retained reads.\n";
+ cerr <<
+ c["total"] << " total sequences;\n"
+ << " " << c["rare_k"] << " rare k-mer reads;\n"
+ << " " << c["abundant_k"] << " abundant k-mer reads;\n"
+ << c["retained"] << " retained reads.\n";
- log << "Total Sequences\t" << c["total"] << "\n"
- << "Retained Reads\t" << c["retained"] << "\n";
+ log << "Total Sequences\t" << c["total"] << "\n"
+ << "Retained Reads\t" << c["retained"] << "\n";
log.close();
@@ -1559,64 +1559,64 @@ int build_file_list(vector<string> &in_files, vector<pair<string, string> > &fil
// Scan a directory for a list of files.
//
if (in_path.length() > 0) {
- struct dirent *direntry;
-
- DIR *dir = opendir(in_path.c_str());
-
- if (dir == NULL) {
- cerr << "Unable to open directory '" << in_path << "' for reading.\n";
- exit(1);
- }
-
- while ((direntry = readdir(dir)) != NULL) {
- file = direntry->d_name;
-
- if (file.substr(0, 1) == ".")
- continue;
-
- //
- // If the file is gzip'ed, remove the '.gz' suffix.
- //
- pos = file.find_last_of(".");
- if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
- file.substr(pos) == ".gz") {
- file = file.substr(0, pos);
- pos = file.find_last_of(".");
- }
-
- //
- // Check that the remaining file name has the right suffix.
- //
- suffix = file.substr(pos + 1);
- if (in_file_type == FileT::fastq && (suffix.substr(0, 2) == "fq" || suffix.substr(0, 5) == "fastq"))
- files.push_back(make_pair(in_path, file));
- else if (in_file_type == FileT::fasta && (suffix.substr(0, 2) == "fa" || suffix.substr(0, 5) == "fasta"))
- files.push_back(make_pair(in_path, file));
- }
-
- if (files.size() == 0)
- cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
+ struct dirent *direntry;
+
+ DIR *dir = opendir(in_path.c_str());
+
+ if (dir == NULL) {
+ cerr << "Unable to open directory '" << in_path << "' for reading.\n";
+ exit(1);
+ }
+
+ while ((direntry = readdir(dir)) != NULL) {
+ file = direntry->d_name;
+
+ if (file.substr(0, 1) == ".")
+ continue;
+
+ //
+ // If the file is gzip'ed, remove the '.gz' suffix.
+ //
+ pos = file.find_last_of(".");
+ if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
+ file.substr(pos) == ".gz") {
+ file = file.substr(0, pos);
+ pos = file.find_last_of(".");
+ }
+
+ //
+ // Check that the remaining file name has the right suffix.
+ //
+ suffix = file.substr(pos + 1);
+ if (in_file_type == FileT::fastq && (suffix.substr(0, 2) == "fq" || suffix.substr(0, 5) == "fastq"))
+ files.push_back(make_pair(in_path, file));
+ else if (in_file_type == FileT::fasta && (suffix.substr(0, 2) == "fa" || suffix.substr(0, 5) == "fasta"))
+ files.push_back(make_pair(in_path, file));
+ }
+
+ if (files.size() == 0)
+ cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
} else {
- string path;
-
- for (uint i = 0; i < in_files.size(); i++) {
- //
- // Files specified directly:
- // Break off file path and store path and file name.
- // Check if this is a gzip'ed file and if so, remove 'gz' suffix.
- //
- file = in_files[i];
- pos = file.find_last_of(".");
- if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
- file.substr(pos) == ".gz") {
- file = file.substr(0, pos);
- pos = file.find_last_of(".");
- }
- pos = file.find_last_of("/");
- path = file.substr(0, pos + 1);
- files.push_back(make_pair(path, file.substr(pos+1)));
- }
+ string path;
+
+ for (uint i = 0; i < in_files.size(); i++) {
+ //
+ // Files specified directly:
+ // Break off file path and store path and file name.
+ // Check if this is a gzip'ed file and if so, remove 'gz' suffix.
+ //
+ file = in_files[i];
+ pos = file.find_last_of(".");
+ if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
+ file.substr(pos) == ".gz") {
+ file = file.substr(0, pos);
+ pos = file.find_last_of(".");
+ }
+ pos = file.find_last_of("/");
+ path = file.substr(0, pos + 1);
+ files.push_back(make_pair(path, file.substr(pos+1)));
+ }
}
return 0;
@@ -1625,166 +1625,166 @@ int build_file_list(vector<string> &in_files, vector<pair<string, string> > &fil
int parse_command_line(int argc, char* argv[]) {
string pair_1, pair_2;
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"discards", no_argument, NULL, 'D'},
- {"pair_1", required_argument, NULL, '1'},
- {"pair_2", required_argument, NULL, '2'},
- {"infile_type", required_argument, NULL, 'i'},
- {"outfile_type", required_argument, NULL, 'y'},
- {"file", required_argument, NULL, 'f'},
- {"path", required_argument, NULL, 'p'},
- {"outpath", required_argument, NULL, 'o'},
- {"k_dist", no_argument, NULL, 'I'},
- {"rare", no_argument, NULL, 'R'},
- {"abundant", no_argument, NULL, 'A'},
- {"normalize", required_argument, NULL, 'N'},
- {"k_len", required_argument, NULL, 'K'},
- {"max_k_freq", required_argument, NULL, 'M'},
- {"min_lim", required_argument, NULL, 'F'},
- {"max_lim", required_argument, NULL, 'G'},
- {"min_k_pct", required_argument, NULL, 'P'},
- {"read_k_freq", required_argument, NULL, 'r'},
- {"write_k_freq", required_argument, NULL, 'w'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvRADkP:N:I:w:r:K:F:G:M:m:i:y:f:o:t:p:1:2:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'i':
+ {"discards", no_argument, NULL, 'D'},
+ {"pair_1", required_argument, NULL, '1'},
+ {"pair_2", required_argument, NULL, '2'},
+ {"infile_type", required_argument, NULL, 'i'},
+ {"outfile_type", required_argument, NULL, 'y'},
+ {"file", required_argument, NULL, 'f'},
+ {"path", required_argument, NULL, 'p'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"k_dist", no_argument, NULL, 'I'},
+ {"rare", no_argument, NULL, 'R'},
+ {"abundant", no_argument, NULL, 'A'},
+ {"normalize", required_argument, NULL, 'N'},
+ {"k_len", required_argument, NULL, 'K'},
+ {"max_k_freq", required_argument, NULL, 'M'},
+ {"min_lim", required_argument, NULL, 'F'},
+ {"max_lim", required_argument, NULL, 'G'},
+ {"min_k_pct", required_argument, NULL, 'P'},
+ {"read_k_freq", required_argument, NULL, 'r'},
+ {"write_k_freq", required_argument, NULL, 'w'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvRADkP:N:I:w:r:K:F:G:M:m:i:y:f:o:t:p:1:2:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'i':
if (strcasecmp(optarg, "fasta") == 0)
in_file_type = FileT::fasta;
- else if (strcasecmp(optarg, "gzfasta") == 0)
- in_file_type = FileT::gzfasta;
- else if (strcasecmp(optarg, "gzfastq") == 0)
- in_file_type = FileT::gzfastq;
- else
- in_file_type = FileT::fastq;
- break;
- case 'y':
+ else if (strcasecmp(optarg, "gzfasta") == 0)
+ in_file_type = FileT::gzfasta;
+ else if (strcasecmp(optarg, "gzfastq") == 0)
+ in_file_type = FileT::gzfastq;
+ else
+ in_file_type = FileT::fastq;
+ break;
+ case 'y':
if (strcasecmp(optarg, "fasta") == 0)
out_file_type = FileT::fasta;
- else
- out_file_type = FileT::fastq;
- break;
- case 'f':
- in_files.push_back(optarg);
- break;
- case '1':
- pair_1 = optarg;
- break;
- case '2':
- pair_2 = optarg;
- if (pair_1.length() == 0) help();
- in_pair_files.push_back(pair_1);
- in_pair_files.push_back(pair_2);
- pair_1 = "";
- pair_2 = "";
- break;
- case 'p':
- in_path = optarg;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'D':
- discards = true;
- break;
- case 'I':
- kmer_distr = true;
- break;
- case 'R':
- filter_rare_k = true;
- break;
- case 'A':
- filter_abundant_k = true;
- break;
- case 'N':
- normalize = true;
- normalize_lim = is_integer(optarg);
- break;
- case 'K':
- kmer_len = is_integer(optarg);
- break;
- case 'M':
- max_k_freq = is_integer(optarg);
- break;
- case 'F':
- min_lim = is_integer(optarg);
- break;
- case 'G':
- max_lim = is_integer(optarg);
- break;
- case 'P':
- min_k_pct = is_double(optarg);
- break;
- case 'r':
- read_k_freq = true;
- k_freq_path = optarg;
- break;
- case 'w':
- write_k_freq = true;
- k_freq_path = optarg;
- break;
+ else
+ out_file_type = FileT::fastq;
+ break;
+ case 'f':
+ in_files.push_back(optarg);
+ break;
+ case '1':
+ pair_1 = optarg;
+ break;
+ case '2':
+ pair_2 = optarg;
+ if (pair_1.length() == 0) help();
+ in_pair_files.push_back(pair_1);
+ in_pair_files.push_back(pair_2);
+ pair_1 = "";
+ pair_2 = "";
+ break;
+ case 'p':
+ in_path = optarg;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'D':
+ discards = true;
+ break;
+ case 'I':
+ kmer_distr = true;
+ break;
+ case 'R':
+ filter_rare_k = true;
+ break;
+ case 'A':
+ filter_abundant_k = true;
+ break;
+ case 'N':
+ normalize = true;
+ normalize_lim = is_integer(optarg);
+ break;
+ case 'K':
+ kmer_len = is_integer(optarg);
+ break;
+ case 'M':
+ max_k_freq = is_integer(optarg);
+ break;
+ case 'F':
+ min_lim = is_integer(optarg);
+ break;
+ case 'G':
+ max_lim = is_integer(optarg);
+ break;
+ case 'P':
+ min_k_pct = is_double(optarg);
+ break;
+ case 'r':
+ read_k_freq = true;
+ k_freq_path = optarg;
+ break;
+ case 'w':
+ write_k_freq = true;
+ k_freq_path = optarg;
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
-
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (in_files.size() == 0 && in_pair_files.size() == 0 && in_path.length() == 0) {
- cerr << "You must specify an input file of a directory path to a set of input files.\n";
- help();
+ cerr << "You must specify an input file of a directory path to a set of input files.\n";
+ help();
}
if (in_files.size() > 0 && in_path.length() > 0) {
- cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
- help();
+ cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
+ help();
}
- if (in_path.length() > 0 && in_path.at(in_path.length() - 1) != '/')
- in_path += "/";
+ if (in_path.length() > 0 && in_path.at(in_path.length() - 1) != '/')
+ in_path += "/";
- if (out_path.length() == 0)
- out_path = ".";
+ if (out_path.length() == 0)
+ out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
if (in_file_type == FileT::unknown)
- in_file_type = FileT::fastq;
+ in_file_type = FileT::fastq;
if (read_k_freq && write_k_freq) {
- cerr << "You may either read a set of kmer frequencies, or write kmer frequencies, not both.\n";
- help();
+ cerr << "You may either read a set of kmer frequencies, or write kmer frequencies, not both.\n";
+ help();
}
if (min_k_pct < 0.0 || min_k_pct > 1.0) {
- cerr << "Percentage to consider a kmer rare must be between 0 and 1.0.\n";
- help();
+ cerr << "Percentage to consider a kmer rare must be between 0 and 1.0.\n";
+ help();
}
//
@@ -1792,8 +1792,8 @@ int parse_command_line(int argc, char* argv[]) {
//
struct stat info;
if (stat(out_path.c_str(), &info) != 0) {
- cerr << "Unable to locate the specified output path, '" << out_path << "'\n";
- exit(1);
+ cerr << "Unable to locate the specified output path, '" << out_path << "'\n";
+ exit(1);
}
return 0;
@@ -1808,31 +1808,31 @@ void version() {
void help() {
std::cerr << "kmer_filter " << VERSION << "\n"
<< "kmer_filter [-f in_file_1 [-f in_file_2...] | -p in_dir] [-1 pair_1 -2 pair_2 [-1 pair_1...]] -o out_dir [-i type] [-y type] [-D] [-h]\n"
- << " f: path to the input file if processing single-end seqeunces.\n"
- << " i: input file type, either 'bustard' for the Illumina BUSTARD output files, 'fasta', 'fastq', 'gzfasta', or 'gzfastq' (default 'fastq').\n"
- << " p: path to a directory of files (for single-end files only).\n"
- << " 1: specify the first in a pair of files to be processed together.\n"
- << " 2: specify the second in a pair of files to be processed together.\n"
- << " o: path to output the processed files.\n"
- << " y: output type, either 'fastq' or 'fasta' (default fastq).\n"
- << " D: capture discarded reads to a file.\n"
- << " h: display this help messsage.\n\n"
- << " Filtering options:\n"
- << " --rare: turn on filtering based on rare k-mers.\n"
- << " --abundant: turn on filtering based on abundant k-mers.\n"
- << " --k_len <len>: specify k-mer size (default 15).\n\n"
- << " Advanced filtering options:\n"
- << " --max_k_freq <value>: specify the number of times a kmer must occur to be considered abundant (default 20,000).\n"
- << " --min_lim <value>: specify number of rare kmers occuring in a row required to discard a read (default 80% of the k-mer length).\n"
- << " --max_lim <value>: specify number of abundant kmers required to discard a read (default 80% of the k-mers in a read).\n\n"
- << " Normalize data:\n"
- << " --normalize <depth>: normalize read depth according to k-mer coverage.\n\n"
- << " Characterizing K-mers:\n"
- << " --write_k_freq: write kmers along with their frequency of occurrence and exit.\n"
- << " --k_dist: print k-mer frequency distribution and exit.\n\n"
- << " Advanced input options:\n"
- << " --read_k_freq <path>: read a set of kmers along with their frequencies of occurrence instead of reading raw input files.\n"
- << "\n";
+ << " f: path to the input file if processing single-end sequences.\n"
+ << " i: input file type, either 'bustard' for the Illumina BUSTARD output files, 'fasta', 'fastq', 'gzfasta', or 'gzfastq' (default 'fastq').\n"
+ << " p: path to a directory of files (for single-end files only).\n"
+ << " 1: specify the first in a pair of files to be processed together.\n"
+ << " 2: specify the second in a pair of files to be processed together.\n"
+ << " o: path to output the processed files.\n"
+ << " y: output type, either 'fastq' or 'fasta' (default fastq).\n"
+ << " D: capture discarded reads to a file.\n"
+ << " h: display this help message.\n\n"
+ << " Filtering options:\n"
+ << " --rare: turn on filtering based on rare k-mers.\n"
+ << " --abundant: turn on filtering based on abundant k-mers.\n"
+ << " --k_len <len>: specify k-mer size (default 15).\n\n"
+ << " Advanced filtering options:\n"
+ << " --max_k_freq <value>: specify the number of times a kmer must occur to be considered abundant (default 20,000).\n"
+ << " --min_lim <value>: specify number of rare kmers occurring in a row required to discard a read (default 80% of the k-mer length).\n"
+ << " --max_lim <value>: specify number of abundant kmers required to discard a read (default 80% of the k-mers in a read).\n\n"
+ << " Normalize data:\n"
+ << " --normalize <depth>: normalize read depth according to k-mer coverage.\n\n"
+ << " Characterizing K-mers:\n"
+ << " --write_k_freq: write kmers along with their frequency of occurrence and exit.\n"
+ << " --k_dist: print k-mer frequency distribution and exit.\n\n"
+ << " Advanced input options:\n"
+ << " --read_k_freq <path>: read a set of kmers along with their frequencies of occurrence instead of reading raw input files.\n"
+ << "\n";
exit(0);
}
diff --git a/src/kmer_filter.h b/src/kmer_filter.h
index 8d5efb8..e893b40 100644
--- a/src/kmer_filter.h
+++ b/src/kmer_filter.h
@@ -21,7 +21,7 @@
#ifndef __KMER_FILTER_H__
#define __KMER_FILTER_H__
-#include "constants.h"
+#include "constants.h"
#include <stdlib.h>
#include <getopt.h> // Process command-line options
diff --git a/src/kmers.cc b/src/kmers.cc
index d1bff17..72224a6 100644
--- a/src/kmers.cc
+++ b/src/kmers.cc
@@ -29,7 +29,7 @@ int determine_kmer_length(int read_len, int dist) {
//
// If distance allowed between sequences is 0, then k-mer length equals read length.
//
- if (dist == 0)
+ if (dist == 0)
return read_len;
//
@@ -63,7 +63,7 @@ int calc_min_kmer_matches(int kmer_len, int dist, int read_len, bool exit_err) {
min_matches = read_len - span;
if (min_matches <= 0) {
- cerr <<
+ cerr <<
"Warning: combination of k-mer length (" << kmer_len << ") and edit distance (" << dist << ") allows for " <<
"sequences to be missed by the matching algorithm.\n";
}
@@ -96,16 +96,16 @@ generate_kmers_lazily(const char *seq, uint kmer_len, uint num_kmers, vector<cha
const char *k = seq;
if (num_kmers > kmers.size()) {
- int new_kmers = num_kmers - kmers.size();
+ int new_kmers = num_kmers - kmers.size();
for (int i = 0; i < new_kmers; i++) {
- kmer = new char[kmer_len + 1];
- kmers.push_back(kmer);
- }
+ kmer = new char[kmer_len + 1];
+ kmers.push_back(kmer);
+ }
}
for (uint i = 0; i < num_kmers; i++) {
- kmer = kmers.at(i);
+ kmer = kmers.at(i);
strncpy(kmer, k, kmer_len);
kmer[kmer_len] = '\0';
k++;
@@ -134,7 +134,7 @@ generate_kmers(const char *seq, int kmer_len, int num_kmers, vector<char *> &kme
int generate_permutations(map<int, char **> &pstrings, int width) {
int i, j, rem, div, num;
char *p;
- //
+ //
// Given a k-mer that allows wildcards -- 'N' characters, we need to generate all
// possible k-mers. To do so, we will generate a range of numbers that we convert to
// base 4, assuming that 0 = 'A', 1 = 'C', 2 = 'G', 3 = 'T'.
@@ -150,7 +150,7 @@ int generate_permutations(map<int, char **> &pstrings, int width) {
strings[i] = new char[width + 1];
for (i = 0; i < range; i++) {
- for (j = 0; j < width; j++)
+ for (j = 0; j < width; j++)
strings[i][j] = 'A';
strings[i][width] = '\0';
}
@@ -248,7 +248,7 @@ populate_kmer_hash(map<int, Locus *> &catalog, CatKmerHashMap &kmer_map, vector<
for (it = catalog.begin(); it != catalog.end(); it++) {
tag = it->second;
- num_kmers = strlen(tag->con) - kmer_len + 1;
+ num_kmers = strlen(tag->con) - kmer_len + 1;
//
// Iterate through the possible Catalog alleles
@@ -302,7 +302,7 @@ populate_kmer_hash(map<int, Locus *> &catalog, KmerHashMap &kmer_map, vector<cha
for (it = catalog.begin(); it != catalog.end(); it++) {
tag = it->second;
- num_kmers = strlen(tag->con) - kmer_len + 1;
+ num_kmers = strlen(tag->con) - kmer_len + 1;
//
// Iterate through the possible Catalog alleles
@@ -313,24 +313,24 @@ populate_kmer_hash(map<int, Locus *> &catalog, KmerHashMap &kmer_map, vector<cha
//
generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
- allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
+ allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
for (int j = 0; j < num_kmers; j++) {
hash_key = kmers[j];
- map_it = kmer_map.find(hash_key);
+ map_it = kmer_map.find(hash_key);
if (map_it != kmer_map.end()) {
- map_it->second.push_back(allele_index);
+ map_it->second.push_back(allele_index);
delete [] kmers[j];
- } else {
- kmer_map[hash_key].push_back(allele_index);
+ } else {
+ kmer_map[hash_key].push_back(allele_index);
kmer_map_keys.push_back(hash_key);
- }
+ }
}
kmers.clear();
- allele_index++;
+ allele_index++;
}
}
@@ -362,7 +362,7 @@ populate_kmer_hash(map<int, CLocus *> &catalog, KmerHashMap &kmer_map, vector<ch
for (it = catalog.begin(); it != catalog.end(); it++) {
tag = it->second;
- num_kmers = strlen(tag->con) - kmer_len + 1;
+ num_kmers = strlen(tag->con) - kmer_len + 1;
//
// Iterate through the possible Catalog alleles
@@ -373,24 +373,24 @@ populate_kmer_hash(map<int, CLocus *> &catalog, KmerHashMap &kmer_map, vector<ch
//
generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
- allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
+ allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
for (int j = 0; j < num_kmers; j++) {
hash_key = kmers[j];
- map_it = kmer_map.find(hash_key);
+ map_it = kmer_map.find(hash_key);
if (map_it != kmer_map.end()) {
- map_it->second.push_back(allele_index);
+ map_it->second.push_back(allele_index);
delete [] kmers[j];
- } else {
- kmer_map[hash_key].push_back(allele_index);
+ } else {
+ kmer_map[hash_key].push_back(allele_index);
kmer_map_keys.push_back(hash_key);
- }
+ }
}
kmers.clear();
- allele_index++;
+ allele_index++;
}
}
@@ -399,8 +399,8 @@ populate_kmer_hash(map<int, CLocus *> &catalog, KmerHashMap &kmer_map, vector<ch
return 0;
}
-int
-free_kmer_hash(CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
+int
+free_kmer_hash(CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
{
for (uint i = 0; i < kmer_map_keys.size(); i++) {
kmer_map[kmer_map_keys[i]].clear();
@@ -415,8 +415,8 @@ free_kmer_hash(CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
return 0;
}
-int
-free_kmer_hash(KmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
+int
+free_kmer_hash(KmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
{
for (uint i = 0; i < kmer_map_keys.size(); i++) {
kmer_map[kmer_map_keys[i]].clear();
@@ -442,7 +442,7 @@ int dist(const char *tag_1, Locus *tag_2, allele_type allele) {
vector<pair<allele_type, string> >::iterator it;
for (it = tag_2->strings.begin(); it != tag_2->strings.end(); it++)
- if (it->first == allele)
+ if (it->first == allele)
q = it->second.c_str();
if (q == NULL) return -1;
@@ -452,7 +452,7 @@ int dist(const char *tag_1, Locus *tag_2, allele_type allele) {
// between the two sequences.
while (p < p_end && q < q_end) {
dist += (*p == *q) ? 0 : 1;
- p++;
+ p++;
q++;
}
@@ -556,7 +556,7 @@ int dist(MergedStack *tag_1, MergedStack *tag_2) {
//
while (p < p_end && q < q_end) {
dist += (*p == *q) ? 0 : 1;
- p++;
+ p++;
q++;
}
@@ -588,7 +588,7 @@ int dist(MergedStack *tag_1, char *seq) {
//
while (p < p_end && q < q_end) {
dist += (*p == *q) ? 0 : 1;
- p++;
+ p++;
q++;
}
@@ -608,7 +608,7 @@ int dump_kmer_map(KmerHashMap &kmer_map) {
int i = 1;
for (kit = kmer_map.begin(); kit != kmer_map.end(); kit++) {
cerr << "Key #" << i << " " << kit->first << ": ";
- for (vit = (kit->second).begin(); vit != (kit->second).end(); vit++)
+ for (vit = (kit->second).begin(); vit != (kit->second).end(); vit++)
cerr << " " << *vit;
cerr << "\n";
i++;
diff --git a/src/locus.cc b/src/locus.cc
index 7daee85..ef31b68 100644
--- a/src/locus.cc
+++ b/src/locus.cc
@@ -23,8 +23,8 @@
//
#include "locus.h"
-uint
-Locus::sort_bp(uint k)
+uint
+Locus::sort_bp(uint k)
{
if (this->loc.strand == strand_plus)
return this->loc.bp + k;
@@ -32,8 +32,8 @@ Locus::sort_bp(uint k)
return (k == 0 ? this->loc.bp - this->len + 1 : this->loc.bp - k);
}
-int
-Locus::snp_index(uint col)
+int
+Locus::snp_index(uint col)
{
for (uint i = 0; i < this->snps.size(); i++)
if (this->snps[i]->col == col)
@@ -41,8 +41,8 @@ Locus::snp_index(uint col)
return -1;
}
-int
-Locus::add_consensus(const char *seq)
+int
+Locus::add_consensus(const char *seq)
{
if (this->con != NULL)
delete [] this->con;
@@ -54,8 +54,8 @@ Locus::add_consensus(const char *seq)
return 0;
}
-int
-Locus::add_model(const char *seq)
+int
+Locus::add_model(const char *seq)
{
if (this->model != NULL)
delete [] this->model;
@@ -63,18 +63,21 @@ Locus::add_model(const char *seq)
this->model = new char[this->len + 1];
strncpy(this->model, seq, this->len);
this->model[this->len] = '\0';
-
+
return 0;
}
-int
-Locus::populate_alleles()
+int
+Locus::populate_alleles()
{
vector<SNP *>::iterator i;
map<string, int>::iterator j;
string s;
int k;
+ if (this->len > strlen(this->con))
+ cerr << "Recorded locus->len: " << this->len << "; consensus length: " << strlen(this->con) << "\n";
+
//
// Is this effective?
//
@@ -105,13 +108,13 @@ Locus::populate_alleles()
return 0;
}
-bool
-bp_compare(Locus *a, Locus *b)
+bool
+bp_compare(Locus *a, Locus *b)
{
return (a->sort_bp() < b->sort_bp());
}
-QLocus::~QLocus()
+QLocus::~QLocus()
{
vector<Match *>::iterator it;
@@ -119,8 +122,8 @@ QLocus::~QLocus()
delete *it;
}
-int
-QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance)
+int
+QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance)
{
Match *m = new Match;
@@ -134,8 +137,8 @@ QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type,
return 0;
}
-int
-QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance, string cigar)
+int
+QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance, string cigar)
{
Match *m = new Match;
@@ -144,14 +147,14 @@ QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type,
m->query_type = query_type;
m->dist = distance;
m->cigar = cigar;
-
+
this->matches.push_back(m);
return 0;
}
-int
-QLocus::add_match(int catalog_id, allele_type cat_type)
+int
+QLocus::add_match(int catalog_id, allele_type cat_type)
{
Match *m = new Match;
@@ -173,6 +176,6 @@ QLocus::clear_matches()
for (it = this->matches.begin(); it != this->matches.end(); it++)
delete *it;
this->matches.clear();
-
+
return 0;
}
diff --git a/src/locus.h b/src/locus.h
index 864c7de..f6a14e7 100644
--- a/src/locus.h
+++ b/src/locus.h
@@ -70,20 +70,20 @@ class Locus {
map<string, int> alleles; // Map of the allelic configuration of SNPs in this stack along with the count of each
vector<pair<allele_type, string> > strings; // Strings for matching (representing the various allele combinations)
- Locus() {
- id = 0;
- sample_id = 0;
- depth = 0;
+ Locus() {
+ id = 0;
+ sample_id = 0;
+ depth = 0;
model = NULL;
- con = NULL;
+ con = NULL;
len = 0;
lnl = 0.0;
blacklisted = false;
deleveraged = false;
lumberjackstack = false;
}
- virtual ~Locus() {
- delete [] con;
+ virtual ~Locus() {
+ delete [] con;
delete [] model;
for (uint i = 0; i < snps.size(); i++)
delete snps[i];
@@ -124,8 +124,8 @@ class CLocus : public Locus {
vector<pair<int, int> > sources; // Sample/ID pairs for the sources contributing to this catalog entry
uint match_cnt;
- CLocus() : Locus() {
- this->match_cnt = 0;
+ CLocus() : Locus() {
+ this->match_cnt = 0;
};
int merge_snps(QLocus *);
@@ -133,17 +133,17 @@ class CLocus : public Locus {
};
//
-// Catalog Summary Locus Class; used in genotypes and populations, records a catalog
-// locus with summary information derived from individuals in the population.
+// Catalog Summary Locus Class; used in genotypes and populations, records a catalog
+// locus with summary information derived from individuals in the population.
//
class CSLocus : public Locus {
public:
- CSLocus() : Locus() {
- this->f = 0.0;
- this->cnt = 0;
- this->hcnt = 0;
- this->gcnt = 0;
- this->trans_gcnt = 0;
+ CSLocus() : Locus() {
+ this->f = 0.0;
+ this->cnt = 0;
+ this->hcnt = 0;
+ this->gcnt = 0;
+ this->trans_gcnt = 0;
this->chisq = 1.0;
this->confounded_cnt = 0;
};
@@ -153,12 +153,12 @@ public:
map<string, int> hap_cnts; // Counts of each observed haplotype for this locus in the population.
double f; // Inbreeder's coefficient
map<string, string> gmap; // Observed haplotype to genotype map for this locus.
- int confounded_cnt; // Number of samples/progeny containing confounded loci (more than one
+ int confounded_cnt; // Number of samples/progeny containing confounded loci (more than one
// locus from an individual sample matches this catalog locus).
int hcnt; // Number of samples/progeny containing a haplotype for this locus.
int cnt; // Number of samples/progeny containing data for this locus.
int gcnt; // Number of progeny containing a valid genotype.
- int trans_gcnt; // Number of progeny containing a valid
+ int trans_gcnt; // Number of progeny containing a valid
// genotype, translated for a particular map type.
double chisq; // Chi squared p-value testing the null hypothesis of no segregation distortion.
};
diff --git a/src/log_utils.cc b/src/log_utils.cc
index 9085644..7fa799c 100644
--- a/src/log_utils.cc
+++ b/src/log_utils.cc
@@ -37,8 +37,8 @@ init_log(ofstream &fh, int argc, char **argv)
// Write the command line that was executed.
//
for (int i = 0; i < argc; i++) {
- fh << argv[i];
- if (i < argc - 1) fh << " ";
+ fh << argv[i];
+ if (i < argc - 1) fh << " ";
}
fh << "\n" << argv[0] << " version " << VERSION << " executed " << date << "\n\n";
diff --git a/src/models.cc b/src/models.cc
index d458ed7..548513f 100644
--- a/src/models.cc
+++ b/src/models.cc
@@ -29,18 +29,18 @@
//
#include "models.h"
-snp_type
-call_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps)
+snp_type
+call_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps)
{
vector<pair<char, int> > nuc;
map<char, int>::iterator i;
int total = 0;
for (i = n.begin(); i != n.end(); i++) {
- if (i->first != 'N') {
- total += i->second;
- nuc.push_back(make_pair(i->first, i->second));
- }
+ if (i->first != 'N') {
+ total += i->second;
+ nuc.push_back(make_pair(i->first, i->second));
+ }
}
sort(nuc.begin(), nuc.end(), compare_pair);
@@ -49,17 +49,17 @@ call_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_s
// If this column was simply uncalled Ns, return.
//
if (nuc[0].second == 0) {
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_unk;
- snp->col = col;
- snp->lratio = 0;
- snp->rank_1 = 'N';
- snp->rank_2 = '-';
-
- tag->snps.push_back(snp);
- }
- return snp_type_unk;
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_unk;
+ snp->col = col;
+ snp->lratio = 0;
+ snp->rank_1 = 'N';
+ snp->rank_2 = '-';
+
+ tag->snps.push_back(snp);
+ }
+ return snp_type_unk;
}
//
@@ -80,83 +80,83 @@ call_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_s
l_ratio = (nuc_1 * log(nuc_1 / total));
if (total - nuc_1 > 0)
- l_ratio += ((total - nuc_1) * log((total - nuc_1) / (3 * total)));
+ l_ratio += ((total - nuc_1) * log((total - nuc_1) / (3 * total)));
if (nuc_1 + nuc_2 > 0)
- l_ratio -= ((nuc_1 + nuc_2) * log((nuc_1 + nuc_2) / (2 * total)));
+ l_ratio -= ((nuc_1 + nuc_2) * log((nuc_1 + nuc_2) / (2 * total)));
if (nuc_3 + nuc_4 > 0)
- l_ratio -= ((nuc_3 + nuc_4) * log((nuc_3 + nuc_4) / (2 * total)));
+ l_ratio -= ((nuc_3 + nuc_4) * log((nuc_3 + nuc_4) / (2 * total)));
l_ratio *= 2;
snp_type res;
if (l_ratio <= heterozygote_limit) {
- //
+ //
// This locus is a heterozygote.
- //
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_het;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = nuc[1].first;
-
- tag->snps.push_back(snp);
- }
- res = snp_type_het;
+ //
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_het;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = nuc[1].first;
+
+ tag->snps.push_back(snp);
+ }
+ res = snp_type_het;
} else if (l_ratio >= homozygote_limit) {
- //
+ //
// This locus is a homozygote.
- //
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_hom;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = '-';
-
- tag->snps.push_back(snp);
- }
- res = snp_type_hom;
+ //
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_hom;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = '-';
+
+ tag->snps.push_back(snp);
+ }
+ res = snp_type_hom;
} else {
- //
+ //
// Unknown whether this is a heterozygote or homozygote.
- //
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_unk;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = nuc[1].second > 0 ? nuc[1].first : '-';
-
- tag->snps.push_back(snp);
- }
-
- res = snp_type_unk;
+ //
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_unk;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = nuc[1].second > 0 ? nuc[1].first : '-';
+
+ tag->snps.push_back(snp);
+ }
+
+ res = snp_type_unk;
}
return res;
}
-snp_type
-call_multinomial_snp(Locus *tag, int col, map<char, int> &n)
+snp_type
+call_multinomial_snp(Locus *tag, int col, map<char, int> &n)
{
vector<pair<char, int> > nuc;
map<char, int>::iterator i;
int total = 0;
for (i = n.begin(); i != n.end(); i++) {
- if (i->first != 'N') {
- total += i->second;
- nuc.push_back(make_pair(i->first, i->second));
- }
+ if (i->first != 'N') {
+ total += i->second;
+ nuc.push_back(make_pair(i->first, i->second));
+ }
}
sort(nuc.begin(), nuc.end(), compare_pair);
@@ -165,13 +165,13 @@ call_multinomial_snp(Locus *tag, int col, map<char, int> &n)
// If this column was simply uncalled Ns, return.
//
if (nuc[0].second == 0) {
- tag->snps[col]->type = snp_type_unk;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = 0;
- tag->snps[col]->rank_1 = 'N';
- tag->snps[col]->rank_2 = '-';
+ tag->snps[col]->type = snp_type_unk;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = 0;
+ tag->snps[col]->rank_1 = 'N';
+ tag->snps[col]->rank_2 = '-';
- return snp_type_unk;
+ return snp_type_unk;
}
//
@@ -192,70 +192,70 @@ call_multinomial_snp(Locus *tag, int col, map<char, int> &n)
l_ratio = (nuc_1 * log(nuc_1 / total));
if (total - nuc_1 > 0)
- l_ratio += ((total - nuc_1) * log((total - nuc_1) / (3 * total)));
+ l_ratio += ((total - nuc_1) * log((total - nuc_1) / (3 * total)));
if (nuc_1 + nuc_2 > 0)
- l_ratio -= ((nuc_1 + nuc_2) * log((nuc_1 + nuc_2) / (2 * total)));
+ l_ratio -= ((nuc_1 + nuc_2) * log((nuc_1 + nuc_2) / (2 * total)));
if (nuc_3 + nuc_4 > 0)
- l_ratio -= ((nuc_3 + nuc_4) * log((nuc_3 + nuc_4) / (2 * total)));
+ l_ratio -= ((nuc_3 + nuc_4) * log((nuc_3 + nuc_4) / (2 * total)));
l_ratio *= 2;
snp_type res;
if (l_ratio <= heterozygote_limit) {
- //
+ //
// This locus is a heterozygote.
- //
- tag->snps[col]->type = snp_type_het;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = l_ratio;
- tag->snps[col]->rank_1 = nuc[0].first;
- tag->snps[col]->rank_2 = nuc[1].first;
+ //
+ tag->snps[col]->type = snp_type_het;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = l_ratio;
+ tag->snps[col]->rank_1 = nuc[0].first;
+ tag->snps[col]->rank_2 = nuc[1].first;
- res = snp_type_het;
+ res = snp_type_het;
} else if (l_ratio >= homozygote_limit) {
- //
+ //
// This locus is a homozygote.
- //
- tag->snps[col]->type = snp_type_hom;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = l_ratio;
- tag->snps[col]->rank_1 = nuc[0].first;
- tag->snps[col]->rank_2 = '-';
+ //
+ tag->snps[col]->type = snp_type_hom;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = l_ratio;
+ tag->snps[col]->rank_1 = nuc[0].first;
+ tag->snps[col]->rank_2 = '-';
- res = snp_type_hom;
+ res = snp_type_hom;
} else {
- //
+ //
// Unknown whether this is a heterozygote or homozygote.
- //
- tag->snps[col]->type = snp_type_unk;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = l_ratio;
- tag->snps[col]->rank_1 = nuc[0].first;
- tag->snps[col]->rank_2 = nuc[1].second > 0 ? nuc[1].first : '-';
-
- res = snp_type_unk;
+ //
+ tag->snps[col]->type = snp_type_unk;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = l_ratio;
+ tag->snps[col]->rank_1 = nuc[0].first;
+ tag->snps[col]->rank_2 = nuc[1].second > 0 ? nuc[1].first : '-';
+
+ res = snp_type_unk;
}
return res;
}
-snp_type
-call_bounded_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps)
+snp_type
+call_bounded_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps)
{
vector<pair<char, int> > nuc;
map<char, int>::iterator i;
double total = 0.0;
for (i = n.begin(); i != n.end(); i++) {
- if (i->first != 'N') {
- total += i->second;
- nuc.push_back(make_pair(i->first, i->second));
- }
+ if (i->first != 'N') {
+ total += i->second;
+ nuc.push_back(make_pair(i->first, i->second));
+ }
}
sort(nuc.begin(), nuc.end(), compare_pair);
@@ -264,17 +264,17 @@ call_bounded_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool
// If this column was simply uncalled Ns, return.
//
if (nuc[0].second == 0) {
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_unk;
- snp->col = col;
- snp->lratio = 0;
- snp->rank_1 = 'N';
- snp->rank_2 = '-';
-
- tag->snps.push_back(snp);
- }
- return snp_type_unk;
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_unk;
+ snp->col = col;
+ snp->lratio = 0;
+ snp->rank_1 = 'N';
+ snp->rank_2 = '-';
+
+ tag->snps.push_back(snp);
+ }
+ return snp_type_unk;
}
double nuc_1 = nuc[0].second;
@@ -298,14 +298,14 @@ call_bounded_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool
// Check if the error rate is above or below the specified bound.
//
if (epsilon_hom < bound_low)
- epsilon_hom = bound_low;
+ epsilon_hom = bound_low;
else if (epsilon_hom > bound_high)
- epsilon_hom = bound_high;
+ epsilon_hom = bound_high;
if (epsilon_het < bound_low)
- epsilon_het = bound_low;
+ epsilon_het = bound_low;
else if (epsilon_het > bound_high)
- epsilon_het = bound_high;
+ epsilon_het = bound_high;
//
// Calculate the log likelihood for the homozygous and heterozygous genotypes.
@@ -316,84 +316,84 @@ call_bounded_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool
double ln_L_het = (nuc_1 + nuc_2) * log(0.5 - (epsilon_het / 4.0));
ln_L_het += epsilon_het > 0 ? ((nuc_3 + nuc_4) * log(epsilon_het / 4.0)) : 0;
- //
+ //
// Calculate the likelihood ratio.
//
double l_ratio = 2 * (ln_L_hom - ln_L_het);
- // cerr << " Nuc_1: " << nuc_1 << " Nuc_2: " << nuc_2 << " Nuc_3: " << nuc_3 << " Nuc_4: " << nuc_4
- // << " epsilon homozygote: " << epsilon_hom
- // << " epsilon heterozygote: " << epsilon_het
- // << " Log likelihood hom: " << ln_L_hom
- // << " Log likelihood het: " << ln_L_het
- // << " Likelihood ratio: " << l_ratio << "\n";
+ // cerr << " Nuc_1: " << nuc_1 << " Nuc_2: " << nuc_2 << " Nuc_3: " << nuc_3 << " Nuc_4: " << nuc_4
+ // << " epsilon homozygote: " << epsilon_hom
+ // << " epsilon heterozygote: " << epsilon_het
+ // << " Log likelihood hom: " << ln_L_hom
+ // << " Log likelihood het: " << ln_L_het
+ // << " Likelihood ratio: " << l_ratio << "\n";
snp_type res;
if (l_ratio <= heterozygote_limit) {
- //
+ //
// This locus is a heterozygote.
- //
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_het;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = nuc[1].first;
-
- tag->snps.push_back(snp);
- }
- res = snp_type_het;
+ //
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_het;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = nuc[1].first;
+
+ tag->snps.push_back(snp);
+ }
+ res = snp_type_het;
} else if (l_ratio >= homozygote_limit) {
- //
+ //
// This locus is a homozygote.
- //
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_hom;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = '-';
-
- tag->snps.push_back(snp);
- }
- res = snp_type_hom;
+ //
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_hom;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = '-';
+
+ tag->snps.push_back(snp);
+ }
+ res = snp_type_hom;
} else {
- //
+ //
// Unknown whether this is a heterozygote or homozygote.
- //
- if (record_snps) {
- SNP *snp = new SNP;
- snp->type = snp_type_unk;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = nuc[1].first;
-
- tag->snps.push_back(snp);
- }
- res = snp_type_unk;
+ //
+ if (record_snps) {
+ SNP *snp = new SNP;
+ snp->type = snp_type_unk;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = nuc[1].first;
+
+ tag->snps.push_back(snp);
+ }
+ res = snp_type_unk;
}
return res;
}
-snp_type
-call_bounded_multinomial_snp(Locus *tag, int col, map<char, int> &n)
+snp_type
+call_bounded_multinomial_snp(Locus *tag, int col, map<char, int> &n)
{
vector<pair<char, int> > nuc;
map<char, int>::iterator i;
double total = 0.0;
for (i = n.begin(); i != n.end(); i++) {
- if (i->first != 'N') {
- total += i->second;
- nuc.push_back(make_pair(i->first, i->second));
- }
+ if (i->first != 'N') {
+ total += i->second;
+ nuc.push_back(make_pair(i->first, i->second));
+ }
}
sort(nuc.begin(), nuc.end(), compare_pair);
@@ -402,13 +402,13 @@ call_bounded_multinomial_snp(Locus *tag, int col, map<char, int> &n)
// If this column was simply uncalled Ns, return.
//
if (nuc[0].second == 0) {
- tag->snps[col]->type = snp_type_unk;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = 0;
- tag->snps[col]->rank_1 = 'N';
- tag->snps[col]->rank_2 = '-';
+ tag->snps[col]->type = snp_type_unk;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = 0;
+ tag->snps[col]->rank_1 = 'N';
+ tag->snps[col]->rank_2 = '-';
- return snp_type_unk;
+ return snp_type_unk;
}
double nuc_1 = nuc[0].second;
@@ -432,14 +432,14 @@ call_bounded_multinomial_snp(Locus *tag, int col, map<char, int> &n)
// Check if the error rate is above or below the specified bound.
//
if (epsilon_hom < bound_low)
- epsilon_hom = bound_low;
+ epsilon_hom = bound_low;
else if (epsilon_hom > bound_high)
- epsilon_hom = bound_high;
+ epsilon_hom = bound_high;
if (epsilon_het < bound_low)
- epsilon_het = bound_low;
+ epsilon_het = bound_low;
else if (epsilon_het > bound_high)
- epsilon_het = bound_high;
+ epsilon_het = bound_high;
//
// Calculate the log likelihood for the homozygous and heterozygous genotypes.
@@ -450,62 +450,62 @@ call_bounded_multinomial_snp(Locus *tag, int col, map<char, int> &n)
double ln_L_het = (nuc_1 + nuc_2) * log(0.5 - (epsilon_het / 4.0));
ln_L_het += epsilon_het > 0 ? ((nuc_3 + nuc_4) * log(epsilon_het / 4.0)) : 0;
- //
+ //
// Calculate the likelihood ratio.
//
double l_ratio = 2 * (ln_L_hom - ln_L_het);
- // cerr << " Nuc_1: " << nuc_1 << " Nuc_2: " << nuc_2 << " Nuc_3: " << nuc_3 << " Nuc_4: " << nuc_4
- // << " epsilon homozygote: " << epsilon_hom
- // << " epsilon heterozygote: " << epsilon_het
- // << " Log likelihood hom: " << ln_L_hom
- // << " Log likelihood het: " << ln_L_het
- // << " Likelihood ratio: " << l_ratio << "\n";
+ // cerr << " Nuc_1: " << nuc_1 << " Nuc_2: " << nuc_2 << " Nuc_3: " << nuc_3 << " Nuc_4: " << nuc_4
+ // << " epsilon homozygote: " << epsilon_hom
+ // << " epsilon heterozygote: " << epsilon_het
+ // << " Log likelihood hom: " << ln_L_hom
+ // << " Log likelihood het: " << ln_L_het
+ // << " Likelihood ratio: " << l_ratio << "\n";
snp_type res;
if (l_ratio <= heterozygote_limit) {
- //
+ //
// This locus is a heterozygote.
- //
- tag->snps[col]->type = snp_type_het;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = l_ratio;
- tag->snps[col]->rank_1 = nuc[0].first;
- tag->snps[col]->rank_2 = nuc[1].first;
+ //
+ tag->snps[col]->type = snp_type_het;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = l_ratio;
+ tag->snps[col]->rank_1 = nuc[0].first;
+ tag->snps[col]->rank_2 = nuc[1].first;
- res = snp_type_het;
+ res = snp_type_het;
} else if (l_ratio >= homozygote_limit) {
- //
+ //
// This locus is a homozygote.
- //
- tag->snps[col]->type = snp_type_hom;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = l_ratio;
- tag->snps[col]->rank_1 = nuc[0].first;
- tag->snps[col]->rank_2 = '-';
+ //
+ tag->snps[col]->type = snp_type_hom;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = l_ratio;
+ tag->snps[col]->rank_1 = nuc[0].first;
+ tag->snps[col]->rank_2 = '-';
- res = snp_type_hom;
+ res = snp_type_hom;
} else {
- //
+ //
// Unknown whether this is a heterozygote or homozygote.
- //
- tag->snps[col]->type = snp_type_unk;
- tag->snps[col]->col = col;
- tag->snps[col]->lratio = l_ratio;
- tag->snps[col]->rank_1 = nuc[0].first;
- tag->snps[col]->rank_2 = nuc[1].first;
-
- res = snp_type_unk;
+ //
+ tag->snps[col]->type = snp_type_unk;
+ tag->snps[col]->col = col;
+ tag->snps[col]->lratio = l_ratio;
+ tag->snps[col]->rank_1 = nuc[0].first;
+ tag->snps[col]->rank_2 = nuc[1].first;
+
+ res = snp_type_unk;
}
return res;
}
-int
-call_multinomial_fixed (MergedStack *tag, int col, map<char, int> &n)
+int
+call_multinomial_fixed (MergedStack *tag, int col, map<char, int> &n)
{
const double nucleotide_fixed_limit = 1.92;
@@ -514,24 +514,24 @@ call_multinomial_fixed (MergedStack *tag, int col, map<char, int> &n)
int total = 0;
for (i = n.begin(); i != n.end(); i++) {
- if (i->first != 'N') {
- total += i->second;
- nuc.push_back(make_pair(i->first, i->second));
- }
+ if (i->first != 'N') {
+ total += i->second;
+ nuc.push_back(make_pair(i->first, i->second));
+ }
}
sort(nuc.begin(), nuc.end(), compare_pair);
if (nuc[0].second == 0) {
- SNP *snp = new SNP;
- snp->type = snp_type_unk;
- snp->col = col;
- snp->lratio = 0;
- snp->rank_1 = 'N';
- snp->rank_2 = '-';
-
- tag->snps.push_back(snp);
- return snp_type_unk;
+ SNP *snp = new SNP;
+ snp->type = snp_type_unk;
+ snp->col = col;
+ snp->lratio = 0;
+ snp->rank_1 = 'N';
+ snp->rank_2 = '-';
+
+ tag->snps.push_back(snp);
+ return snp_type_unk;
}
//
@@ -556,60 +556,60 @@ call_multinomial_fixed (MergedStack *tag, int col, map<char, int> &n)
n_ratio = nuc_1 / (nuc_1 + nuc_2);
- l_ratio =
- nuc_1 * log( ((4 * nuc_1 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
- ((4 * p_freq * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );
+ l_ratio =
+ nuc_1 * log( ((4 * nuc_1 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
+ ((4 * p_freq * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );
- l_ratio +=
- nuc_2 * log( ((4 * nuc_2 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
- ((4 * (1 - p_freq) * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );
+ l_ratio +=
+ nuc_2 * log( ((4 * nuc_2 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
+ ((4 * (1 - p_freq) * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );
//cerr << "Nuc_1: " << nuc_1 << " Nuc_2: " << nuc_2 << " Likelihood ratio: " << l_ratio << "\n";
if (n_ratio < p_freq || l_ratio < nucleotide_fixed_limit) {
- //
- // This position is likely a SNP, record it's homozygosity as 'unknown'.
- //
- SNP *snp = new SNP;
- snp->type = snp_type_unk;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = nuc[1].first;
-
- tag->snps.push_back(snp);
+ //
+ // This position is likely a SNP, record it's homozygosity as 'unknown'.
+ //
+ SNP *snp = new SNP;
+ snp->type = snp_type_unk;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = nuc[1].first;
+
+ tag->snps.push_back(snp);
} else {
- //
- // Otherwise, this position is homozygous.
- //
- SNP *snp = new SNP;
- snp->type = snp_type_hom;
- snp->col = col;
- snp->lratio = l_ratio;
- snp->rank_1 = nuc[0].first;
- snp->rank_2 = nuc[1].first;
-
- tag->snps.push_back(snp);
+ //
+ // Otherwise, this position is homozygous.
+ //
+ SNP *snp = new SNP;
+ snp->type = snp_type_hom;
+ snp->col = col;
+ snp->lratio = l_ratio;
+ snp->rank_1 = nuc[0].first;
+ snp->rank_2 = nuc[1].first;
+
+ tag->snps.push_back(snp);
}
return 0;
}
//
-// ln L(1/2) = ln(n! / n_1!n_2!n_3!n_4!) +
-// (n_1 + n_2) * ln(n_1 + n_2 / 2n) +
+// ln L(1/2) = ln(n! / n_1!n_2!n_3!n_4!) +
+// (n_1 + n_2) * ln(n_1 + n_2 / 2n) +
// (n_3 + n_4) * ln(n_3 + n_4 / 2n)
//
-double
-heterozygous_likelihood(int col, map<char, int> &nuc)
+double
+heterozygous_likelihood(int col, map<char, int> &nuc)
{
vector<pair<char, int> > cnts;
map<char, int>::iterator i;
double n = 0;
for (i = nuc.begin(); i != nuc.end(); i++) {
- n += i->second;
- cnts.push_back(make_pair(i->first, i->second));
+ n += i->second;
+ cnts.push_back(make_pair(i->first, i->second));
}
sort(cnts.begin(), cnts.end(), compare_pair);
@@ -619,35 +619,35 @@ heterozygous_likelihood(int col, map<char, int> &nuc)
double n_3 = cnts[2].second;
double n_4 = cnts[3].second;
- double term_1 =
- reduced_log_factorial(n, n_1) -
+ double term_1 =
+ reduced_log_factorial(n, n_1) -
(log_factorial(n_2) + log_factorial(n_3) + log_factorial(n_4));
double term_3 = (n_3 + n_4 > 0) ? log((n_3 + n_4) / (2 * n)) : 0;
- double lnl =
- term_1 +
- ((n_1 + n_2) * log((n_1 + n_2) / (2 * n))) +
+ double lnl =
+ term_1 +
+ ((n_1 + n_2) * log((n_1 + n_2) / (2 * n))) +
((n_3 + n_4) * term_3);
return lnl;
}
//
-// ln L(1/1) = ln(n! / n_1!n_2!n_3!n_4!) +
-// n_1 * ln(n_1 / n) +
+// ln L(1/1) = ln(n! / n_1!n_2!n_3!n_4!) +
+// n_1 * ln(n_1 / n) +
// (n - n_1) * ln(n - n_1 / 3n)
//
-double
-homozygous_likelihood(int col, map<char, int> &nuc)
+double
+homozygous_likelihood(int col, map<char, int> &nuc)
{
vector<pair<char, int> > cnts;
map<char, int>::iterator i;
double n = 0;
for (i = nuc.begin(); i != nuc.end(); i++) {
- n += i->second;
- cnts.push_back(make_pair(i->first, i->second));
+ n += i->second;
+ cnts.push_back(make_pair(i->first, i->second));
}
sort(cnts.begin(), cnts.end(), compare_pair);
@@ -657,15 +657,15 @@ homozygous_likelihood(int col, map<char, int> &nuc)
double n_3 = cnts[2].second;
double n_4 = cnts[3].second;
- double term_1 =
+ double term_1 =
reduced_log_factorial(n, n_1) -
(log_factorial(n_2) + log_factorial(n_3) + log_factorial(n_4));
double term_3 = n - n_1 > 0 ? log((n - n_1) / (3 * n)) : 0;
- double lnl =
- term_1 +
- (n_1 * log(n_1 / n)) +
+ double lnl =
+ term_1 +
+ (n_1 * log(n_1 / n)) +
((n - n_1) * term_3);
return lnl;
diff --git a/src/mst.cc b/src/mst.cc
index c143a0d..6daf63c 100644
--- a/src/mst.cc
+++ b/src/mst.cc
@@ -163,7 +163,7 @@ int MinSpanTree::build_tree() {
}
//
- // Iterate through all of the edges of n and update the
+ // Iterate through all of the edges of n and update the
// minimum distance to the proper nodes.
//
Edge *e;
@@ -225,11 +225,11 @@ string MinSpanTree::vis(bool overlay) {
visited.insert(n->id);
for (uint i = 0; i < n->min_adj_list.size(); i++) {
- data << " ";
- n->label.length() > 0 ? data << n->label : data << n->id;
- data << "--";
- n->min_adj_list[i]->label.length() > 0 ? (data << n->min_adj_list[i]->label) : (data << n->min_adj_list[i]->id);
- data << "\n";
+ data << " ";
+ n->label.length() > 0 ? data << n->label : data << n->id;
+ data << "--";
+ n->min_adj_list[i]->label.length() > 0 ? (data << n->min_adj_list[i]->label) : (data << n->min_adj_list[i]->id);
+ data << "\n";
if (visited.count(n->min_adj_list[i]->id) == 0)
q.push(n->min_adj_list[i]);
}
@@ -244,7 +244,7 @@ string MinSpanTree::vis(bool overlay) {
for (i = this->nodes.begin(); i != this->nodes.end(); i++) {
n = i->second;
for (j = 0; j < n->edges.size(); j++)
- scale = n->edges[j]->dist > scale ? n->edges[j]->dist : scale;
+ scale = n->edges[j]->dist > scale ? n->edges[j]->dist : scale;
}
scale = scale / scale_factor;
@@ -256,13 +256,13 @@ string MinSpanTree::vis(bool overlay) {
for (j = 0; j < n->edges.size(); j++) {
d = n->edges[j]->dist;
scaled_d = d / scale;
- scaled_d = scaled_d < 0.75 ? 0.75 : scaled_d;
- sprintf(label, "%.1f", d);
+ scaled_d = scaled_d < 0.75 ? 0.75 : scaled_d;
+ sprintf(label, "%.1f", d);
- n->label.length() > 0 ? (data << n->label) : (data << n->id);
- data << " -- ";
- n->edges[j]->child->label.length() > 0 ? (data << n->edges[j]->child->label) : (data << n->edges[j]->child->id);
- data << " [len=" << scaled_d << ", label=" << label << "];\n";
+ n->label.length() > 0 ? (data << n->label) : (data << n->id);
+ data << " -- ";
+ n->edges[j]->child->label.length() > 0 ? (data << n->edges[j]->child->label) : (data << n->edges[j]->child->id);
+ data << " [len=" << scaled_d << ", label=" << label << "];\n";
}
}
diff --git a/src/mst.h b/src/mst.h
index bc4f94e..71c4b5a 100644
--- a/src/mst.h
+++ b/src/mst.h
@@ -65,8 +65,8 @@ public:
vector<Node *> min_adj_list;
Node(uint id) {
- this->id = id;
- this->parent = NULL;
+ this->id = id;
+ this->parent = NULL;
this->update = true;
this->min_dist = 1000000;
}
diff --git a/src/mstack.cc b/src/mstack.cc
index 8d7054f..5062676 100644
--- a/src/mstack.cc
+++ b/src/mstack.cc
@@ -31,7 +31,7 @@
#include "mstack.h"
#include "models.h"
-MergedStack::MergedStack() {
+MergedStack::MergedStack() {
this->id = 0;
this->count = 0;
this->len = 0;
@@ -40,13 +40,14 @@ MergedStack::MergedStack() {
this->lnl = 0.0;
this->cohort_id = -1;
- this->deleveraged = false;
- this->masked = false;
- this->blacklisted = false;
- this->lumberjackstack = false;
+ this->deleveraged = false;
+ this->masked = false;
+ this->blacklisted = false;
+ this->lumberjackstack = false;
+ this->gappedlumberjack = false;
}
-MergedStack::~MergedStack() {
+MergedStack::~MergedStack() {
delete [] this->con;
for (uint i = 0; i < snps.size(); i++)
@@ -57,7 +58,7 @@ MergedStack::~MergedStack() {
int MergedStack::add_consensus(const char *seq) {
if (this->con != NULL)
- delete [] this->con;
+ delete [] this->con;
this->len = strlen(seq);
this->con = new char[len + 1];
@@ -69,7 +70,7 @@ int MergedStack::add_consensus(const char *seq) {
int MergedStack::add_consensus(DNASeq *seq) {
if (this->con != NULL)
- delete [] this->con;
+ delete [] this->con;
this->len = seq->size;
this->con = new char[this->len + 1];
@@ -80,7 +81,7 @@ int MergedStack::add_consensus(DNASeq *seq) {
int MergedStack::add_consensus(DNANSeq *seq) {
if (this->con != NULL)
- delete [] this->con;
+ delete [] this->con;
this->len = seq->size();
this->con = new char[this->len + 1];
@@ -108,7 +109,7 @@ MergedStack::gen_matrix(map<int, Stack *> &unique, map<int, Rem *> &rem)
//
// Create a two-dimensional array, each row containing one read. For
// each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
+ // that tag into our array as many times as it originally occurred.
//
// We do not allocate memory for the second dimension of the array, we simply
// reuse the existing char arrays in the unique and rem maps
@@ -129,7 +130,7 @@ MergedStack::gen_matrix(map<int, Stack *> &unique, map<int, Rem *> &rem)
}
}
- // For each remainder tag that has been merged into this Stack, add the sequence.
+ // For each remainder tag that has been merged into this Stack, add the sequence.
for (j = this->remtags.begin(); j != this->remtags.end(); j++) {
this->matrix[i] = rem[*j]->seq;
i++;
@@ -139,13 +140,13 @@ MergedStack::gen_matrix(map<int, Stack *> &unique, map<int, Rem *> &rem)
}
DNANSeq **
-MergedStack::gen_matrix(map<int, PStack *> &unique)
+MergedStack::gen_matrix(map<int, PStack *> &unique)
{
PStack *tag;
//
// Create a two-dimensional array, each row containing one read. For
// each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
+ // that tag into our array as many times as it originally occurred.
//
// We do not allocate memory for the second dimension of the array, we simply
// reuse the existing char arrays in the unique and rem maps
@@ -169,8 +170,8 @@ MergedStack::gen_matrix(map<int, PStack *> &unique)
return this->matrix;
}
-double
-MergedStack::calc_likelihood()
+double
+MergedStack::calc_likelihood()
{
if (this->matrix == NULL || this->snps.size() == 0)
return 0;
@@ -202,7 +203,7 @@ MergedStack::calc_likelihood()
continue;
}
- nuc['A'] = 0;
+ nuc['A'] = 0;
nuc['G'] = 0;
nuc['C'] = 0;
nuc['T'] = 0;
@@ -211,9 +212,9 @@ MergedStack::calc_likelihood()
// Count the nucleotide type at each position in the column.
//
for (row = 0; row < height; row++) {
- d = this->matrix[row];
+ d = this->matrix[row];
nuc[(*d)[col]]++;
- }
+ }
//
// Find the base with a plurality of occurances and call it.
//
@@ -225,23 +226,23 @@ MergedStack::calc_likelihood()
max = n;
}
- //
- // For nucleotide positions with potential polymorphism (i.e. two or more alleles at
- // the locus that differ at that position), first find the ML genotype (call_multinomial_snp).
- // If it returns 'het' calculate the heterozygous_likelihood(), otherwise calculate homozygous
- // likelihood.
- //
- snp_type res = this->snps[col]->type;
-
- if (res == snp_type_het)
- this->lnl += heterozygous_likelihood(col, nuc);
- else if (res == snp_type_hom)
- this->lnl += homozygous_likelihood(col, nuc);
- else {
- double homlnl = homozygous_likelihood(col, nuc);
- double hetlnl = heterozygous_likelihood(col, nuc);
- this->lnl += hetlnl > homlnl ? hetlnl : homlnl;
- }
+ //
+ // For nucleotide positions with potential polymorphism (i.e. two or more alleles at
+ // the locus that differ at that position), first find the ML genotype (call_multinomial_snp).
+ // If it returns 'het' calculate the heterozygous_likelihood(), otherwise calculate homozygous
+ // likelihood.
+ //
+ snp_type res = this->snps[col]->type;
+
+ if (res == snp_type_het)
+ this->lnl += heterozygous_likelihood(col, nuc);
+ else if (res == snp_type_hom)
+ this->lnl += homozygous_likelihood(col, nuc);
+ else {
+ double homlnl = homozygous_likelihood(col, nuc);
+ double hetlnl = heterozygous_likelihood(col, nuc);
+ this->lnl += hetlnl > homlnl ? hetlnl : homlnl;
+ }
}
return this->lnl;
diff --git a/src/ordered.h b/src/ordered.h
index e3455f8..63ba5d2 100644
--- a/src/ordered.h
+++ b/src/ordered.h
@@ -61,7 +61,7 @@ public:
};
template<class StatT>
-int
+int
Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci)
{
CSLocus *loc;
@@ -74,14 +74,14 @@ Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, v
// account for positions in the genome that are covered by more than one RAD tag.
//
for (uint pos = 0; pos < sorted_loci.size(); pos++) {
- loc = sorted_loci[pos];
- len = strlen(loc->con);
- ltally = this->psum->locus_tally(loc->id);
-
- for (int k = 0; k < len; k++) {
- if (ltally->nucs[k].allele_cnt == 2)
- bps.insert(ltally->nucs[k].bp);
- }
+ loc = sorted_loci[pos];
+ len = strlen(loc->con);
+ ltally = this->psum->locus_tally(loc->id);
+
+ for (int k = 0; k < len; k++) {
+ if (ltally->nucs[k].allele_cnt == 2)
+ bps.insert(ltally->nucs[k].bp);
+ }
}
sites.resize(bps.size(), NULL);
@@ -92,15 +92,15 @@ Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, v
set<int>::iterator it;
int i = 0;
for (it = bps.begin(); it != bps.end(); it++) {
- sites_key[*it] = i;
- i++;
+ sites_key[*it] = i;
+ i++;
}
return 0;
}
template<class StatT>
-int
+int
Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci, int pop_id)
{
CSLocus *loc;
@@ -113,14 +113,14 @@ Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, v
// account for positions in the genome that are covered by more than one RAD tag.
//
for (uint pos = 0; pos < sorted_loci.size(); pos++) {
- loc = sorted_loci[pos];
- len = strlen(loc->con);
- lsum = this->psum->pop(loc->id, pop_id);
-
- for (int k = 0; k < len; k++) {
- if (lsum->nucs[k].num_indv > 0)
- bps.insert(lsum->nucs[k].bp);
- }
+ loc = sorted_loci[pos];
+ len = strlen(loc->con);
+ lsum = this->psum->pop(loc->id, pop_id);
+
+ for (int k = 0; k < len; k++) {
+ if (lsum->nucs[k].num_indv > 0)
+ bps.insert(lsum->nucs[k].bp);
+ }
}
sites.resize(bps.size(), NULL);
@@ -131,15 +131,15 @@ Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, v
set<int>::iterator it;
int i = 0;
for (it = bps.begin(); it != bps.end(); it++) {
- sites_key[*it] = i;
- i++;
+ sites_key[*it] = i;
+ i++;
}
return 0;
}
template<class StatT>
-int
+int
Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci, int pop_id_1, int pop_id_2)
{
CSLocus *loc;
@@ -152,16 +152,16 @@ Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, v
// account for positions in the genome that are covered by more than one RAD tag.
//
for (uint pos = 0; pos < sorted_loci.size(); pos++) {
- loc = sorted_loci[pos];
- len = strlen(loc->con);
- lsum_1 = this->psum->pop(loc->id, pop_id_1);
- lsum_2 = this->psum->pop(loc->id, pop_id_2);
-
- for (int k = 0; k < len; k++) {
- if (lsum_1->nucs[k].num_indv > 0 &&
- lsum_2->nucs[k].num_indv > 0)
- bps.insert(lsum_1->nucs[k].bp); // slow
- }
+ loc = sorted_loci[pos];
+ len = strlen(loc->con);
+ lsum_1 = this->psum->pop(loc->id, pop_id_1);
+ lsum_2 = this->psum->pop(loc->id, pop_id_2);
+
+ for (int k = 0; k < len; k++) {
+ if (lsum_1->nucs[k].num_indv > 0 &&
+ lsum_2->nucs[k].num_indv > 0)
+ bps.insert(lsum_1->nucs[k].bp); // slow
+ }
}
sites.resize(bps.size(), NULL);
@@ -172,26 +172,26 @@ Ordered<StatT>::init_sites(vector<StatT *> &sites, map<uint, uint> &sites_key, v
set<int>::iterator it;
int i = 0;
for (it = bps.begin(); it != bps.end(); it++) {
- sites_key[*it] = i; // slow
- i++;
+ sites_key[*it] = i; // slow
+ i++;
}
return 0;
}
template<class StatT>
-int
-Ordered<StatT>::init_haplotypes(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci)
+int
+Ordered<StatT>::init_haplotypes(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci)
{
CSLocus *loc;
int bp;
set<int> bps;
for (uint pos = 0; pos < sorted_loci.size(); pos++) {
- loc = sorted_loci[pos];
- bp = loc->sort_bp();
+ loc = sorted_loci[pos];
+ bp = loc->sort_bp();
- bps.insert(bp);
+ bps.insert(bp);
}
sites.resize(bps.size(), NULL);
@@ -202,8 +202,8 @@ Ordered<StatT>::init_haplotypes(vector<StatT *> &sites, map<uint, uint> &sites_k
set<int>::iterator it;
int i = 0;
for (it = bps.begin(); it != bps.end(); it++) {
- sites_key[*it] = i;
- i++;
+ sites_key[*it] = i;
+ i++;
}
return 0;
@@ -218,8 +218,8 @@ public:
};
template<class StatT>
-int
-OHaplotypes<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci)
+int
+OHaplotypes<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci)
{
this->init_haplotypes(sites, sites_key, sorted_loci);
@@ -229,17 +229,17 @@ OHaplotypes<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, ve
template<class StatT>
class OPopPair: public Ordered<StatT> {
public:
- OPopPair(PopSum<CSLocus> *psum, ofstream &log_fh): Ordered<StatT>() {
- this->log_fh = &log_fh;
- this->psum = psum;
+ OPopPair(PopSum<CSLocus> *psum, ofstream &log_fh): Ordered<StatT>() {
+ this->log_fh = &log_fh;
+ this->psum = psum;
}
int order(vector<StatT *> &, map<uint, uint> &, vector<CSLocus *> &, int, int);
};
template<class StatT>
-int
-OPopPair<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci, int pop_1, int pop_2)
+int
+OPopPair<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vector<CSLocus *> &sorted_loci, int pop_1, int pop_2)
{
CSLocus *loc;
StatT *pair;
@@ -251,61 +251,61 @@ OPopPair<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vecto
this->init_sites(sites, sites_key, sorted_loci, pop_1, pop_2);
for (uint pos = 0; pos < sorted_loci.size(); pos++) {
- loc = sorted_loci[pos];
- len = strlen(loc->con);
-
- for (int k = 0; k < len; k++) {
-
- pair = this->psum->Fst(loc->id, pop_1, pop_2, k);
-
- //
- // Locus is incompatible, log this position.
- //
- if (pair == NULL) {
- this->incompatible_loci++;
- *(this->log_fh) << "between_population\t"
- << "incompatible_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp(k) +1 << "\t"
- << k << "\t"
- << mpopi.pops()[pop_1].name << "\t"
- << mpopi.pops()[pop_2].name << "\n";
- delete pair;
- continue;
- }
-
- pair->loc_id = loc->id;
- pair->bp = loc->sort_bp(k);
- pair->col = k;
-
- //
- // Locus is fixed in both populations, or was only found in one population.
- //
- if (pair->pi == 0) {
- delete pair;
- continue;
- }
-
- //
- // Check if this basepair position is already covered by a RAD site.
- //
- if (sites[sites_key[pair->bp]] != NULL) {
- this->multiple_loci++;
- *(this->log_fh) << "between_population\t"
- << "multiple_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << pair->bp +1 << "\t"
- << k << "\t"
- << mpopi.pops()[pop_1].name << "\t"
- << mpopi.pops()[pop_2].name << "\n";
- delete pair;
- continue;
- }
-
- sites[sites_key[pair->bp]] = pair;
- }
+ loc = sorted_loci[pos];
+ len = strlen(loc->con);
+
+ for (int k = 0; k < len; k++) {
+
+ pair = this->psum->Fst(loc->id, pop_1, pop_2, k);
+
+ //
+ // Locus is incompatible, log this position.
+ //
+ if (pair == NULL) {
+ this->incompatible_loci++;
+ *(this->log_fh) << "between_population\t"
+ << "incompatible_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp(k) +1 << "\t"
+ << k << "\t"
+ << mpopi.pops()[pop_1].name << "\t"
+ << mpopi.pops()[pop_2].name << "\n";
+ delete pair;
+ continue;
+ }
+
+ pair->loc_id = loc->id;
+ pair->bp = loc->sort_bp(k);
+ pair->col = k;
+
+ //
+ // Locus is fixed in both populations, or was only found in one population.
+ //
+ if (pair->pi == 0) {
+ delete pair;
+ continue;
+ }
+
+ //
+ // Check if this basepair position is already covered by a RAD site.
+ //
+ if (sites[sites_key[pair->bp]] != NULL) {
+ this->multiple_loci++;
+ *(this->log_fh) << "between_population\t"
+ << "multiple_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << pair->bp +1 << "\t"
+ << k << "\t"
+ << mpopi.pops()[pop_1].name << "\t"
+ << mpopi.pops()[pop_2].name << "\n";
+ delete pair;
+ continue;
+ }
+
+ sites[sites_key[pair->bp]] = pair;
+ }
}
return 0;
@@ -314,17 +314,17 @@ OPopPair<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vecto
template<class StatT>
class OSumStat: public Ordered<StatT> {
public:
- OSumStat(PopSum<CSLocus> *psum, ofstream &log_fh): Ordered<StatT>() {
- this->log_fh = &log_fh;
- this->psum = psum;
+ OSumStat(PopSum<CSLocus> *psum, ofstream &log_fh): Ordered<StatT>() {
+ this->log_fh = &log_fh;
+ this->psum = psum;
}
int order(vector<StatT *> &, vector<CSLocus *> &, int);
};
template<class StatT>
-int
-OSumStat<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci, int pop_id)
+int
+OSumStat<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci, int pop_id)
{
this->incompatible_loci = 0;
this->multiple_loci = 0;
@@ -338,35 +338,35 @@ OSumStat<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci, i
int len;
//
- // Assign nucleotides to their proper, ordered location in the genome,
+ // Assign nucleotides to their proper, ordered location in the genome,
// checking that a site hasn't already been covered by another RAD locus.
//
for (uint pos = 0; pos < sorted_loci.size(); pos++) {
- loc = sorted_loci[pos];
- len = strlen(loc->con);
- lsum = this->psum->pop(loc->id, pop_id);
-
- for (int k = 0; k < len; k++) {
- if (lsum->nucs[k].num_indv == 0) continue;
-
- if (sites_key.count(lsum->nucs[k].bp) == 0) {
- cerr << "Error: locus " << lsum->nucs[k].loc_id << " at " << lsum->nucs[k].bp +1 << "bp is not defined in the sites map.\n";
-
- } else if (sites[sites_key[lsum->nucs[k].bp]] == NULL) {
- sites[sites_key[lsum->nucs[k].bp]] = &(lsum->nucs[k]);
-
- } else {
- this->multiple_loci++;
- *(this->log_fh) << "within_population\t"
- << "multiple_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << lsum->nucs[k].bp +1 << "\t"
- << k << "\t"
- << mpopi.pops()[pop_id].name << "\t"
- << "conflicts with locus " << sites[sites_key[lsum->nucs[k].bp]]->loc_id << "\n";
- }
- }
+ loc = sorted_loci[pos];
+ len = strlen(loc->con);
+ lsum = this->psum->pop(loc->id, pop_id);
+
+ for (int k = 0; k < len; k++) {
+ if (lsum->nucs[k].num_indv == 0) continue;
+
+ if (sites_key.count(lsum->nucs[k].bp) == 0) {
+ cerr << "Error: locus " << lsum->nucs[k].loc_id << " at " << lsum->nucs[k].bp +1 << "bp is not defined in the sites map.\n";
+
+ } else if (sites[sites_key[lsum->nucs[k].bp]] == NULL) {
+ sites[sites_key[lsum->nucs[k].bp]] = &(lsum->nucs[k]);
+
+ } else {
+ this->multiple_loci++;
+ *(this->log_fh) << "within_population\t"
+ << "multiple_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << lsum->nucs[k].bp +1 << "\t"
+ << k << "\t"
+ << mpopi.pops()[pop_id].name << "\t"
+ << "conflicts with locus " << sites[sites_key[lsum->nucs[k].bp]]->loc_id << "\n";
+ }
+ }
}
return 0;
@@ -375,17 +375,17 @@ OSumStat<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci, i
template<class StatT>
class OLocTally: public Ordered<StatT> {
public:
- OLocTally(PopSum<CSLocus> *psum, ofstream &log_fh): Ordered<StatT>() {
- this->log_fh = &log_fh;
- this->psum = psum;
+ OLocTally(PopSum<CSLocus> *psum, ofstream &log_fh): Ordered<StatT>() {
+ this->log_fh = &log_fh;
+ this->psum = psum;
}
int order(vector<StatT *> &, vector<CSLocus *> &);
};
template<class StatT>
-int
-OLocTally<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci)
+int
+OLocTally<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci)
{
this->incompatible_loci = 0;
this->multiple_loci = 0;
@@ -399,34 +399,34 @@ OLocTally<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci)
int len;
//
- // Assign nucleotides to their proper, ordered location in the genome,
+ // Assign nucleotides to their proper, ordered location in the genome,
// checking that a site hasn't already been covered by another RAD locus.
//
for (uint pos = 0; pos < sorted_loci.size(); pos++) {
- loc = sorted_loci[pos];
- len = strlen(loc->con);
- ltally = this->psum->locus_tally(loc->id);
-
- for (int k = 0; k < len; k++) {
- if (ltally->nucs[k].allele_cnt != 2) continue;
-
- if (sites_key.count(ltally->nucs[k].bp) == 0) {
- cerr << "Error: locus " << ltally->nucs[k].loc_id << " at " << ltally->nucs[k].bp +1 << "bp is not defined in the sites map.\n";
-
- } else if (sites[sites_key[ltally->nucs[k].bp]] == NULL) {
- sites[sites_key[ltally->nucs[k].bp]] = &(ltally->nucs[k]);
-
- } else {
- this->multiple_loci++;
- *(this->log_fh) << "within_population\t"
- << "multiple_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << ltally->nucs[k].bp +1 << "\t"
- << k << "\t"
- << "conflicts with locus " << sites[sites_key[ltally->nucs[k].bp]]->loc_id << "\n";
- }
- }
+ loc = sorted_loci[pos];
+ len = strlen(loc->con);
+ ltally = this->psum->locus_tally(loc->id);
+
+ for (int k = 0; k < len; k++) {
+ if (ltally->nucs[k].allele_cnt != 2) continue;
+
+ if (sites_key.count(ltally->nucs[k].bp) == 0) {
+ cerr << "Error: locus " << ltally->nucs[k].loc_id << " at " << ltally->nucs[k].bp +1 << "bp is not defined in the sites map.\n";
+
+ } else if (sites[sites_key[ltally->nucs[k].bp]] == NULL) {
+ sites[sites_key[ltally->nucs[k].bp]] = &(ltally->nucs[k]);
+
+ } else {
+ this->multiple_loci++;
+ *(this->log_fh) << "within_population\t"
+ << "multiple_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << ltally->nucs[k].bp +1 << "\t"
+ << k << "\t"
+ << "conflicts with locus " << sites[sites_key[ltally->nucs[k].bp]]->loc_id << "\n";
+ }
+ }
}
return 0;
diff --git a/src/phasedstacks.cc b/src/phasedstacks.cc
index 350fe20..998fca5 100644
--- a/src/phasedstacks.cc
+++ b/src/phasedstacks.cc
@@ -54,33 +54,33 @@ int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
if (p_value_cutoff == 0.1) {
- chi_sq_limit = 2.71;
+ chi_sq_limit = 2.71;
} else if (p_value_cutoff == 0.05) {
- chi_sq_limit = 3.84;
+ chi_sq_limit = 3.84;
} else if (p_value_cutoff == 0.01) {
- chi_sq_limit = 6.64;
+ chi_sq_limit = 6.64;
} else if (p_value_cutoff == 0.001) {
- chi_sq_limit = 10.83;
+ chi_sq_limit = 10.83;
}
cerr << "Minor allele frequency cutoff: " << minor_freq_lim << "\n"
- << "Looking for ";
+ << "Looking for ";
switch(in_file_type) {
case FileT::beagle:
- cerr << "Beagle";
- break;
+ cerr << "Beagle";
+ break;
case FileT::phase:
- cerr << "PHASE";
- break;
+ cerr << "PHASE";
+ break;
case FileT::fastphase:
default:
- cerr << "fastPhase";
- break;
+ cerr << "fastPhase";
+ break;
}
cerr << " input files.\n"
- << "Size of buckets for binning D' values at a particular distance: " << bucket_dist / 1000 << "kb.\n";
+ << "Size of buckets for binning D' values at a particular distance: " << bucket_dist / 1000 << "kb.\n";
if (dprime_threshold)
- cerr << "D' Threshold set at " << dprime_threshold_level << ". D' values above this limit will be set to 1.0, values below will be set to 0.0.\n";
+ cerr << "D' Threshold set at " << dprime_threshold_level << ". D' values above this limit will be set to 1.0, values below will be set to 0.0.\n";
//
// Parse the population map.
@@ -96,7 +96,7 @@ int main (int argc, char* argv[]) {
vector<pair<int, string> > files;
if (!build_file_list(files))
- exit(1);
+ exit(1);
cerr << "Identified " << files.size() << " files.\n";
@@ -110,7 +110,7 @@ int main (int argc, char* argv[]) {
if (log_fh.fail()) {
cerr << "Error opening log file '" << log_path << "'\n";
- exit(1);
+ exit(1);
}
init_log(log_fh, argc, argv);
@@ -125,8 +125,8 @@ int main (int argc, char* argv[]) {
int res;
catalog_file << cat_path << "batch_" << batch_id << ".catalog";
if ((res = load_loci(catalog_file.str(), catalog, false, false, compressed)) == 0) {
- cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
- return 0;
+ cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
+ return 0;
}
cerr << "done.\n";
@@ -145,68 +145,68 @@ int main (int argc, char* argv[]) {
for (uint i = 0; i < files.size(); i++) {
- // if (files[i].second != "batch_1.groupV.phase") continue;
-
- PhasedSummary *psum = NULL;
-
- if (in_file_type == FileT::fastphase) {
- if ((psum = parse_fastphase(in_path + files[i].second)) == NULL) {
- cerr << "Unable to parse fastPhase input files.\n";
- exit(1);
- }
- } else if (in_file_type == FileT::beagle && haplotypes) {
- if ((psum = parse_beagle_haplotypes(catalog, in_path + files[i].second)) == NULL) {
- cerr << "Unable to parse Beagle input files.\n";
- exit(1);
- }
- } else if (in_file_type == FileT::beagle) {
- if ((psum = parse_beagle(catalog, in_path + files[i].second)) == NULL) {
- cerr << "Unable to parse Beagle input files.\n";
- exit(1);
- }
- }
-
- //
- // Summarize the genotypes in the populations.
- //
- summarize_phased_genotypes(psum);
-
- // for (uint j = 0; j < psum->size; j++) {
- // cerr << "BP: " << psum->nucs[j].bp << "\t"
- // << "A: " << std::setw(3) << psum->nucs[j].nuc[0] << " "
- // << "C: " << std::setw(3) << psum->nucs[j].nuc[1] << " "
- // << "G: " << std::setw(3) << psum->nucs[j].nuc[2] << " "
- // << "T: " << std::setw(3) << psum->nucs[j].nuc[3] << "\n";
- // }
-
- //
- // Calculate D'
- //
- cerr << "Calculating D'...";
- calc_dprime(psum);
- cerr << "done.\n";
-
- write_dprime(in_path + files[i].second, psum);
-
- //
- // Generate haplotype blocks based on D'.
- //
- dprime_blocks(in_path + files[i].second, pop_map, psum, dp_block_lens, dp_snp_cnts);
-
- //
- // Generate haplotype blocks using the four gamete test.
- //
- four_gamete_test(in_path + files[i].second, pop_map, psum, fgt_block_lens, fgt_snp_cnts);
-
- //
- // Bucket the D' measures by distance between SNPs.
- //
- bucket_dprime(dprime_buckets, dprime_bucket_cnts, psum);
-
- //
- // Free the Samples objects
- //
- delete psum;
+ // if (files[i].second != "batch_1.groupV.phase") continue;
+
+ PhasedSummary *psum = NULL;
+
+ if (in_file_type == FileT::fastphase) {
+ if ((psum = parse_fastphase(in_path + files[i].second)) == NULL) {
+ cerr << "Unable to parse fastPhase input files.\n";
+ exit(1);
+ }
+ } else if (in_file_type == FileT::beagle && haplotypes) {
+ if ((psum = parse_beagle_haplotypes(catalog, in_path + files[i].second)) == NULL) {
+ cerr << "Unable to parse Beagle input files.\n";
+ exit(1);
+ }
+ } else if (in_file_type == FileT::beagle) {
+ if ((psum = parse_beagle(catalog, in_path + files[i].second)) == NULL) {
+ cerr << "Unable to parse Beagle input files.\n";
+ exit(1);
+ }
+ }
+
+ //
+ // Summarize the genotypes in the populations.
+ //
+ summarize_phased_genotypes(psum);
+
+ // for (uint j = 0; j < psum->size; j++) {
+ // cerr << "BP: " << psum->nucs[j].bp << "\t"
+ // << "A: " << std::setw(3) << psum->nucs[j].nuc[0] << " "
+ // << "C: " << std::setw(3) << psum->nucs[j].nuc[1] << " "
+ // << "G: " << std::setw(3) << psum->nucs[j].nuc[2] << " "
+ // << "T: " << std::setw(3) << psum->nucs[j].nuc[3] << "\n";
+ // }
+
+ //
+ // Calculate D'
+ //
+ cerr << "Calculating D'...";
+ calc_dprime(psum);
+ cerr << "done.\n";
+
+ write_dprime(in_path + files[i].second, psum);
+
+ //
+ // Generate haplotype blocks based on D'.
+ //
+ dprime_blocks(in_path + files[i].second, pop_map, psum, dp_block_lens, dp_snp_cnts);
+
+ //
+ // Generate haplotype blocks using the four gamete test.
+ //
+ four_gamete_test(in_path + files[i].second, pop_map, psum, fgt_block_lens, fgt_snp_cnts);
+
+ //
+ // Bucket the D' measures by distance between SNPs.
+ //
+ bucket_dprime(dprime_buckets, dprime_bucket_cnts, psum);
+
+ //
+ // Free the Samples objects
+ //
+ delete psum;
}
//
@@ -220,31 +220,31 @@ int main (int argc, char* argv[]) {
log_fh << "# Distribution of FGT haplotype block lengths.\n";
map<int, int>::iterator buck_it;
for (buck_it = fgt_block_lens.begin(); buck_it != fgt_block_lens.end(); buck_it++)
- log_fh << buck_it->first << "\t" << buck_it->second << "\n";
+ log_fh << buck_it->first << "\t" << buck_it->second << "\n";
//
// Write the FGT bucketed SNP counts.
//
log_fh << "\n\n"
- << "# Distribution of FGT SNP counts per haplotype block.\n";
+ << "# Distribution of FGT SNP counts per haplotype block.\n";
for (buck_it = fgt_snp_cnts.begin(); buck_it != fgt_snp_cnts.end(); buck_it++)
- log_fh << buck_it->first << "\t" << buck_it->second << "\n";
+ log_fh << buck_it->first << "\t" << buck_it->second << "\n";
//
// Write the D' haplotype block bucketed distances.
//
- log_fh << "\n\n"
- << "# Distribution of D' haplotype block lengths.\n";
+ log_fh << "\n\n"
+ << "# Distribution of D' haplotype block lengths.\n";
for (buck_it = dp_block_lens.begin(); buck_it != dp_block_lens.end(); buck_it++)
- log_fh << buck_it->first << "\t" << buck_it->second << "\n";
+ log_fh << buck_it->first << "\t" << buck_it->second << "\n";
//
// Write the D' bucketed SNP counts.
//
log_fh << "\n\n"
- << "# Distribution of D' SNP counts per haplotype block.\n";
+ << "# Distribution of D' SNP counts per haplotype block.\n";
for (buck_it = dp_snp_cnts.begin(); buck_it != dp_snp_cnts.end(); buck_it++)
- log_fh << buck_it->first << "\t" << buck_it->second << "\n";
+ log_fh << buck_it->first << "\t" << buck_it->second << "\n";
log_fh.close();
@@ -262,46 +262,46 @@ bucket_dprime(vector<double> &dprime_buckets, vector<double> &dprime_bucket_cnts
// SNPs on this chromosome and add buckets as necessary.
//
for (uint i = 0; i < psum->size; i++) {
- for (uint j = i+1; j < psum->size; j++) {
+ for (uint j = i+1; j < psum->size; j++) {
- if (psum->nucs[i].freq < minor_freq_lim ||
- psum->nucs[j].freq < minor_freq_lim)
- continue;
+ if (psum->nucs[i].freq < minor_freq_lim ||
+ psum->nucs[j].freq < minor_freq_lim)
+ continue;
- if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
- continue;
+ if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
+ continue;
- dist = psum->nucs[j].bp - psum->nucs[i].bp;
- max_dist = dist > max_dist ? dist : max_dist;
- }
+ dist = psum->nucs[j].bp - psum->nucs[i].bp;
+ max_dist = dist > max_dist ? dist : max_dist;
+ }
}
max_bucket = max_dist / bucket_dist;
if (dprime_buckets.size() < max_bucket) {
- uint cnt = max_bucket + 1 - dprime_buckets.size();
- for (uint i = 0; i < cnt; i++) {
- dprime_buckets.push_back(0.0);
- dprime_bucket_cnts.push_back(0.0);
- }
+ uint cnt = max_bucket + 1 - dprime_buckets.size();
+ for (uint i = 0; i < cnt; i++) {
+ dprime_buckets.push_back(0.0);
+ dprime_bucket_cnts.push_back(0.0);
+ }
}
//
// Populate buckets
//
for (uint i = 0; i < psum->size; i++) {
- for (uint j = i+1; j < psum->size; j++) {
+ for (uint j = i+1; j < psum->size; j++) {
+
+ if (psum->nucs[i].freq < minor_freq_lim ||
+ psum->nucs[j].freq < minor_freq_lim)
+ continue;
- if (psum->nucs[i].freq < minor_freq_lim ||
- psum->nucs[j].freq < minor_freq_lim)
- continue;
+ if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
+ continue;
- if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
- continue;
+ bucket = ((psum->nucs[j].bp - psum->nucs[i].bp) / bucket_dist);
- bucket = ((psum->nucs[j].bp - psum->nucs[i].bp) / bucket_dist);
-
- dprime_buckets[bucket] += (psum->dprime[i][j].chisq_p ? psum->dprime[i][j].dprime : 0.0);
- dprime_bucket_cnts[bucket]++;
- }
+ dprime_buckets[bucket] += (psum->dprime[i][j].chisq_p ? psum->dprime[i][j].dprime : 0.0);
+ dprime_bucket_cnts[bucket]++;
+ }
}
return 0;
@@ -313,7 +313,7 @@ write_buckets(string path, vector<double> &dprime_buckets, vector<double> &dprim
//
// Write the bucketed D' data for plotting.
//
- stringstream file;
+ stringstream file;
file << path << "Dprime_dist_buckets" << bucket_dist/1000 << "kb.tsv";
cerr << "Writing bucketed D' data to '" << file.str() << "'...";
@@ -322,14 +322,14 @@ write_buckets(string path, vector<double> &dprime_buckets, vector<double> &dprim
if (fh.fail()) {
cerr << "Error opening D' file '" << file.str() << "'\n";
- exit(1);
+ exit(1);
}
fh << "# Distance (Kb)\tD' Average\n";
for (uint i = 0; i < dprime_buckets.size(); i++)
- fh << (i * bucket_dist) << "\t"
- << std::setprecision(3) << (dprime_buckets[i] / dprime_bucket_cnts[i]) << "\n";
+ fh << (i * bucket_dist) << "\t"
+ << std::setprecision(3) << (dprime_buckets[i] / dprime_bucket_cnts[i]) << "\n";
fh.close();
@@ -353,7 +353,7 @@ four_gamete_test(string path, map<string, int> &pop_map, PhasedSummary *psum, ma
if (fh.fail()) {
cerr << "Error opening FGT file '" << file << "'\n";
- exit(1);
+ exit(1);
}
fh << "# ID\tStart\tEnd\tLen\tSNP Count\tHaplotype Count\tHaplotype\tPopulations\tHapPopCnt\n";
@@ -364,61 +364,61 @@ four_gamete_test(string path, map<string, int> &pop_map, PhasedSummary *psum, ma
map<int, int> buckets, snps;
for (uint i = 0; i < psum->size; i++) {
- if (psum->nucs[i].freq < minor_freq_lim)
- continue;
-
- //
- // Start a new block.
- //
- start = i;
- bound = false;
- cnt = 0;
- uint j = i;
-
- do {
- if (psum->nucs[j].freq < minor_freq_lim) {
- j++;
- continue;
- }
-
- for (int k = j; k >= (int) start; k--) {
-
- if (psum->nucs[k].freq < minor_freq_lim)
- continue;
-
- if (psum->recomb[k][j] == true) {
- bound = true;
- end = j;
- }
- }
-
- j++;
- cnt++;
- } while (bound == false && j < psum->size);
-
- if (j == psum->size)
- end = j - 1;
-
- fh << id << "\t"
- << psum->nucs[start].bp << "\t"
- << psum->nucs[end].bp << "\t"
- << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
- << cnt << "\t";
- //
- // Bucket the SNP counts for plotting.
- //
- snps[cnt]++;
-
- //
- // Bucket the haplotype block lengths for plotting.
- //
- dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
- buckets[dist]++;
-
- enumerate_haplotypes(fh, pop_map, psum, start, end);
-
- i = end;
- id++;
+ if (psum->nucs[i].freq < minor_freq_lim)
+ continue;
+
+ //
+ // Start a new block.
+ //
+ start = i;
+ bound = false;
+ cnt = 0;
+ uint j = i;
+
+ do {
+ if (psum->nucs[j].freq < minor_freq_lim) {
+ j++;
+ continue;
+ }
+
+ for (int k = j; k >= (int) start; k--) {
+
+ if (psum->nucs[k].freq < minor_freq_lim)
+ continue;
+
+ if (psum->recomb[k][j] == true) {
+ bound = true;
+ end = j;
+ }
+ }
+
+ j++;
+ cnt++;
+ } while (bound == false && j < psum->size);
+
+ if (j == psum->size)
+ end = j - 1;
+
+ fh << id << "\t"
+ << psum->nucs[start].bp << "\t"
+ << psum->nucs[end].bp << "\t"
+ << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
+ << cnt << "\t";
+ //
+ // Bucket the SNP counts for plotting.
+ //
+ snps[cnt]++;
+
+ //
+ // Bucket the haplotype block lengths for plotting.
+ //
+ dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
+ buckets[dist]++;
+
+ enumerate_haplotypes(fh, pop_map, psum, start, end);
+
+ i = end;
+ id++;
}
//
@@ -428,9 +428,9 @@ four_gamete_test(string path, map<string, int> &pop_map, PhasedSummary *psum, ma
<< "# Distribution of FGT haplotype block lengths.\n";
map<int, int>::iterator it;
for (it = buckets.begin(); it != buckets.end(); it++) {
- fh << it->first << "\t" << it->second << "\n";
+ fh << it->first << "\t" << it->second << "\n";
- len_buckets[it->first] += it->second;
+ len_buckets[it->first] += it->second;
}
//
@@ -439,9 +439,9 @@ four_gamete_test(string path, map<string, int> &pop_map, PhasedSummary *psum, ma
fh << "\n\n"
<< "# Distribution of SNP counts per FGT haplotype block.\n";
for (it = snps.begin(); it != snps.end(); it++) {
- fh << it->first << "\t" << it->second << "\n";
+ fh << it->first << "\t" << it->second << "\n";
- snp_buckets[it->first] += it->second;
+ snp_buckets[it->first] += it->second;
}
fh.close();
@@ -466,7 +466,7 @@ dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<i
if (fh.fail()) {
cerr << "Error opening D' blocks file '" << file << "'\n";
- exit(1);
+ exit(1);
}
fh << "# ID\tStart\tEnd\tLen\tSNP Count\tHaplotype Count\tHaplotype\tPopulations\tHapPopCnt\n";
@@ -481,53 +481,53 @@ dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<i
uint recomb_pairs = 0;
for (uint i = 0; i < psum->size; i++) {
- if (psum->nucs[i].freq < minor_freq_lim)
- continue;
-
- for (uint j = i+1; j < psum->size; j++) {
- if (psum->nucs[j].freq < minor_freq_lim)
- continue;
-
- tot_pairs++;
- dist = psum->nucs[j].bp - psum->nucs[i].bp + 1;
-
- //
- // Does this pair of markers show a strong measure of LD?
- //
- if (psum->dprime[i][j].ci_high > 0.98 &&
- psum->dprime[i][j].ci_low > 0.7 &&
- dist <= max_pair_dist) {
- psum->dprime[i][j].type = strong_ld;
-
- ld_pairs.push_back(make_pair(i, j));
- ld_map[i].push_back(j);
-
- loci.insert(i);
- loci.insert(j);
- }
-
- //
- // Does this pair of markers show a strong measure of historical recombination?
- //
- if (psum->dprime[i][j].ci_high < 0.9) {
- psum->dprime[i][j].type = recomb;
- recomb_pairs++;
- }
- }
+ if (psum->nucs[i].freq < minor_freq_lim)
+ continue;
+
+ for (uint j = i+1; j < psum->size; j++) {
+ if (psum->nucs[j].freq < minor_freq_lim)
+ continue;
+
+ tot_pairs++;
+ dist = psum->nucs[j].bp - psum->nucs[i].bp + 1;
+
+ //
+ // Does this pair of markers show a strong measure of LD?
+ //
+ if (psum->dprime[i][j].ci_high > 0.98 &&
+ psum->dprime[i][j].ci_low > 0.7 &&
+ dist <= max_pair_dist) {
+ psum->dprime[i][j].type = strong_ld;
+
+ ld_pairs.push_back(make_pair(i, j));
+ ld_map[i].push_back(j);
+
+ loci.insert(i);
+ loci.insert(j);
+ }
+
+ //
+ // Does this pair of markers show a strong measure of historical recombination?
+ //
+ if (psum->dprime[i][j].ci_high < 0.9) {
+ psum->dprime[i][j].type = recomb;
+ recomb_pairs++;
+ }
+ }
}
// map<int, vector<int> >::iterator it;
// for (it = ld_map.begin(); it != ld_map.end(); it++) {
- // cerr << " " << it->first << " ->\n";
- // for (uint i = 0; i < it->second.size(); i++)
- // cerr << " " << it->second[i] << " dist: " << (psum->nucs[it->second[i]].bp - psum->nucs[it->first].bp + 1) << "bp\n";
+ // cerr << " " << it->first << " ->\n";
+ // for (uint i = 0; i < it->second.size(); i++)
+ // cerr << " " << it->second[i] << " dist: " << (psum->nucs[it->second[i]].bp - psum->nucs[it->first].bp + 1) << "bp\n";
// }
- cerr << " Total pairs examined: " << tot_pairs
- << "; Strong LD pairs: " << ld_pairs.size()
- << "; Recombination pairs: " << recomb_pairs
- << "; Informative markers: " << std::setprecision(3)
- << ((double) (ld_pairs.size() + recomb_pairs) / (double) tot_pairs) * 100 << "%\n";
+ cerr << " Total pairs examined: " << tot_pairs
+ << "; Strong LD pairs: " << ld_pairs.size()
+ << "; Recombination pairs: " << recomb_pairs
+ << "; Informative markers: " << std::setprecision(3)
+ << ((double) (ld_pairs.size() + recomb_pairs) / (double) tot_pairs) * 100 << "%\n";
//
// Convert our list of loci into an ordered, linked list, where each node
@@ -544,21 +544,21 @@ dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<i
cur = blocks.head();
do {
- //
- // Can we merge these two nodes together?
- //
- if (check_adjacent_blocks(psum, cur)) {
- // cerr << " Merging blocks: ";
- // for (uint i = 0; i < cur->loci.size(); i++)
- // cerr << cur->loci[i] << ", ";
- // cerr << " and ";
- // for (uint i = 0; i < cur->next->loci.size(); i++)
- // cerr << cur->next->loci[i] << ", ";
- // cerr << "\n";
- blocks.merge_adjacent(cur);
- } else {
- cur = cur->next;
- }
+ //
+ // Can we merge these two nodes together?
+ //
+ if (check_adjacent_blocks(psum, cur)) {
+ // cerr << " Merging blocks: ";
+ // for (uint i = 0; i < cur->loci.size(); i++)
+ // cerr << cur->loci[i] << ", ";
+ // cerr << " and ";
+ // for (uint i = 0; i < cur->next->loci.size(); i++)
+ // cerr << cur->next->loci[i] << ", ";
+ // cerr << "\n";
+ blocks.merge_adjacent(cur);
+ } else {
+ cur = cur->next;
+ }
} while (cur->next != NULL);
// blocks.print();
@@ -572,30 +572,30 @@ dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<i
cur = blocks.head();
do {
- start = *(cur->loci.begin());
- end = *(cur->loci.rbegin());
-
- fh << id << "\t"
- << psum->nucs[start].bp << "\t"
- << psum->nucs[end].bp << "\t"
- << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
- << cur->loci.size() << "\t";
-
- //
- // Bucket the SNP counts for plotting.
- //
- snps[cur->loci.size()]++;
-
- //
- // Bucket the haplotype block lengths for plotting.
- //
- dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
- buckets[dist]++;
-
- enumerate_haplotypes(fh, pop_map, psum, start, end);
-
- id++;
- cur = cur->next;
+ start = *(cur->loci.begin());
+ end = *(cur->loci.rbegin());
+
+ fh << id << "\t"
+ << psum->nucs[start].bp << "\t"
+ << psum->nucs[end].bp << "\t"
+ << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
+ << cur->loci.size() << "\t";
+
+ //
+ // Bucket the SNP counts for plotting.
+ //
+ snps[cur->loci.size()]++;
+
+ //
+ // Bucket the haplotype block lengths for plotting.
+ //
+ dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
+ buckets[dist]++;
+
+ enumerate_haplotypes(fh, pop_map, psum, start, end);
+
+ id++;
+ cur = cur->next;
} while (cur != NULL);
//
@@ -605,9 +605,9 @@ dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<i
<< "# Distribution of D' haplotype block lengths.\n";
map<int, int>::iterator it;
for (it = buckets.begin(); it != buckets.end(); it++) {
- fh << it->first << "\t" << it->second << "\n";
+ fh << it->first << "\t" << it->second << "\n";
- len_buckets[it->first] += it->second;
+ len_buckets[it->first] += it->second;
}
//
@@ -616,9 +616,9 @@ dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<i
fh << "\n\n"
<< "# Distribution of SNP counts per D' haplotype block.\n";
for (it = snps.begin(); it != snps.end(); it++) {
- fh << it->first << "\t" << it->second << "\n";
+ fh << it->first << "\t" << it->second << "\n";
- snp_buckets[it->first] += it->second;
+ snp_buckets[it->first] += it->second;
}
fh.close();
@@ -644,26 +644,26 @@ check_adjacent_blocks(PhasedSummary *psum, HBlock *block)
double strong_ld = 0.0;
for (uint i = start; i <= end; i++) {
- if (psum->nucs[i].freq < minor_freq_lim)
- continue;
-
- for (uint j = i + 1; j <= end; j++) {
- if (psum->dprime[i][j].type == uninformative ||
- psum->nucs[j].freq < minor_freq_lim)
- continue;
-
- tot++;
- if (psum->dprime[i][j].type == strong_ld)
- strong_ld++;
- }
+ if (psum->nucs[i].freq < minor_freq_lim)
+ continue;
+
+ for (uint j = i + 1; j <= end; j++) {
+ if (psum->dprime[i][j].type == uninformative ||
+ psum->nucs[j].freq < minor_freq_lim)
+ continue;
+
+ tot++;
+ if (psum->dprime[i][j].type == strong_ld)
+ strong_ld++;
+ }
}
- // cerr << "Comparing range " << start << " to " << end
- // << "; total pairs: " << tot << "; strong LD: " << strong_ld
- // << "; proportion: " << std::setprecision(3) << strong_ld / tot << "\n";
+ // cerr << "Comparing range " << start << " to " << end
+ // << "; total pairs: " << tot << "; strong LD: " << strong_ld
+ // << "; proportion: " << std::setprecision(3) << strong_ld / tot << "\n";
if (strong_ld / tot >= min_inform_pairs)
- return true;
+ return true;
return false;
}
@@ -677,7 +677,7 @@ dPrimeBlocks::merge_adjacent(HBlock *a)
HBlock *b = a->next;
for (uint i = 0; i < b->loci.size(); i++)
- a->loci.push_back(b->loci[i]);
+ a->loci.push_back(b->loci[i]);
a->next = b->next;
delete b;
return a;
@@ -698,9 +698,9 @@ dPrimeBlocks::initialize(set<int> &loci)
// // Create a node from each locus and add to it all immediately adjacent loci.
// //
// do {
- // this->_head->loci.push_back(*it);
- // prev_it = it;
- // it++;
+ // this->_head->loci.push_back(*it);
+ // prev_it = it;
+ // it++;
// } while (it != loci.end() && (*prev_it) + 1 == *it);
next = this->_head;
@@ -708,19 +708,19 @@ dPrimeBlocks::initialize(set<int> &loci)
// if (it == loci.end()) return this->_head;
do {
- cur = new HBlock;
- cur->loci.push_back(*it);
- it++;
+ cur = new HBlock;
+ cur->loci.push_back(*it);
+ it++;
- // do {
- // cur->loci.push_back(*it);
- // prev_it = it;
- // it++;
- // } while (it != loci.end() &&
- // (*prev_it) + 1 == *it);
+ // do {
+ // cur->loci.push_back(*it);
+ // prev_it = it;
+ // it++;
+ // } while (it != loci.end() &&
+ // (*prev_it) + 1 == *it);
- next->next = cur;
- next = next->next;
+ next->next = cur;
+ next = next->next;
} while (it != loci.end());
@@ -732,14 +732,14 @@ dPrimeBlocks::print()
{
HBlock *cur = this->_head;
while (cur != NULL) {
- for (uint i = 0; i < cur->loci.size(); i++) {
- if (i > 0)
- cerr << ", ";
- cerr << cur->loci[i];
- }
- cerr << "\n";
-
- cur = cur->next;
+ for (uint i = 0; i < cur->loci.size(); i++) {
+ if (i > 0)
+ cerr << ", ";
+ cerr << cur->loci[i];
+ }
+ cerr << "\n";
+
+ cur = cur->next;
}
return 0;
@@ -759,32 +759,32 @@ enumerate_haplotypes(ofstream &fh, map<string, int> &pop_map, PhasedSummary *psu
//
for (uint k = 0; k < psum->sample_cnt; k++) {
- for (uint n = start; n <= end; n++)
- if (psum->nucs[n].freq >= minor_freq_lim)
- haplotype += psum->samples[k].nucs_1[n];
+ for (uint n = start; n <= end; n++)
+ if (psum->nucs[n].freq >= minor_freq_lim)
+ haplotype += psum->samples[k].nucs_1[n];
- pops.insert(pop_map[psum->samples[k].name]);
+ pops.insert(pop_map[psum->samples[k].name]);
- if (haplotypes.count(haplotype) == 0)
- haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
- else
- haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
- haplotype.clear();
+ if (haplotypes.count(haplotype) == 0)
+ haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
+ else
+ haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
+ haplotype.clear();
}
for (uint k = 0; k < psum->sample_cnt; k++) {
- for (uint n = start; n <= end; n++)
- if (psum->nucs[n].freq >= minor_freq_lim)
- haplotype += psum->samples[k].nucs_2[n];
+ for (uint n = start; n <= end; n++)
+ if (psum->nucs[n].freq >= minor_freq_lim)
+ haplotype += psum->samples[k].nucs_2[n];
- pops.insert(pop_map[psum->samples[k].name]);
+ pops.insert(pop_map[psum->samples[k].name]);
- if (haplotypes.count(haplotype) == 0)
- haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
- else
- haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
- haplotype.clear();
+ if (haplotypes.count(haplotype) == 0)
+ haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
+ else
+ haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
+ haplotype.clear();
}
//
@@ -794,16 +794,16 @@ enumerate_haplotypes(ofstream &fh, map<string, int> &pop_map, PhasedSummary *psu
fh << haplotypes.size() << "\t";
for (it = haplotypes.begin(); it != haplotypes.end(); it++) {
- //
- // Haplotypes are stored per population; sum them up here.
- //
- for (sit = it->second.begin(); sit != it->second.end(); sit++)
- tot += sit->second;
-
- if (it != haplotypes.begin())
- fh << ",";
- fh << it->first << "|"
- << std::setprecision(3) << tot / ((float) psum->sample_cnt * 2.0);
+ //
+ // Haplotypes are stored per population; sum them up here.
+ //
+ for (sit = it->second.begin(); sit != it->second.end(); sit++)
+ tot += sit->second;
+
+ if (it != haplotypes.begin())
+ fh << ",";
+ fh << it->first << "|"
+ << std::setprecision(3) << tot / ((float) psum->sample_cnt * 2.0);
}
fh << "\t";
@@ -814,10 +814,10 @@ enumerate_haplotypes(ofstream &fh, map<string, int> &pop_map, PhasedSummary *psu
// Write which populations this haplotype block occurs in.
//
if (pops.size() == 0)
- fh << "-1\t";
+ fh << "-1\t";
else
- for (pit = pops.begin(); pit != pops.end(); pit++)
- pops_str << *pit << ",";
+ for (pit = pops.begin(); pit != pops.end(); pit++)
+ pops_str << *pit << ",";
fh << pops_str.str().substr(0, pops_str.str().length()-1);
pops_str.str("");
@@ -825,13 +825,13 @@ enumerate_haplotypes(ofstream &fh, map<string, int> &pop_map, PhasedSummary *psu
// Write the frequency of occurence of each haplotype in each population.
//
for (it = haplotypes.begin(); it != haplotypes.end(); it++) {
- pops_str << "\t";
- for (pit = pops.begin(); pit != pops.end(); pit++)
- pops_str << (it->second)[*pit] << "|"
- << std::setprecision(3)
- << (float) (it->second)[*pit] / (float) (pop_cnts[*pit] * 2.0) << ",";
- fh << pops_str.str().substr(0, pops_str.str().length()-1);
- pops_str.str("");
+ pops_str << "\t";
+ for (pit = pops.begin(); pit != pops.end(); pit++)
+ pops_str << (it->second)[*pit] << "|"
+ << std::setprecision(3)
+ << (float) (it->second)[*pit] / (float) (pop_cnts[*pit] * 2.0) << ",";
+ fh << pops_str.str().substr(0, pops_str.str().length()-1);
+ pops_str.str("");
}
fh << "\n";
@@ -843,127 +843,127 @@ calc_dprime(PhasedSummary *psum)
{
#pragma omp parallel
{
- char allele_A, allele_a, allele_B, allele_b;
- double freq_A, freq_a, freq_B, freq_b;
- double freq_AB, freq_Ab, freq_aB, freq_ab;
- double D, min, var, chisq;
- double tot = psum->sample_cnt * 2.0;
- uint hap_cnt;
-
- #pragma omp for schedule(dynamic, 1)
- for (uint i = 0; i < psum->size; i++) {
- //
- // Assign nucleotides to allele A, and a.
- //
- assign_alleles(psum->nucs[i], allele_A, allele_a, freq_A, freq_a);
-
- for (uint j = i+1; j < psum->size; j++) {
- //
- // Assign nucleotides to allele B, and b.
- //
- assign_alleles(psum->nucs[j], allele_B, allele_b, freq_B, freq_b);
-
- freq_AB = 0.0;
- freq_Ab = 0.0;
- freq_aB = 0.0;
- freq_ab = 0.0;
- hap_cnt = 0;
- D = 0.0;
-
- //
- // Tally up haplotype frequencies.
- //
- for (uint k = 0; k < psum->sample_cnt; k++) {
-
- if (psum->samples[k].nucs_1[i] == allele_A &&
- psum->samples[k].nucs_1[j] == allele_B)
- freq_AB++;
- else if (psum->samples[k].nucs_1[i] == allele_A &&
- psum->samples[k].nucs_1[j] == allele_b)
- freq_Ab++;
- else if (psum->samples[k].nucs_1[i] == allele_a &&
- psum->samples[k].nucs_1[j] == allele_B)
- freq_aB++;
- else if (psum->samples[k].nucs_1[i] == allele_a &&
- psum->samples[k].nucs_1[j] == allele_b)
- freq_ab++;
-
- if (psum->samples[k].nucs_2[i] == allele_A &&
- psum->samples[k].nucs_2[j] == allele_B)
- freq_AB++;
- else if (psum->samples[k].nucs_2[i] == allele_A &&
- psum->samples[k].nucs_2[j] == allele_b)
- freq_Ab++;
- else if (psum->samples[k].nucs_2[i] == allele_a &&
- psum->samples[k].nucs_2[j] == allele_B)
- freq_aB++;
- else if (psum->samples[k].nucs_2[i] == allele_a &&
- psum->samples[k].nucs_2[j] == allele_b)
- freq_ab++;
- }
-
- freq_AB = freq_AB / tot;
- freq_Ab = freq_Ab / tot;
- freq_aB = freq_aB / tot;
- freq_ab = freq_ab / tot;
-
- //
- // Using the four-gamete test, check whether recombination has occurred
- // between these two loci.
- // Four-gamete test: if no recombination has occurred between any two loci (SNPs) there will
- // be three haplotypes present, if recombination has occurred there will be four haplotypes.
- //
- hap_cnt += freq_AB > 0 ? 1 : 0;
- hap_cnt += freq_Ab > 0 ? 1 : 0;
- hap_cnt += freq_aB > 0 ? 1 : 0;
- hap_cnt += freq_ab > 0 ? 1 : 0;
-
- if (hap_cnt == 3)
- psum->recomb[i][j] = false;
- else
- psum->recomb[i][j] = true;
-
-
- D = freq_AB - (freq_A * freq_B);
- // cerr << "D_AB: " << D << "; ";
- // D = freq_Ab - (freq_A * freq_b);
- // cerr << "D_Ab: " << D << "; ";
- // D = freq_aB - (freq_a * freq_B);
- // cerr << "D_aB: " << D << "; ";
- // D = freq_ab - (freq_a * freq_b);
- // cerr << "D_ab: " << D << "\n";
- // cerr << " freq_AB: " << freq_AB << "; freq_Ab: " << freq_Ab << "; freq_aB: " << freq_aB << "; freq_ab: " << freq_ab << "\n";
-
- if (D > 0) {
- min = (freq_A * freq_b) < (freq_a * freq_B) ? (freq_A * freq_b) : (freq_a * freq_B);
- psum->dprime[i][j].dprime = min == 0 ? 0.0 : D / min;
- } else {
- min = (freq_A * freq_B) < (freq_a * freq_b) ? (freq_A * freq_B) : (freq_a * freq_b);
- psum->dprime[i][j].dprime = min == 0 ? 0.0 :(-1 * D) / min;
- }
-
- //
- // Test D against a chi square distribution with 1 degree of freedom to show
- // whether these two loci have a D that is statistically significantly different from 0.
- //
- chisq = (tot * (D * D)) / (freq_A * freq_a * freq_B * freq_b);
- if (chisq >= chi_sq_limit)
- psum->dprime[i][j].chisq_p = true;
-
- //
- // Calculate variance and confidence limits.
- //
- if (psum->dprime[i][j].chisq_p) {
- var = (1.0 / tot) * ((freq_A * freq_a * freq_B * freq_b) + ((1 - (2 * freq_A)) * (1 - (2 * freq_B)) * D) - (D * D));
- psum->dprime[i][j].var = var;
- psum->dprime[i][j].ci_high = psum->dprime[i][j].dprime + (1.96 * sqrt(var));
- psum->dprime[i][j].ci_low = psum->dprime[i][j].dprime - (1.96 * sqrt(var));
- } else {
- psum->dprime[i][j].ci_high = 0.0;
- psum->dprime[i][j].ci_low = 0.0;
- }
- }
- }
+ char allele_A, allele_a, allele_B, allele_b;
+ double freq_A, freq_a, freq_B, freq_b;
+ double freq_AB, freq_Ab, freq_aB, freq_ab;
+ double D, min, var, chisq;
+ double tot = psum->sample_cnt * 2.0;
+ uint hap_cnt;
+
+ #pragma omp for schedule(dynamic, 1)
+ for (uint i = 0; i < psum->size; i++) {
+ //
+ // Assign nucleotides to allele A, and a.
+ //
+ assign_alleles(psum->nucs[i], allele_A, allele_a, freq_A, freq_a);
+
+ for (uint j = i+1; j < psum->size; j++) {
+ //
+ // Assign nucleotides to allele B, and b.
+ //
+ assign_alleles(psum->nucs[j], allele_B, allele_b, freq_B, freq_b);
+
+ freq_AB = 0.0;
+ freq_Ab = 0.0;
+ freq_aB = 0.0;
+ freq_ab = 0.0;
+ hap_cnt = 0;
+ D = 0.0;
+
+ //
+ // Tally up haplotype frequencies.
+ //
+ for (uint k = 0; k < psum->sample_cnt; k++) {
+
+ if (psum->samples[k].nucs_1[i] == allele_A &&
+ psum->samples[k].nucs_1[j] == allele_B)
+ freq_AB++;
+ else if (psum->samples[k].nucs_1[i] == allele_A &&
+ psum->samples[k].nucs_1[j] == allele_b)
+ freq_Ab++;
+ else if (psum->samples[k].nucs_1[i] == allele_a &&
+ psum->samples[k].nucs_1[j] == allele_B)
+ freq_aB++;
+ else if (psum->samples[k].nucs_1[i] == allele_a &&
+ psum->samples[k].nucs_1[j] == allele_b)
+ freq_ab++;
+
+ if (psum->samples[k].nucs_2[i] == allele_A &&
+ psum->samples[k].nucs_2[j] == allele_B)
+ freq_AB++;
+ else if (psum->samples[k].nucs_2[i] == allele_A &&
+ psum->samples[k].nucs_2[j] == allele_b)
+ freq_Ab++;
+ else if (psum->samples[k].nucs_2[i] == allele_a &&
+ psum->samples[k].nucs_2[j] == allele_B)
+ freq_aB++;
+ else if (psum->samples[k].nucs_2[i] == allele_a &&
+ psum->samples[k].nucs_2[j] == allele_b)
+ freq_ab++;
+ }
+
+ freq_AB = freq_AB / tot;
+ freq_Ab = freq_Ab / tot;
+ freq_aB = freq_aB / tot;
+ freq_ab = freq_ab / tot;
+
+ //
+ // Using the four-gamete test, check whether recombination has occurred
+ // between these two loci.
+ // Four-gamete test: if no recombination has occurred between any two loci (SNPs) there will
+ // be three haplotypes present, if recombination has occurred there will be four haplotypes.
+ //
+ hap_cnt += freq_AB > 0 ? 1 : 0;
+ hap_cnt += freq_Ab > 0 ? 1 : 0;
+ hap_cnt += freq_aB > 0 ? 1 : 0;
+ hap_cnt += freq_ab > 0 ? 1 : 0;
+
+ if (hap_cnt == 3)
+ psum->recomb[i][j] = false;
+ else
+ psum->recomb[i][j] = true;
+
+
+ D = freq_AB - (freq_A * freq_B);
+ // cerr << "D_AB: " << D << "; ";
+ // D = freq_Ab - (freq_A * freq_b);
+ // cerr << "D_Ab: " << D << "; ";
+ // D = freq_aB - (freq_a * freq_B);
+ // cerr << "D_aB: " << D << "; ";
+ // D = freq_ab - (freq_a * freq_b);
+ // cerr << "D_ab: " << D << "\n";
+ // cerr << " freq_AB: " << freq_AB << "; freq_Ab: " << freq_Ab << "; freq_aB: " << freq_aB << "; freq_ab: " << freq_ab << "\n";
+
+ if (D > 0) {
+ min = (freq_A * freq_b) < (freq_a * freq_B) ? (freq_A * freq_b) : (freq_a * freq_B);
+ psum->dprime[i][j].dprime = min == 0 ? 0.0 : D / min;
+ } else {
+ min = (freq_A * freq_B) < (freq_a * freq_b) ? (freq_A * freq_B) : (freq_a * freq_b);
+ psum->dprime[i][j].dprime = min == 0 ? 0.0 :(-1 * D) / min;
+ }
+
+ //
+ // Test D against a chi square distribution with 1 degree of freedom to show
+ // whether these two loci have a D that is statistically significantly different from 0.
+ //
+ chisq = (tot * (D * D)) / (freq_A * freq_a * freq_B * freq_b);
+ if (chisq >= chi_sq_limit)
+ psum->dprime[i][j].chisq_p = true;
+
+ //
+ // Calculate variance and confidence limits.
+ //
+ if (psum->dprime[i][j].chisq_p) {
+ var = (1.0 / tot) * ((freq_A * freq_a * freq_B * freq_b) + ((1 - (2 * freq_A)) * (1 - (2 * freq_B)) * D) - (D * D));
+ psum->dprime[i][j].var = var;
+ psum->dprime[i][j].ci_high = psum->dprime[i][j].dprime + (1.96 * sqrt(var));
+ psum->dprime[i][j].ci_low = psum->dprime[i][j].dprime - (1.96 * sqrt(var));
+ } else {
+ psum->dprime[i][j].ci_high = 0.0;
+ psum->dprime[i][j].ci_low = 0.0;
+ }
+ }
+ }
}
return 0;
@@ -979,48 +979,48 @@ assign_alleles(NucSum nsum, char &p_allele, char &q_allele, double &p_freq, doub
float tot = 0;
while (p_allele == 0 && i < 4) {
- if (nsum.nuc[i] > 0) {
- tot += nsum.nuc[i];
- switch(i) {
- case 0:
- p_allele = 'A';
- p_freq = nsum.nuc[0];
- break;
- case 1:
- p_allele = 'C';
- p_freq = nsum.nuc[1];
- break;
- case 2:
- p_allele = 'G';
- p_freq = nsum.nuc[2];
- break;
- case 3:
- p_allele = 'T';
- p_freq = nsum.nuc[3];
- break;
- }
- }
- i++;
+ if (nsum.nuc[i] > 0) {
+ tot += nsum.nuc[i];
+ switch(i) {
+ case 0:
+ p_allele = 'A';
+ p_freq = nsum.nuc[0];
+ break;
+ case 1:
+ p_allele = 'C';
+ p_freq = nsum.nuc[1];
+ break;
+ case 2:
+ p_allele = 'G';
+ p_freq = nsum.nuc[2];
+ break;
+ case 3:
+ p_allele = 'T';
+ p_freq = nsum.nuc[3];
+ break;
+ }
+ }
+ i++;
}
while (q_allele == 0 && i < 4) {
- if (nsum.nuc[i] > 0) {
- tot += nsum.nuc[i];
- switch(i) {
- case 1:
- q_allele = 'C';
- q_freq = nsum.nuc[1];
- break;
- case 2:
- q_allele = 'G';
- q_freq = nsum.nuc[2];
- break;
- case 3:
- q_allele = 'T';
- q_freq = nsum.nuc[3];
- break;
- }
- }
- i++;
+ if (nsum.nuc[i] > 0) {
+ tot += nsum.nuc[i];
+ switch(i) {
+ case 1:
+ q_allele = 'C';
+ q_freq = nsum.nuc[1];
+ break;
+ case 2:
+ q_allele = 'G';
+ q_freq = nsum.nuc[2];
+ break;
+ case 3:
+ q_allele = 'T';
+ q_freq = nsum.nuc[3];
+ break;
+ }
+ }
+ i++;
}
p_freq = p_freq / tot;
@@ -1043,7 +1043,7 @@ write_dprime(string path, PhasedSummary *psum)
if (fh.fail()) {
cerr << "Error opening D' file '" << file << "'\n";
- exit(1);
+ exit(1);
}
fh << "# Basepair 1\tBasepair 2\tD'\tCorrected D'\tVariance\tCI Low\tCI High\n";
@@ -1051,28 +1051,28 @@ write_dprime(string path, PhasedSummary *psum)
double dprime = 0.0;
for (uint i = 0; i < psum->size; i++) {
- for (uint j = i+1; j < psum->size; j++) {
+ for (uint j = i+1; j < psum->size; j++) {
- if (psum->nucs[i].freq < minor_freq_lim ||
- psum->nucs[j].freq < minor_freq_lim)
- continue;
+ if (psum->nucs[i].freq < minor_freq_lim ||
+ psum->nucs[j].freq < minor_freq_lim)
+ continue;
- dprime = psum->dprime[i][j].dprime;
+ dprime = psum->dprime[i][j].dprime;
- if (dprime_threshold)
- dprime = dprime >= dprime_threshold_level ? 1.0 : 0.0;
+ if (dprime_threshold)
+ dprime = dprime >= dprime_threshold_level ? 1.0 : 0.0;
- if (write_zeros == false && (dprime == 0.0 || psum->dprime[i][j].chisq_p == false))
- continue;
+ if (write_zeros == false && (dprime == 0.0 || psum->dprime[i][j].chisq_p == false))
+ continue;
- fh << psum->nucs[i].bp << "\t"
- << psum->nucs[j].bp << "\t"
- << std::setprecision(3) << dprime << "\t"
- << std::setprecision(3) << (psum->dprime[i][j].chisq_p ? dprime : 0.0) << "\t"
- << psum->dprime[i][j].var << "\t"
- << psum->dprime[i][j].ci_low << "\t"
- << psum->dprime[i][j].ci_high << "\n";
- }
+ fh << psum->nucs[i].bp << "\t"
+ << psum->nucs[j].bp << "\t"
+ << std::setprecision(3) << dprime << "\t"
+ << std::setprecision(3) << (psum->dprime[i][j].chisq_p ? dprime : 0.0) << "\t"
+ << psum->dprime[i][j].var << "\t"
+ << psum->dprime[i][j].ci_low << "\t"
+ << psum->dprime[i][j].ci_high << "\n";
+ }
}
fh.close();
@@ -1091,76 +1091,76 @@ summarize_phased_genotypes(PhasedSummary *psum)
char **gtypes = new char *[psum->sample_cnt];
for (uint i = 0; i < psum->sample_cnt; i++) {
- gtypes[i] = psum->samples[i].nucs_1;
+ gtypes[i] = psum->samples[i].nucs_1;
}
//
// Sum up the occurences of each nucleotide.
//
for (uint i = 0; i < psum->size; i++) {
- for (uint j = 0; j < psum->sample_cnt; j++) {
- switch(gtypes[j][i]) {
- case 'A':
- psum->nucs[i].nuc[0]++;
- break;
- case 'C':
- psum->nucs[i].nuc[1]++;
- break;
- case 'G':
- psum->nucs[i].nuc[2]++;
- break;
- case 'T':
- psum->nucs[i].nuc[3]++;
- break;
- case 'N':
- default:
- break;
- }
- }
+ for (uint j = 0; j < psum->sample_cnt; j++) {
+ switch(gtypes[j][i]) {
+ case 'A':
+ psum->nucs[i].nuc[0]++;
+ break;
+ case 'C':
+ psum->nucs[i].nuc[1]++;
+ break;
+ case 'G':
+ psum->nucs[i].nuc[2]++;
+ break;
+ case 'T':
+ psum->nucs[i].nuc[3]++;
+ break;
+ case 'N':
+ default:
+ break;
+ }
+ }
}
//
// Repeat for the second set of phased genotypes.
//
for (uint i = 0; i < psum->sample_cnt; i++) {
- gtypes[i] = psum->samples[i].nucs_2;
+ gtypes[i] = psum->samples[i].nucs_2;
}
//
// Sum up the occurences of each nucleotide.
//
for (uint i = 0; i < psum->size; i++) {
- for (uint j = 0; j < psum->sample_cnt; j++) {
- switch(gtypes[j][i]) {
- case 'A':
- psum->nucs[i].nuc[0]++;
- break;
- case 'C':
- psum->nucs[i].nuc[1]++;
- break;
- case 'G':
- psum->nucs[i].nuc[2]++;
- break;
- case 'T':
- psum->nucs[i].nuc[3]++;
- break;
- case 'N':
- default:
- break;
- }
- }
-
- //
- // Calculate minor allele frequency.
- //
- float tot = (float) psum->sample_cnt * 2.0;
- float freq = 0.0;
- for (uint j = 0; j < 4; j++) {
- if (psum->nucs[i].nuc[j] > 0) {
- freq = (float) psum->nucs[i].nuc[j] / tot;
- psum->nucs[i].freq = freq < psum->nucs[i].freq ? freq : psum->nucs[i].freq;
- }
- }
+ for (uint j = 0; j < psum->sample_cnt; j++) {
+ switch(gtypes[j][i]) {
+ case 'A':
+ psum->nucs[i].nuc[0]++;
+ break;
+ case 'C':
+ psum->nucs[i].nuc[1]++;
+ break;
+ case 'G':
+ psum->nucs[i].nuc[2]++;
+ break;
+ case 'T':
+ psum->nucs[i].nuc[3]++;
+ break;
+ case 'N':
+ default:
+ break;
+ }
+ }
+
+ //
+ // Calculate minor allele frequency.
+ //
+ float tot = (float) psum->sample_cnt * 2.0;
+ float freq = 0.0;
+ for (uint j = 0; j < 4; j++) {
+ if (psum->nucs[i].nuc[j] > 0) {
+ freq = (float) psum->nucs[i].nuc[j] / tot;
+ psum->nucs[i].freq = freq < psum->nucs[i].freq ? freq : psum->nucs[i].freq;
+ }
+ }
}
delete [] gtypes;
@@ -1169,10 +1169,10 @@ summarize_phased_genotypes(PhasedSummary *psum)
}
//
-// Code to parse fastPhase format.
+// Code to parse fastPhase format.
//
-PhasedSummary *
-parse_fastphase(string path)
+PhasedSummary *
+parse_fastphase(string path)
{
ifstream fh;
char line[max_len];
@@ -1193,7 +1193,7 @@ parse_fastphase(string path)
if (fh.fail()) {
cerr << "Error opening input file '" << path << "'\n";
- return NULL;
+ return NULL;
}
cerr << "Parsing " << filepath << "...\n";
@@ -1208,8 +1208,8 @@ parse_fastphase(string path)
num_samples = is_integer(line);
if (num_samples < 0) {
- cerr << "Unable to find the number of samples, should be the first line.\n";
- return NULL;
+ cerr << "Unable to find the number of samples, should be the first line.\n";
+ return NULL;
}
//
@@ -1219,8 +1219,8 @@ parse_fastphase(string path)
num_genotypes = is_integer(line);
if (num_genotypes < 0) {
- cerr << "Unable to find the number of genotypes, should be the second line.\n";
- return NULL;
+ cerr << "Unable to find the number of genotypes, should be the second line.\n";
+ return NULL;
}
PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
@@ -1230,9 +1230,9 @@ parse_fastphase(string path)
//
buf.clear();
do {
- fh.clear();
- fh.getline(line, max_len);
- buf += line;
+ fh.clear();
+ fh.getline(line, max_len);
+ buf += line;
} while (fh.fail() && !fh.bad() && !fh.eof());
i = 0;
@@ -1240,28 +1240,28 @@ parse_fastphase(string path)
end = p + buf.length();
if (*p != 'P') {
- cerr << "Unable to locate line of basepair positions, should be the third line.\n";
- delete psum;
- return NULL;
+ cerr << "Unable to locate line of basepair positions, should be the third line.\n";
+ delete psum;
+ return NULL;
}
for (p += 2, q = p; p < end; p++, q++) {
- while (*q != ' ' && q < end) {
- q++;
- }
- strncpy(bp, p, q - p);
- bp[q - p] = '\0';
- pos = is_integer(bp);
-
- if (pos < 0) {
- cerr << "Unable to parse base pair positions.\n";
- delete psum;
- return NULL;
- } else {
- psum->nucs[i].bp = (uint) pos;
- }
-
- i++;
- p = q;
+ while (*q != ' ' && q < end) {
+ q++;
+ }
+ strncpy(bp, p, q - p);
+ bp[q - p] = '\0';
+ pos = is_integer(bp);
+
+ if (pos < 0) {
+ cerr << "Unable to parse base pair positions.\n";
+ delete psum;
+ return NULL;
+ } else {
+ psum->nucs[i].bp = (uint) pos;
+ }
+
+ i++;
+ p = q;
}
fh.close();
@@ -1274,7 +1274,7 @@ parse_fastphase(string path)
if (fh.fail()) {
cerr << "Error opening input file '" << path << "'\n";
- return NULL;
+ return NULL;
}
cerr << "Parsing " << filepath << "...\n";
@@ -1287,10 +1287,10 @@ parse_fastphase(string path)
fh.getline(line, max_len);
if (!fh.good()) {
- cerr << "Unable to find file section entitled 'BEGIN GENOTYPES'\n";
- delete psum;
+ cerr << "Unable to find file section entitled 'BEGIN GENOTYPES'\n";
+ delete psum;
return NULL;
- }
+ }
} while (strcmp(line, "BEGIN GENOTYPES") != 0);
@@ -1304,62 +1304,62 @@ parse_fastphase(string path)
fh.getline(line, max_len);
do {
- //
- // Create a new Sample object and store the sample label.
- //
- sindex = psum->add_sample(line);
-
- //
- // Get the first set of phased genotypes.
- //
- buf.clear();
- do {
- fh.clear();
- fh.getline(line, max_len);
- buf += line;
- } while (fh.fail() && !fh.bad() && !fh.eof());
-
- //
- // Count the number of genotypes on this line (they should be space deliniated).
- //
- i = 0;
- for (p = buf.c_str(); *p != '\0'; p++)
- if (*p != ' ') psum->samples[sindex].size++;
- //
- // Store the genotypes into our internal buffer.
- //
- psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
- for (p = buf.c_str(); *p != '\0'; p++) {
- if (*p == ' ') continue;
- psum->samples[sindex].nucs_1[i] = *p;
- i++;
- }
-
- // len = strlen(line);
- // if (line[len - 1] == '\r') line[len - 1] = '\0';
-
- //
- // Get the second set of phased genotypes.
- //
- buf.clear();
- do {
- fh.clear();
- fh.getline(line, max_len);
- buf += line;
- } while (fh.fail() && !fh.bad() && !fh.eof());
-
- i = 0;
- psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
- for (p = buf.c_str(); *p != '\0'; p++) {
- if (*p == ' ') continue;
- psum->samples[sindex].nucs_2[i] = *p;
- i++;
- }
-
- //
- // Get the sample label of the next record.
- //
- fh.getline(line, max_len);
+ //
+ // Create a new Sample object and store the sample label.
+ //
+ sindex = psum->add_sample(line);
+
+ //
+ // Get the first set of phased genotypes.
+ //
+ buf.clear();
+ do {
+ fh.clear();
+ fh.getline(line, max_len);
+ buf += line;
+ } while (fh.fail() && !fh.bad() && !fh.eof());
+
+ //
+ // Count the number of genotypes on this line (they should be space deliniated).
+ //
+ i = 0;
+ for (p = buf.c_str(); *p != '\0'; p++)
+ if (*p != ' ') psum->samples[sindex].size++;
+ //
+ // Store the genotypes into our internal buffer.
+ //
+ psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
+ for (p = buf.c_str(); *p != '\0'; p++) {
+ if (*p == ' ') continue;
+ psum->samples[sindex].nucs_1[i] = *p;
+ i++;
+ }
+
+ // len = strlen(line);
+ // if (line[len - 1] == '\r') line[len - 1] = '\0';
+
+ //
+ // Get the second set of phased genotypes.
+ //
+ buf.clear();
+ do {
+ fh.clear();
+ fh.getline(line, max_len);
+ buf += line;
+ } while (fh.fail() && !fh.bad() && !fh.eof());
+
+ i = 0;
+ psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
+ for (p = buf.c_str(); *p != '\0'; p++) {
+ if (*p == ' ') continue;
+ psum->samples[sindex].nucs_2[i] = *p;
+ i++;
+ }
+
+ //
+ // Get the sample label of the next record.
+ //
+ fh.getline(line, max_len);
} while (strcmp(line, "END GENOTYPES") != 0 && fh.good());
@@ -1369,10 +1369,10 @@ parse_fastphase(string path)
}
//
-// Code to parse Beagle format.
+// Code to parse Beagle format.
//
-PhasedSummary *
-parse_beagle(map<int, CSLocus *> &catalog, string path)
+PhasedSummary *
+parse_beagle(map<int, CSLocus *> &catalog, string path)
{
gzFile gz_fh;
char *line;
@@ -1391,8 +1391,8 @@ parse_beagle(map<int, CSLocus *> &catalog, string path)
filepath = path + ".phased.gz";
gz_fh = gzopen(filepath.c_str(), "rb");
if (!gz_fh) {
- cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
- return NULL;
+ cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
+ return NULL;
}
cerr << "Parsing " << filepath << "...\n";
@@ -1417,48 +1417,48 @@ parse_beagle(map<int, CSLocus *> &catalog, string path)
// 'M' is a marker, count the number of markers.
//
do {
- eol = false;
- buf.clear();
- do {
- gzgets(gz_fh, line, line_len);
- buf += line;
-
- len = strlen(line);
- if (len > 0 && line[len - 1] == '\n') {
- eol = true;
- line[len - 1] = '\0';
- }
- } while (!gzeof(gz_fh) && !eol);
-
- if (line_len < buf.length()) {
- // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
- delete [] line;
- line = new char[buf.length() + 1];
- line_len = buf.length() + 1;
- memset(line, '\0', line_len);
- }
-
- if (buf[0] == 'M') {
- num_genotypes++;
- } else if (buf[0] == 'I') {
- //
- // Count the number of samples.
- //
- parse_ssv(buf.c_str(), parts);
- num_samples = (parts.size() - 2) / 2;
- }
+ eol = false;
+ buf.clear();
+ do {
+ gzgets(gz_fh, line, line_len);
+ buf += line;
+
+ len = strlen(line);
+ if (len > 0 && line[len - 1] == '\n') {
+ eol = true;
+ line[len - 1] = '\0';
+ }
+ } while (!gzeof(gz_fh) && !eol);
+
+ if (line_len < buf.length()) {
+ // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
+ delete [] line;
+ line = new char[buf.length() + 1];
+ line_len = buf.length() + 1;
+ memset(line, '\0', line_len);
+ }
+
+ if (buf[0] == 'M') {
+ num_genotypes++;
+ } else if (buf[0] == 'I') {
+ //
+ // Count the number of samples.
+ //
+ parse_ssv(buf.c_str(), parts);
+ num_samples = (parts.size() - 2) / 2;
+ }
} while (!gzeof(gz_fh));
PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
for (uint j = 2; j < parts.size(); j++) {
- if (j % 2 == 0) {
- sindex = psum->add_sample(parts[j]);
- psum->samples[sindex].size = num_genotypes;
- psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
- psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
- }
+ if (j % 2 == 0) {
+ sindex = psum->add_sample(parts[j]);
+ psum->samples[sindex].size = num_genotypes;
+ psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
+ psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
+ }
}
cerr << " Found " << num_samples << " samples; " << num_genotypes << " genotypes.\n";
@@ -1469,47 +1469,47 @@ parse_beagle(map<int, CSLocus *> &catalog, string path)
memset(line, '\0', line_len);
do {
- do {
- gzgets(gz_fh, line, line_len);
- } while (!gzeof(gz_fh) && line[0] != 'M');
-
- len = strlen(line);
-
- if (len == 0) break;
- if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
-
- parse_ssv(line, parts);
-
- //
- // Parse the catalog locus ID and the column number of the SNP:
- // e.g. LocId_column or 10329_37
- //
- p = parts[1].c_str();
- for (q = p + 1; *q != '_' && *q != '\0'; q++);
- strncpy(cat_loc_str, p, q - p);
- cat_loc_str[q-p] = '\0';
- q++;
- strcpy(col_str, q);
-
- psum->nucs[marker_num].clocus = is_integer(cat_loc_str);
- psum->nucs[marker_num].col = is_integer(col_str);
-
- //
- // Store the genotypes into our internal buffer.
- //
- sindex = 0;
- i = 2;
- while (i < parts.size()) {
- p = parts[i].c_str();
- psum->samples[sindex].nucs_1[marker_num] = *p;
- i++;
- p = parts[i].c_str();
- psum->samples[sindex].nucs_2[marker_num] = *p;
- i++;
- sindex++;
- }
-
- marker_num++;
+ do {
+ gzgets(gz_fh, line, line_len);
+ } while (!gzeof(gz_fh) && line[0] != 'M');
+
+ len = strlen(line);
+
+ if (len == 0) break;
+ if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
+
+ parse_ssv(line, parts);
+
+ //
+ // Parse the catalog locus ID and the column number of the SNP:
+ // e.g. LocId_column or 10329_37
+ //
+ p = parts[1].c_str();
+ for (q = p + 1; *q != '_' && *q != '\0'; q++);
+ strncpy(cat_loc_str, p, q - p);
+ cat_loc_str[q-p] = '\0';
+ q++;
+ strcpy(col_str, q);
+
+ psum->nucs[marker_num].clocus = is_integer(cat_loc_str);
+ psum->nucs[marker_num].col = is_integer(col_str);
+
+ //
+ // Store the genotypes into our internal buffer.
+ //
+ sindex = 0;
+ i = 2;
+ while (i < parts.size()) {
+ p = parts[i].c_str();
+ psum->samples[sindex].nucs_1[marker_num] = *p;
+ i++;
+ p = parts[i].c_str();
+ psum->samples[sindex].nucs_2[marker_num] = *p;
+ i++;
+ sindex++;
+ }
+
+ marker_num++;
} while (!gzeof(gz_fh));
@@ -1520,18 +1520,18 @@ parse_beagle(map<int, CSLocus *> &catalog, string path)
//
CSLocus *loc;
for (i = 0; i < psum->size; i++) {
- loc = catalog[psum->nucs[i].clocus];
- psum->nucs[i].bp = loc->sort_bp(psum->nucs[i].col);
+ loc = catalog[psum->nucs[i].clocus];
+ psum->nucs[i].bp = loc->sort_bp(psum->nucs[i].col);
}
return psum;
}
//
-// Code to parse Beagle format.
+// Code to parse Beagle format.
//
-PhasedSummary *
-parse_beagle_haplotypes(map<int, CSLocus *> &catalog, string path)
+PhasedSummary *
+parse_beagle_haplotypes(map<int, CSLocus *> &catalog, string path)
{
gzFile gz_fh;
char *line;
@@ -1550,8 +1550,8 @@ parse_beagle_haplotypes(map<int, CSLocus *> &catalog, string path)
filepath = path + ".phased.gz";
gz_fh = gzopen(filepath.c_str(), "rb");
if (!gz_fh) {
- cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
- return NULL;
+ cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
+ return NULL;
}
cerr << "Parsing " << filepath << "...\n";
@@ -1576,54 +1576,54 @@ parse_beagle_haplotypes(map<int, CSLocus *> &catalog, string path)
// 'M' is a marker, count the number of markers.
//
do {
- eol = false;
- buf.clear();
- do {
- gzgets(gz_fh, line, line_len);
- buf += line;
-
- len = strlen(line);
- if (len > 0 && line[len - 1] == '\n') {
- eol = true;
- line[len - 1] = '\0';
- }
- } while (!gzeof(gz_fh) && !eol);
-
- if (line_len < buf.length()) {
- // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
- delete [] line;
- line = new char[buf.length() + 1];
- line_len = buf.length() + 1;
- memset(line, '\0', line_len);
- }
-
- if (buf[0] == 'M') {
- //
- // Count the number of genotypes by counting the number or nucleotides in each
- // haplotype for each marker.
- //
- parse_ssv(buf.c_str(), parts);
- num_genotypes += parts[2].length();
-
- } else if (buf[0] == 'I') {
- //
- // Count the number of samples.
- //
- parse_ssv(buf.c_str(), samples);
- num_samples = (samples.size() - 2) / 2;
- }
+ eol = false;
+ buf.clear();
+ do {
+ gzgets(gz_fh, line, line_len);
+ buf += line;
+
+ len = strlen(line);
+ if (len > 0 && line[len - 1] == '\n') {
+ eol = true;
+ line[len - 1] = '\0';
+ }
+ } while (!gzeof(gz_fh) && !eol);
+
+ if (line_len < buf.length()) {
+ // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
+ delete [] line;
+ line = new char[buf.length() + 1];
+ line_len = buf.length() + 1;
+ memset(line, '\0', line_len);
+ }
+
+ if (buf[0] == 'M') {
+ //
+ // Count the number of genotypes by counting the number or nucleotides in each
+ // haplotype for each marker.
+ //
+ parse_ssv(buf.c_str(), parts);
+ num_genotypes += parts[2].length();
+
+ } else if (buf[0] == 'I') {
+ //
+ // Count the number of samples.
+ //
+ parse_ssv(buf.c_str(), samples);
+ num_samples = (samples.size() - 2) / 2;
+ }
} while (!gzeof(gz_fh));
PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
for (uint j = 2; j < samples.size(); j++) {
- if (j % 2 == 0) {
- sindex = psum->add_sample(samples[j]);
- psum->samples[sindex].size = num_genotypes;
- psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
- psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
- }
+ if (j % 2 == 0) {
+ sindex = psum->add_sample(samples[j]);
+ psum->samples[sindex].size = num_genotypes;
+ psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
+ psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
+ }
}
cerr << " Found " << num_samples << " samples; " << num_genotypes << " genotypes.\n";
@@ -1636,55 +1636,55 @@ parse_beagle_haplotypes(map<int, CSLocus *> &catalog, string path)
memset(line, '\0', line_len);
do {
- do {
- gzgets(gz_fh, line, line_len);
- } while (!gzeof(gz_fh) && line[0] != 'M');
-
- len = strlen(line);
-
- if (len == 0) break;
- if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
-
- parse_ssv(line, parts);
-
- //
- // Use the catalog to look up the basepair positions for each catalog locus.
- //
- cat_loc = is_integer(parts[1].c_str());
- loc = catalog[cat_loc];
- hap_len = parts[2].length();
-
- if (hap_len != loc->snps.size())
- cerr << "Haplotypes don't match between catalog and beagle; Locus ID: " << loc->id << "; beagle hap len: " << hap_len << "; catalog hap len: " << loc->snps.size() << "\n";
-
- for (j = 0, i = marker_num; i < marker_num + hap_len; i++, j++) {
- psum->nucs[i].clocus = cat_loc;
- psum->nucs[i].col = loc->snps[j]->col;
- psum->nucs[i].bp = loc->sort_bp(psum->nucs[i].col);
- }
-
- //
- // Store the genotypes into our internal buffer.
- //
- sindex = 0;
- i = 2;
- while (i < parts.size()) {
- p = parts[i].c_str();
- for (j = marker_num; j < marker_num + hap_len; j++) {
- psum->samples[sindex].nucs_1[j] = *p;
- p++;
- }
- i++;
- p = parts[i].c_str();
- for (j = marker_num; j < marker_num + hap_len; j++) {
- psum->samples[sindex].nucs_2[j] = *p;
- p++;
- }
- i++;
- sindex++;
- }
-
- marker_num += hap_len;
+ do {
+ gzgets(gz_fh, line, line_len);
+ } while (!gzeof(gz_fh) && line[0] != 'M');
+
+ len = strlen(line);
+
+ if (len == 0) break;
+ if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
+
+ parse_ssv(line, parts);
+
+ //
+ // Use the catalog to look up the basepair positions for each catalog locus.
+ //
+ cat_loc = is_integer(parts[1].c_str());
+ loc = catalog[cat_loc];
+ hap_len = parts[2].length();
+
+ if (hap_len != loc->snps.size())
+ cerr << "Haplotypes don't match between catalog and beagle; Locus ID: " << loc->id << "; beagle hap len: " << hap_len << "; catalog hap len: " << loc->snps.size() << "\n";
+
+ for (j = 0, i = marker_num; i < marker_num + hap_len; i++, j++) {
+ psum->nucs[i].clocus = cat_loc;
+ psum->nucs[i].col = loc->snps[j]->col;
+ psum->nucs[i].bp = loc->sort_bp(psum->nucs[i].col);
+ }
+
+ //
+ // Store the genotypes into our internal buffer.
+ //
+ sindex = 0;
+ i = 2;
+ while (i < parts.size()) {
+ p = parts[i].c_str();
+ for (j = marker_num; j < marker_num + hap_len; j++) {
+ psum->samples[sindex].nucs_1[j] = *p;
+ p++;
+ }
+ i++;
+ p = parts[i].c_str();
+ for (j = marker_num; j < marker_num + hap_len; j++) {
+ psum->samples[sindex].nucs_2[j] = *p;
+ p++;
+ }
+ i++;
+ sindex++;
+ }
+
+ marker_num += hap_len;
} while (!gzeof(gz_fh));
@@ -1701,60 +1701,60 @@ parse_population_map(string popmap_path, map<string, int> &pop_map, map<int, int
vector<string> parts;
uint len;
- if (pmap_path.length() == 0)
- return 0;
+ if (pmap_path.length() == 0)
+ return 0;
cerr << "Parsing population map.\n";
ifstream fh(popmap_path.c_str(), ifstream::in);
if (fh.fail()) {
- cerr << "Error opening population map '" << popmap_path << "'\n";
- return 0;
+ cerr << "Error opening population map '" << popmap_path << "'\n";
+ return 0;
}
while (fh.good()) {
- fh.getline(line, max_len);
-
- len = strlen(line);
- if (len == 0) continue;
-
- //
- // Check that there is no carraige return in the buffer.
- //
- if (line[len - 1] == '\r') line[len - 1] = '\0';
-
- //
- // Ignore comments
- //
- if (line[0] == '#') continue;
-
- //
- // Parse the population map, we expect:
- // <file name> <tab> <population ID>
- //
- parse_tsv(line, parts);
-
- if (parts.size() != 2) {
- cerr << "Population map is not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
- return 0;
- }
-
- strncpy(pop_id_str, parts[1].c_str(), id_len);
- for (int i = 0; i < id_len && pop_id_str[i] != '\0'; i++)
- if (!isdigit(pop_id_str[i])) {
- cerr << "Population map is not formated correctly: expecting numerical ID in second column, found '" << parts[1] << "'.\n";
- return 0;
- }
-
- //
- // Add the sample name to population number mapping.
- //
- pop_map[parts[0]] = atoi(parts[1].c_str());
- if (pop_cnts.count(atoi(parts[1].c_str())) == 0)
- pop_cnts[atoi(parts[1].c_str())] = 1;
- else
- pop_cnts[atoi(parts[1].c_str())]++;
+ fh.getline(line, max_len);
+
+ len = strlen(line);
+ if (len == 0) continue;
+
+ //
+ // Check that there is no carraige return in the buffer.
+ //
+ if (line[len - 1] == '\r') line[len - 1] = '\0';
+
+ //
+ // Ignore comments
+ //
+ if (line[0] == '#') continue;
+
+ //
+ // Parse the population map, we expect:
+ // <file name> <tab> <population ID>
+ //
+ parse_tsv(line, parts);
+
+ if (parts.size() != 2) {
+ cerr << "Population map is not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
+ return 0;
+ }
+
+ strncpy(pop_id_str, parts[1].c_str(), id_len);
+ for (int i = 0; i < id_len && pop_id_str[i] != '\0'; i++)
+ if (!isdigit(pop_id_str[i])) {
+ cerr << "Population map is not formated correctly: expecting numerical ID in second column, found '" << parts[1] << "'.\n";
+ return 0;
+ }
+
+ //
+ // Add the sample name to population number mapping.
+ //
+ pop_map[parts[0]] = atoi(parts[1].c_str());
+ if (pop_cnts.count(atoi(parts[1].c_str())) == 0)
+ pop_cnts[atoi(parts[1].c_str())] = 1;
+ else
+ pop_cnts[atoi(parts[1].c_str())]++;
}
fh.close();
@@ -1762,7 +1762,7 @@ parse_population_map(string popmap_path, map<string, int> &pop_map, map<int, int
return 0;
}
-int
+int
build_file_list(vector<pair<int, string> > &files)
{
vector<string> parts;
@@ -1778,36 +1778,36 @@ build_file_list(vector<pair<int, string> > &files)
DIR *dir = opendir(in_path.c_str());
if (dir == NULL) {
- cerr << "Unable to open directory '" << in_path << "' for reading.\n";
- exit(1);
+ cerr << "Unable to open directory '" << in_path << "' for reading.\n";
+ exit(1);
}
switch(in_file_type) {
case FileT::beagle:
- pattern = ".phased.gz";
- break;
+ pattern = ".phased.gz";
+ break;
case FileT::fastphase:
default:
- pattern = "_hapguess_switch.out";
- break;
+ pattern = "_hapguess_switch.out";
+ break;
}
while ((direntry = readdir(dir)) != NULL) {
- file = direntry->d_name;
+ file = direntry->d_name;
- if (file == "." || file == "..")
- continue;
+ if (file == "." || file == "..")
+ continue;
- pos = file.rfind(pattern);
- if (pos < file.length())
- files.push_back(make_pair(1, file.substr(0, pos)));
+ pos = file.rfind(pattern);
+ if (pos < file.length())
+ files.push_back(make_pair(1, file.substr(0, pos)));
}
closedir(dir);
if (files.size() == 0) {
- cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
- return 0;
+ cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
+ return 0;
}
return 1;
@@ -1815,62 +1815,62 @@ build_file_list(vector<pair<int, string> > &files)
int parse_command_line(int argc, char* argv[]) {
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"haplotypes", no_argument, NULL, 'H'},
- {"skip_zeros", no_argument, NULL, 'Z'},
+ {"haplotypes", no_argument, NULL, 'H'},
+ {"skip_zeros", no_argument, NULL, 'Z'},
{"infile_type", required_argument, NULL, 't'},
- {"num_threads", required_argument, NULL, 'p'},
- {"in_path", required_argument, NULL, 'P'},
- {"cat_path", required_argument, NULL, 'S'},
- {"pop_map", required_argument, NULL, 'M'},
- {"batch_id", required_argument, NULL, 'b'},
- {"dprime_bin_size", required_argument, NULL, 'B'},
- {"minor_allele_freq", required_argument, NULL, 'a'},
- {"min_inform_pairs", required_argument, NULL, 'm'},
- {"dprime_threshold", required_argument, NULL, 'T'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvZHAb:M:t:P:S:p:a:B:T:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'b':
- batch_id = is_integer(optarg);
- if (batch_id < 0) {
- cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 'p':
- num_threads = atoi(optarg);
- break;
- case 'a':
- minor_freq_lim = atof(optarg);
- break;
- case 'm':
- min_inform_pairs = atof(optarg);
- break;
- case 'P':
- in_path = optarg;
- break;
- case 'S':
- cat_path = optarg;
- break;
- case 't':
+ {"num_threads", required_argument, NULL, 'p'},
+ {"in_path", required_argument, NULL, 'P'},
+ {"cat_path", required_argument, NULL, 'S'},
+ {"pop_map", required_argument, NULL, 'M'},
+ {"batch_id", required_argument, NULL, 'b'},
+ {"dprime_bin_size", required_argument, NULL, 'B'},
+ {"minor_allele_freq", required_argument, NULL, 'a'},
+ {"min_inform_pairs", required_argument, NULL, 'm'},
+ {"dprime_threshold", required_argument, NULL, 'T'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvZHAb:M:t:P:S:p:a:B:T:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'b':
+ batch_id = is_integer(optarg);
+ if (batch_id < 0) {
+ cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 'p':
+ num_threads = atoi(optarg);
+ break;
+ case 'a':
+ minor_freq_lim = atof(optarg);
+ break;
+ case 'm':
+ min_inform_pairs = atof(optarg);
+ break;
+ case 'P':
+ in_path = optarg;
+ break;
+ case 'S':
+ cat_path = optarg;
+ break;
+ case 't':
if (strcasecmp(optarg, "phase") == 0)
in_file_type = FileT::phase;
else if (strcasecmp(optarg, "fastphase") == 0)
@@ -1879,57 +1879,57 @@ int parse_command_line(int argc, char* argv[]) {
in_file_type = FileT::beagle;
else
in_file_type = FileT::unknown;
- break;
- case 'M':
- pmap_path = optarg;
- break;
- case 'H':
- haplotypes = true;
- break;
- case 'Z':
- write_zeros = false;
- break;
- case 'B':
- bucket_dist = atoi(optarg);
- break;
- case 'T':
- dprime_threshold = true;
- dprime_threshold_level = atof(optarg);
- break;
+ break;
+ case 'M':
+ pmap_path = optarg;
+ break;
+ case 'H':
+ haplotypes = true;
+ break;
+ case 'Z':
+ write_zeros = false;
+ break;
+ case 'B':
+ bucket_dist = atoi(optarg);
+ break;
+ case 'T':
+ dprime_threshold = true;
+ dprime_threshold_level = atof(optarg);
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
- default:
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+ default:
+ help();
+ abort();
+ }
}
if (in_path.length() == 0) {
- cerr << "You must specify a path to the directory containing Stacks output files.\n";
- help();
+ cerr << "You must specify a path to the directory containing Stacks output files.\n";
+ help();
}
- if (in_path.at(in_path.length() - 1) != '/')
- in_path += "/";
+ if (in_path.at(in_path.length() - 1) != '/')
+ in_path += "/";
if (minor_freq_lim > 0) {
- if (minor_freq_lim > 1)
- minor_freq_lim = minor_freq_lim / 100;
+ if (minor_freq_lim > 1)
+ minor_freq_lim = minor_freq_lim / 100;
- if (minor_freq_lim > 0.5) {
- cerr << "Unable to parse the minor allele frequency\n";
- help();
- }
+ if (minor_freq_lim > 0.5) {
+ cerr << "Unable to parse the minor allele frequency\n";
+ help();
+ }
}
if (min_inform_pairs > 0) {
- if (min_inform_pairs > 1)
- min_inform_pairs = min_inform_pairs / 100;
+ if (min_inform_pairs > 1)
+ min_inform_pairs = min_inform_pairs / 100;
}
return 0;
@@ -1944,21 +1944,21 @@ void version() {
void help() {
std::cerr << "phasedstacks " << VERSION << "\n"
<< "phasedstacks -b id -S path -P path -t file_type [-p threads] [-M popmap] [-v] [-h]" << "\n"
- << " b: Stacks batch ID.\n"
- << " P: path to the phased output files.\n"
- << " S: path to the Stacks output files.\n"
- << " t: input file type. Supported types: fastphase, and beagle.\n"
- << " p: number of processes to run in parallel sections of code.\n"
- << " M: path to the population map, a tab separated file describing which individuals belong in which population.\n"
- << " v: print program version." << "\n"
- << " h: display this help messsage." << "\n"
- << " --haplotypes: data were phased as RAD locus haplotypes.\n"
- << " --dprime_bin_size: size of buckets for binning SNPs at a particular distance to calculate the mean D' value.\n"
- << " --dprime_threshold <val>: if D' values fall above <val>, set the D' to 1, otherwise set D' to 0.\n\n"
- << " Filtering options:\n"
- << " --skip_zeros: do not include D' values of zero in the D' output.\n"
- << " --minor_allele_freq: specify a minimum minor allele frequency required to process a nucleotide site (0 < a < 0.5).\n"
- << " --min_inform_pairs: when building D' haplotype blocks, the minimum number of informative D' measures to combine two blocks (default 0.9).\n\n";
+ << " b: Stacks batch ID.\n"
+ << " P: path to the phased output files.\n"
+ << " S: path to the Stacks output files.\n"
+ << " t: input file type. Supported types: fastphase, and beagle.\n"
+ << " p: number of processes to run in parallel sections of code.\n"
+ << " M: path to the population map, a tab separated file describing which individuals belong in which population.\n"
+ << " v: print program version." << "\n"
+ << " h: display this help messsage." << "\n"
+ << " --haplotypes: data were phased as RAD locus haplotypes.\n"
+ << " --dprime_bin_size: size of buckets for binning SNPs at a particular distance to calculate the mean D' value.\n"
+ << " --dprime_threshold <val>: if D' values fall above <val>, set the D' to 1, otherwise set D' to 0.\n\n"
+ << " Filtering options:\n"
+ << " --skip_zeros: do not include D' values of zero in the D' output.\n"
+ << " --minor_allele_freq: specify a minimum minor allele frequency required to process a nucleotide site (0 < a < 0.5).\n"
+ << " --min_inform_pairs: when building D' haplotype blocks, the minimum number of informative D' measures to combine two blocks (default 0.9).\n\n";
exit(0);
diff --git a/src/phasedstacks.h b/src/phasedstacks.h
index 96ddd00..2545882 100644
--- a/src/phasedstacks.h
+++ b/src/phasedstacks.h
@@ -77,16 +77,16 @@ public:
char *nucs_2;
Sample() {
- this->id = 0;
- this->size = 0;
- this->nucs_1 = NULL;
- this->nucs_2 = NULL;
+ this->id = 0;
+ this->size = 0;
+ this->nucs_1 = NULL;
+ this->nucs_2 = NULL;
}
~Sample() {
- if (this->nucs_1 != NULL)
- delete [] this->nucs_1;
- if (this->nucs_2 != NULL)
- delete [] this->nucs_2;
+ if (this->nucs_1 != NULL)
+ delete [] this->nucs_1;
+ if (this->nucs_2 != NULL)
+ delete [] this->nucs_2;
}
};
@@ -103,11 +103,11 @@ public:
// nuc[3] == T
NucSum() {
- this->freq = 1.0;
- this->bp = 0;
- this->clocus = 0;
- for (uint i = 0; i < 4; i++)
- this->nuc[i] = 0;
+ this->freq = 1.0;
+ this->bp = 0;
+ this->clocus = 0;
+ for (uint i = 0; i < 4; i++)
+ this->nuc[i] = 0;
}
};
@@ -122,12 +122,12 @@ public:
loc_t type;
dPrime() {
- this->dprime = 0.0;
- this->chisq_p = false;
- this->var = 0.0;
- this->ci_high = 0.0;
- this->ci_low = 0.0;
- this->type = uninformative;
+ this->dprime = 0.0;
+ this->chisq_p = false;
+ this->var = 0.0;
+ this->ci_high = 0.0;
+ this->ci_low = 0.0;
+ this->type = uninformative;
}
};
@@ -143,41 +143,41 @@ public:
bool **recomb;
PhasedSummary(uint num_samples, uint num_genotypes) {
- this->sample_cnt = num_samples;
- this->samples = new Sample[this->sample_cnt];
- this->size = num_genotypes;
- this->nucs = new NucSum[this->size];
- this->dprime = new dPrime *[this->size];
- for (uint i = 0; i < this->size; i++)
- this->dprime[i] = new dPrime[this->size];
-
- this->recomb = new bool *[this->size];
- for (uint i = 0; i < this->size; i++) {
- this->recomb[i] = new bool[this->size];
- memset(this->recomb[i], 0, this->size);
- }
+ this->sample_cnt = num_samples;
+ this->samples = new Sample[this->sample_cnt];
+ this->size = num_genotypes;
+ this->nucs = new NucSum[this->size];
+ this->dprime = new dPrime *[this->size];
+ for (uint i = 0; i < this->size; i++)
+ this->dprime[i] = new dPrime[this->size];
+
+ this->recomb = new bool *[this->size];
+ for (uint i = 0; i < this->size; i++) {
+ this->recomb[i] = new bool[this->size];
+ memset(this->recomb[i], 0, this->size);
+ }
}
~PhasedSummary() {
- if (this->nucs != NULL)
- delete [] this->nucs;
- if (this->dprime != NULL) {
- for (uint i = 0; i < this->size; i++)
- delete [] this->dprime[i];
- delete [] this->dprime;
- }
- if (this->recomb != NULL) {
- for (uint i = 0; i < this->size; i++)
- delete [] this->recomb[i];
- delete [] this->recomb;
- }
- if (this->samples != NULL)
- delete [] this->samples;
+ if (this->nucs != NULL)
+ delete [] this->nucs;
+ if (this->dprime != NULL) {
+ for (uint i = 0; i < this->size; i++)
+ delete [] this->dprime[i];
+ delete [] this->dprime;
+ }
+ if (this->recomb != NULL) {
+ for (uint i = 0; i < this->size; i++)
+ delete [] this->recomb[i];
+ delete [] this->recomb;
+ }
+ if (this->samples != NULL)
+ delete [] this->samples;
}
int add_sample(string name) {
- uint i = this->sample_map.size();
- this->sample_map[name] = i;
- this->samples[i].name = name;
- return i;
+ uint i = this->sample_map.size();
+ this->sample_map[name] = i;
+ this->samples[i].name = name;
+ return i;
}
};
@@ -187,7 +187,7 @@ public:
HBlock *next;
HBlock() {
- this->next = NULL;
+ this->next = NULL;
}
};
@@ -196,21 +196,21 @@ class dPrimeBlocks {
public:
dPrimeBlocks() {
- this->_head = NULL;
+ this->_head = NULL;
}
~dPrimeBlocks() {
- HBlock *cur, *next;
- cur = this->_head;
- next = cur->next;
-
- while (next != NULL) {
- delete cur;
- cur = next;
- next = cur->next;
- }
+ HBlock *cur, *next;
+ cur = this->_head;
+ next = cur->next;
+
+ while (next != NULL) {
+ delete cur;
+ cur = next;
+ next = cur->next;
+ }
}
HBlock *head() {
- return this->_head;
+ return this->_head;
}
HBlock *initialize(set<int> &);
HBlock *merge_adjacent(HBlock *);
diff --git a/src/populations.cc b/src/populations.cc
index 072c494..2ea9f72 100644
--- a/src/populations.cc
+++ b/src/populations.cc
@@ -19,7 +19,7 @@
//
//
-// populations -- generate population genetic statistics and output
+// populations -- generate population genetic statistics and output
// haplotypes in a population context.
//
@@ -38,7 +38,7 @@ typedef MetaPopInfo::Sample Sample;
typedef MetaPopInfo::Pop Pop;
typedef MetaPopInfo::Group Group;
-extern int **encoded_gtypes;
+extern int encoded_gtypes[4][4];
// Global variables to hold command-line options.
InputMode input_mode = InputMode::stacks;
@@ -572,7 +572,7 @@ int main (int argc, char* argv[]) {
implement_random_snp_whitelist(catalog, psum, whitelist);
//
- // Remove the accumulated SNPs
+ // Remove the accumulated SNPs
//
cerr << "Removing " << blacklist.size() << " additional loci for which all variant sites were filtered...";
set<int> empty_list;
@@ -649,7 +649,7 @@ int main (int argc, char* argv[]) {
if (structure_out && ordered_export)
write_structure_ordered(catalog, pmap, psum, log_fh);
- else if (structure_out)
+ else if (structure_out)
write_structure(catalog, pmap, psum);
if (fastphase_out)
@@ -672,7 +672,7 @@ int main (int argc, char* argv[]) {
if (treemix_out)
write_treemix(catalog, pmap, psum);
-
+
if (phylip_out || phylip_var)
write_phylip(catalog, pmap, psum);
@@ -795,8 +795,8 @@ void vcfcomp_simplify_pmap (map<int, CSLocus*>& catalog, PopMap<CSLocus>* pmap)
}
int
-apply_locus_constraints(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
+apply_locus_constraints(map<int, CSLocus *> &catalog,
+ PopMap<CSLocus> *pmap,
ofstream &log_fh)
{
uint pop_sthg;
@@ -854,8 +854,8 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
//
// Check that each sample is over the minimum stack depth for this locus.
//
- if (d[i] != NULL &&
- min_stack_depth > 0 &&
+ if (d[i] != NULL &&
+ min_stack_depth > 0 &&
d[i]->tot_depth < min_stack_depth) {
below_stack_dep++;
delete d[i];
@@ -866,8 +866,8 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
//
// Check that each sample is over the log likelihood threshold.
//
- if (d[i] != NULL &&
- filter_lnl &&
+ if (d[i] != NULL &&
+ filter_lnl &&
d[i]->lnl < lnl_limit) {
below_lnl_thresh++;
delete d[i];
@@ -885,14 +885,14 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
}
//
- // Check that the counts for each population are over sample_limit. If not, zero out
+ // Check that the counts for each population are over sample_limit. If not, zero out
// the members of that population.
//
for (uint i = 0; i < pop_cnt; i++) {
const Pop& pop = mpopi.pops()[pop_order[i]];
pct = (double) pop_cnts[i] / (double) pop_tot[i];
-
+
if (pop_cnts[i] > 0 && pct < sample_limit) {
//cerr << "Removing population " << pop_order[i] << " at locus: " << loc->id << "; below sample limit: " << pct << "\n";
for (uint j = pop.first_sample; j <= pop.last_sample; j++) {
@@ -936,7 +936,7 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
//
// Remove loci
//
- if (min_stack_depth > 0)
+ if (min_stack_depth > 0)
cerr << "Removed " << below_stack_dep << " samples from loci that are below the minimum stack depth of " << min_stack_depth << "x\n";
if (filter_lnl)
cerr << "Removed " << below_lnl_thresh << " samples from loci that are below the log likelihood threshold of " << lnl_limit << "\n";
@@ -958,7 +958,7 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
}
int
-prune_polymorphic_sites(map<int, CSLocus *> &catalog,
+prune_polymorphic_sites(map<int, CSLocus *> &catalog,
PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum,
map<int, set<int> > &whitelist, set<int> &blacklist,
@@ -979,7 +979,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
//
// If the whitelist is populated, use it as a guide for what loci to consider.
- //
+ //
// Construct a new whitelist along the way, that is a subset of the existing list.
//
if (whitelist.size() > 0) {
@@ -997,7 +997,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
s = psum->locus(loc->id);
//
- // Check that each SNP in this locus is above the sample_limit and that
+ // Check that each SNP in this locus is above the sample_limit and that
// each SNP is above the minor allele frequency. If so, add it back to
// the whiteliest.
//
@@ -1021,7 +1021,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
het_prune = false;
inc_prune = false;
pop_prune_list.clear();
-
+
for (size_t p=0; p<mpopi.pops().size(); ++p) {
if (s[p]->nucs[loc->snps[i]->col].incompatible_site)
inc_prune = true;
@@ -1045,7 +1045,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
d = pmap->locus(loc->id);
const Pop& pop = mpopi.pops()[p];
for (uint k = pop.first_sample; k <= pop.last_sample; k++) {
- if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
+ if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
continue;
if (d[k]->model != NULL) {
d[k]->model[loc->snps[i]->col] = 'U';
@@ -1053,7 +1053,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
}
}
}
-
+
if (t->nucs[loc->snps[i]->col].allele_cnt > 1) {
//
// Test for minor allele frequency.
@@ -1076,7 +1076,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
<< loc->id << "\t"
<< loc->loc.chr << "\t"
<< loc->sort_bp(loc->snps[i]->col) +1 << "\t"
- << loc->snps[i]->col << "\t";
+ << loc->snps[i]->col << "\t";
if (inc_prune)
log_fh << "incompatible_site\n";
else if (sample_prune)
@@ -1137,7 +1137,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
het_prune = false;
inc_prune = false;
pop_prune_list.clear();
-
+
for (size_t p = 0; p < mpopi.pops().size(); p++) {
if (s[p]->nucs[loc->snps[i]->col].incompatible_site)
inc_prune = true;
@@ -1160,7 +1160,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
d = pmap->locus(loc->id);
const Pop& pop = mpopi.pops()[p];
for (uint k = pop.first_sample; k <= pop.last_sample; k++) {
- if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
+ if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
continue;
if (d[k]->model != NULL) {
d[k]->model[loc->snps[i]->col] = 'U';
@@ -1168,7 +1168,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
}
}
}
-
+
if (t->nucs[loc->snps[i]->col].allele_cnt > 1) {
//
// Test for minor allele frequency.
@@ -1226,8 +1226,8 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
return pruned;
}
-bool
-order_unordered_loci(map<int, CSLocus *> &catalog)
+bool
+order_unordered_loci(map<int, CSLocus *> &catalog)
{
map<int, CSLocus *>::iterator it;
CSLocus *loc;
@@ -1235,7 +1235,7 @@ order_unordered_loci(map<int, CSLocus *> &catalog)
for (it = catalog.begin(); it != catalog.end(); it++) {
loc = it->second;
- if (strlen(loc->loc.chr) > 0)
+ if (strlen(loc->loc.chr) > 0)
chrs.insert(loc->loc.chr);
}
@@ -1310,8 +1310,8 @@ log_haplotype_cnts(map<int, CSLocus *> &catalog, ofstream &log_fh)
return 0;
}
-int
-tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
+int
+tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
{
map<int, CSLocus *>::iterator it;
vector<char *>::iterator hit;
@@ -1327,7 +1327,7 @@ tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
cnt = 0.0;
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL)
+ if (d[i] == NULL)
continue;
if (d[i]->obshap.size() > 1)
@@ -1349,7 +1349,7 @@ tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
}
int
-merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
+merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
map<int, pair<merget, int> > &merge_map,
ofstream &log_fh)
@@ -1378,7 +1378,7 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
cerr << "To merge adjacent loci at least " << merge_prune_lim * 100 << "% of samples must have both adjacent loci;"
<< " the remaining " << 100 - (merge_prune_lim * 100) << "% of individuals will be pruned.\n"
<< "Attempting to merge adjacent loci that share a cutsite...";
-
+
if (verbose)
log_fh << "\n#\n# List of locus pairs that share a cutsite that failed to merge because they could not be phased.\n#\n";
@@ -1443,31 +1443,31 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
if (prune_pct < merge_prune_lim) {
int pct = (int) (prune_pct * 100);
missing_samps_dist[pct]++;
- if (verbose) log_fh << "Missing samples, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ if (verbose) log_fh << "Missing samples, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
<< pct << "% present (" << 100 - pct << "% missing)\n";
missing_samps_cnt++;
failure++;
continue;
- }
+ }
phaset res = merge_and_phase_loci(pmap, cur, next, loci_to_destroy, log_fh);
switch(res) {
case multiple_fails:
- if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
<< "multiple failures\n";
multifails_cnt++;
phase_fail_cnt++;
failure++;
break;
case multimapping_fail:
- if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
<< "multimapping in one or more individuals\n";
multimapping_cnt++;
phase_fail_cnt++;
failure++;
break;
case nomapping_fail:
- if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
<< "no mapping in one or more individuals\n";
nomapping_cnt++;
phase_fail_cnt++;
@@ -1513,8 +1513,8 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
<< " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
<< "while " << complex_merge_cnt << " required phasing.\n"
<< " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
- << "while " << phase_fail_cnt << " failed to be phased.\n"
- << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
+ << "while " << phase_fail_cnt << " failed to be phased.\n"
+ << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
<< multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
log_fh << "\n#\n# Merging adjacent loci with a shared restriction enzyme cutsite\n#\n"
@@ -1526,8 +1526,8 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
<< " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
<< "while " << complex_merge_cnt << " required phasing.\n"
<< " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
- << "while " << phase_fail_cnt << " failed to be phased.\n"
- << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
+ << "while " << phase_fail_cnt << " failed to be phased.\n"
+ << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
<< multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
log_fh << "#\n# Distribution of loci with samples missing one of two loci to be merged\n"
<< "# Percent samples with both loci present\tNumber of cases\n";
@@ -1540,7 +1540,7 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
}
phaset
-merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
+merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
set<int> &loci_to_destroy,
ofstream &log_fh)
{
@@ -1558,7 +1558,7 @@ merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
int sample_cnt = 0;
int phased_sample_cnt = 0;
//
- // Take a census of the already phased haplotypes. We have phased haplotypes
+ // Take a census of the already phased haplotypes. We have phased haplotypes
// if for individual i:
// 1. d_1 has a single haplotype and d_2 has a single haplotype
// 2. d_1 has a single haplotpye and d_2 has multiple haplotypes
@@ -1624,7 +1624,7 @@ merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
sample_cnt++;
//
- // We should be able to find a sinlge phasing mapping for each haplotype from d_1 to d_2
+ // We should be able to find a sinlge phasing mapping for each haplotype from d_1 to d_2
// that includes all the haplotypes in these two loci.
//
vector<pair<char *, char *> > seen_phased;
@@ -1691,7 +1691,7 @@ merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
}
if (phased_sample_cnt != sample_cnt) {
- if (phased_results.count(nomapping_fail) > 0 &&
+ if (phased_results.count(nomapping_fail) > 0 &&
phased_results.count(multimapping_fail) > 0)
return multiple_fails;
else if (phased_results.count(nomapping_fail) > 0)
@@ -1736,7 +1736,7 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
//
//
- // 1. Reverse complement the SNP coordinates in the sink locus so that they are
+ // 1. Reverse complement the SNP coordinates in the sink locus so that they are
// enumerated on the positive strand. Complement the alleles as well.
//
for (uint j = 0; j < sink->snps.size(); j++) {
@@ -1754,7 +1754,7 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
src->snps[j]->col = sink->len + src->snps[j]->col - renz_olap[enz];
//
- // 3. Combine SNPs between the two catalog loci: add the SNPs from the sink (formerly on the
+ // 3. Combine SNPs between the two catalog loci: add the SNPs from the sink (formerly on the
// negative strand) in reverse order, followed by the SNPs from the src.
//
vector<SNP *> tmpsnp;
@@ -1802,23 +1802,23 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
// cerr << "CSLocus " << sink->id << ":\n"
// << "Length: " << sink->len << "; Chr: " << sink->loc.chr << "; BP: " << sink->sort_bp() << "; strand: " << (sink->loc.strand == strand_plus ? "+" : "-") << "\n"
// << " SNPs:\n";
- // for (uint j = 0; j < sink->snps.size(); j++)
- // cerr << " Col: " << sink->snps[j]->col
+ // for (uint j = 0; j < sink->snps.size(); j++)
+ // cerr << " Col: " << sink->snps[j]->col
// << " Rank 1: " << sink->snps[j]->rank_1
// << " Rank 2: " << sink->snps[j]->rank_2 << "\n";
// cerr << " Alleles:\n";
// map<string, int>::iterator ait;
- // for (ait = sink->alleles.begin(); ait != sink->alleles.end(); ait++)
+ // for (ait = sink->alleles.begin(); ait != sink->alleles.end(); ait++)
// cerr << " " << ait->first << "\n";
return 1;
}
int
-merge_datums(int sample_cnt,
+merge_datums(int sample_cnt,
int sink_locus_len,
- Datum **sink, Datum **src,
- set<string> &phased_haplotypes,
+ Datum **sink, Datum **src,
+ set<string> &phased_haplotypes,
int merge_type)
{
char tmphap[id_len], *new_hap;
@@ -1839,7 +1839,7 @@ merge_datums(int sample_cnt,
cerr << "Unexpected condition in merging datums: one datum is NULL while the other is not.\n";
//
- // 1. Reverse complement the SNP coordinates in the sink locus so that they are
+ // 1. Reverse complement the SNP coordinates in the sink locus so that they are
// enumerated on the positive strand. Complement the alleles as well.
//
for (uint j = 0; j < sink[i]->snps.size(); j++) {
@@ -1868,7 +1868,7 @@ merge_datums(int sample_cnt,
}
//
- // 4. Combine SNPs between the two datums: add the SNPs from the sink (formerly on the
+ // 4. Combine SNPs between the two datums: add the SNPs from the sink (formerly on the
// negative strand) in reverse order, followed by the SNPs from the src.
//
tmpsnp.clear();
@@ -2008,7 +2008,7 @@ merge_datums(int sample_cnt,
p = sink[i]->model;
p += offset + sink[i]->len - renz_olap[enz];
strcpy(p, src[i]->model);
-
+
sink[i]->len = model_len;
sink[i]->tot_depth = (sink[i]->tot_depth + src[i]->tot_depth) / 2;
sink[i]->lnl = (sink[i]->lnl + src[i]->lnl) / 2.0;
@@ -2022,14 +2022,14 @@ merge_datums(int sample_cnt,
return 1;
}
-int
-create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
+int
+create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
{
//
// Create a genotype map. For any set of haplotypes, this routine will
- // assign each haplotype to a genotype, e.g. given the haplotypes
- // 'AC' and 'GT' in the population, this routine will assign 'AC' == 'a'
- // and 'GT' == 'b'. If an individual is homozygous for 'AC', they will be
+ // assign each haplotype to a genotype, e.g. given the haplotypes
+ // 'AC' and 'GT' in the population, this routine will assign 'AC' == 'a'
+ // and 'GT' == 'b'. If an individual is homozygous for 'AC', they will be
// assigned an 'aa' genotype.
//
//cerr << "Creating genotype map for catalog ID " << locus->id << ", marker: " << locus->marker << ".\n";
@@ -2057,7 +2057,7 @@ create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
//
if (haplotypes.size() > 26) return 0;
- //
+ //
// Sort the haplotypes map by value
//
for (k = haplotypes.begin(); k != haplotypes.end(); k++)
@@ -2072,7 +2072,7 @@ create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
return 0;
}
-int call_population_genotypes(CSLocus *locus,
+int call_population_genotypes(CSLocus *locus,
PopMap<CSLocus> *pmap) {
//
// Fetch the array of observed haplotypes from the population
@@ -2080,7 +2080,7 @@ int call_population_genotypes(CSLocus *locus,
Datum **d = pmap->locus(locus->id);
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL)
+ if (d[i] == NULL)
continue;
vector<string> gtypes;
@@ -2109,7 +2109,7 @@ int call_population_genotypes(CSLocus *locus,
//cerr << " Adding genotype to string: " << gtypes[j] << "; " << gtype << "\n";
}
- string m = gtype.length() == 1 ?
+ string m = gtype.length() == 1 ?
gtype + gtype : gtype;
d[i]->gtype = new char[m.length() + 1];
@@ -2177,7 +2177,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
uint start = 0;
uint end = loc->len;
//
- // Check for the existence of the restriction enzyme cut site, mask off
+ // Check for the existence of the restriction enzyme cut site, mask off
// its output.
//
for (uint n = 0; n < rcnt; n++)
@@ -2205,7 +2205,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
if (d[j] == NULL)
fh << "0";
- else
+ else
switch (d[j]->obshap.size()) {
case 1:
a = encode_gtype(d[j]->obshap[0][k]);
@@ -2233,7 +2233,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
return 0;
}
-int
+int
calculate_haplotype_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
@@ -2314,7 +2314,7 @@ calculate_haplotype_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, P
loc = it->second[pos];
d = pmap->locus(loc->id);
- if (loc->snps.size() == 0)
+ if (loc->snps.size() == 0)
continue;
// cerr << "Looking at locus " << loc->id << "\n";
@@ -2333,7 +2333,7 @@ calculate_haplotype_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, P
ks->smooth(locstats);
}
- if (bootstrap_div)
+ if (bootstrap_div)
bs->add_data(locstats);
}
@@ -2366,11 +2366,11 @@ calculate_haplotype_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, P
<< l->hap_str << "\n";
}
- for (uint k = 0; k < locstats.size(); k++)
+ for (uint k = 0; k < locstats.size(); k++)
delete locstats[k];
}
- if (bootstrap_div)
+ if (bootstrap_div)
delete bs;
}
@@ -2385,7 +2385,7 @@ calculate_haplotype_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, P
}
int
-nuc_substitution_dist(map<string, int> &hap_index, double **hdists)
+nuc_substitution_dist(map<string, int> &hap_index, double **hdists)
{
vector<string> haplotypes;
map<string, int>::iterator it;
@@ -2434,7 +2434,7 @@ nuc_substitution_dist(map<string, int> &hap_index, double **hdists)
}
int
-nuc_substitution_identity(map<string, int> &hap_index, double **hdists)
+nuc_substitution_identity(map<string, int> &hap_index, double **hdists)
{
vector<string> haplotypes;
map<string, int>::iterator it;
@@ -2462,7 +2462,7 @@ nuc_substitution_identity(map<string, int> &hap_index, double **hdists)
}
int
-nuc_substitution_identity_max(map<string, int> &hap_index, double **hdists)
+nuc_substitution_identity_max(map<string, int> &hap_index, double **hdists)
{
vector<string> haplotypes;
map<string, int>::iterator it;
@@ -2481,7 +2481,7 @@ nuc_substitution_identity_max(map<string, int> &hap_index, double **hdists)
return 0;
}
-int
+int
calculate_haplotype_divergence(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
@@ -2525,7 +2525,7 @@ calculate_haplotype_divergence(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pm
ord->order(hapstats, hapstats_key, it->second);
#pragma omp parallel
- {
+ {
CSLocus *loc;
LocSum **s;
Datum **d;
@@ -2640,14 +2640,14 @@ calculate_haplotype_divergence(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pm
<< "SSD(AP/WG)" << "\t"
<< "SSD(AG)" << "\t"
<< "SSD(TOTAL)" << "\t"
- << "MSD(WP)" << "\t"
- << "MSD(AP/WG)" << "\t"
+ << "MSD(WP)" << "\t"
+ << "MSD(AP/WG)" << "\t"
<< "MSD(AG)" << "\t"
<< "MSD(TOTAL)" << "\t"
<< "n" << "\t"
<< "n'" << "\t"
<< "n''" << "\t"
- << "Sigma2_a" << "\t"
+ << "Sigma2_a" << "\t"
<< "Sigma2_b" << "\t"
<< "Sigma2_c" << "\t"
<< "Sigma_Total" << "\t";
@@ -2685,14 +2685,14 @@ calculate_haplotype_divergence(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pm
<< hapstats[k]->comp[1] << "\t"
<< hapstats[k]->comp[2] << "\t"
<< hapstats[k]->comp[3] << "\t"
- << hapstats[k]->comp[4] << "\t"
- << hapstats[k]->comp[5] << "\t"
+ << hapstats[k]->comp[4] << "\t"
+ << hapstats[k]->comp[5] << "\t"
<< hapstats[k]->comp[6] << "\t"
<< hapstats[k]->comp[7] << "\t"
<< hapstats[k]->comp[8] << "\t"
<< hapstats[k]->comp[9] << "\t"
<< hapstats[k]->comp[10] << "\t"
- << hapstats[k]->comp[11] << "\t"
+ << hapstats[k]->comp[11] << "\t"
<< hapstats[k]->comp[12] << "\t"
<< hapstats[k]->comp[13] << "\t"
<< hapstats[k]->comp[14] << "\t";
@@ -2723,7 +2723,7 @@ calculate_haplotype_divergence(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pm
return 0;
}
-int
+int
calculate_haplotype_divergence_pairwise(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
@@ -2777,7 +2777,7 @@ calculate_haplotype_divergence_pairwise(map<int, CSLocus *> &catalog, PopMap<CSL
ord->order(hapstats, hapstats_key, it->second);
#pragma omp parallel
- {
+ {
CSLocus *loc;
LocSum **s;
Datum **d;
@@ -2876,14 +2876,14 @@ calculate_haplotype_divergence_pairwise(map<int, CSLocus *> &catalog, PopMap<CSL
<< "SSD(AP/WG)" << "\t"
<< "SSD(AG)" << "\t"
<< "SSD(TOTAL)" << "\t"
- << "MSD(WP)" << "\t"
- << "MSD(AP/WG)" << "\t"
+ << "MSD(WP)" << "\t"
+ << "MSD(AP/WG)" << "\t"
<< "MSD(AG)" << "\t"
<< "MSD(TOTAL)" << "\t"
<< "n" << "\t"
<< "n'" << "\t"
<< "n''" << "\t"
- << "Sigma2_a" << "\t"
+ << "Sigma2_a" << "\t"
<< "Sigma2_b" << "\t"
<< "Sigma2_c" << "\t"
<< "Sigma_Total" << "\t";
@@ -2916,14 +2916,14 @@ calculate_haplotype_divergence_pairwise(map<int, CSLocus *> &catalog, PopMap<CSL
<< hapstats[k]->comp[1] << "\t"
<< hapstats[k]->comp[2] << "\t"
<< hapstats[k]->comp[3] << "\t"
- << hapstats[k]->comp[4] << "\t"
- << hapstats[k]->comp[5] << "\t"
+ << hapstats[k]->comp[4] << "\t"
+ << hapstats[k]->comp[5] << "\t"
<< hapstats[k]->comp[6] << "\t"
<< hapstats[k]->comp[7] << "\t"
<< hapstats[k]->comp[8] << "\t"
<< hapstats[k]->comp[9] << "\t"
<< hapstats[k]->comp[10] << "\t"
- << hapstats[k]->comp[11] << "\t"
+ << hapstats[k]->comp[11] << "\t"
<< hapstats[k]->comp[12] << "\t"
<< hapstats[k]->comp[13] << "\t"
<< hapstats[k]->comp[14] << "\t";
@@ -2966,7 +2966,7 @@ fixed_locus(Datum **d, vector<int> &pop_ids)
for (size_t i = pop.first_sample; i <= pop.last_sample; i++) {
if (d[i] == NULL) continue;
- if (d[i]->obshap.size() > 2) {
+ if (d[i]->obshap.size() > 2) {
continue;
} else if (d[i]->obshap.size() == 1) {
@@ -2996,7 +2996,7 @@ fixed_locus(Datum **d, vector<int> &pop_ids)
//
// Check that more than one population has data for this locus.
//
- if (valid_pops <= 1)
+ if (valid_pops <= 1)
return true;
//
@@ -3030,7 +3030,7 @@ haplotype_diversity(int start, int end, Datum **d)
//
// If this haplotype is fixed, don't calculate any statistics.
//
- if (n == 0)
+ if (n == 0)
return NULL;
lstat = new LocStat;
@@ -3077,16 +3077,16 @@ haplotype_diversity(int start, int end, Datum **d)
//
for (uint i = 0; i < haplotypes.size(); i++) {
for (uint j = 0; j < haplotypes.size(); j++) {
- hapl_diversity +=
- hap_freq[haplotypes[i]] *
- hap_freq[haplotypes[j]] *
+ hapl_diversity +=
+ hap_freq[haplotypes[i]] *
+ hap_freq[haplotypes[j]] *
hdists[hap_index[haplotypes[i]]][hap_index[haplotypes[j]]];
}
}
hapl_diversity = (n / (n-1)) * hapl_diversity;
//
- // Calculate gene diversity.
+ // Calculate gene diversity.
//
for (uint i = 0; i < haplotypes.size(); i++) {
gene_diversity += hap_freq[haplotypes[i]] * hap_freq[haplotypes[i]];
@@ -3128,7 +3128,7 @@ haplotype_amova(Datum **d, LocSum **s, vector<int> &pop_ids)
for (size_t i = pop.first_sample; i <= pop.last_sample; i++) {
if (d[i] == NULL) continue;
- if (d[i]->obshap.size() > 2) {
+ if (d[i]->obshap.size() > 2) {
continue;
} else if (d[i]->obshap.size() == 1) {
@@ -3217,7 +3217,7 @@ haplotype_amova(Datum **d, LocSum **s, vector<int> &pop_ids)
nuc_substitution_dist(loc_hap_index, hdists);
//
- // Calculate the sum of squared distances in each subset: total, within populations, across populations
+ // Calculate the sum of squared distances in each subset: total, within populations, across populations
// and withing groups, and across groups.
//
double ssd_total = amova_ssd_total(loc_haplotypes, loc_hap_index, hdists);
@@ -3309,7 +3309,7 @@ haplotype_amova(Datum **d, LocSum **s, vector<int> &pop_ids)
sigma_a = (msd_ag - sigma_c - (n_1 * sigma_b)) / n_2;
// Arlequin seems to sum the variance components instead of independently calculating sigma_total: MSD(total) = SSD(total)/degrees.of.freedom
- double sigma_total = sigma_a + sigma_b + sigma_c; // msd_total;
+ double sigma_total = sigma_a + sigma_b + sigma_c; // msd_total;
double phi_st = 0.0;
double phi_ct = 0.0;
@@ -3419,10 +3419,10 @@ amova_ssd_total(vector<string> &loc_haplotypes, map<string, int> &loc_hap_index,
for (uint j = 0; j < loc_haplotypes.size(); j++) {
for (uint k = 0; k < loc_haplotypes.size(); k++) {
ssd_total += hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]];
- // cerr << j << "\t"
- // << k << "\t"
- // << loc_haplotypes[j] << "\t"
- // << loc_haplotypes[k] << "\t"
+ // cerr << j << "\t"
+ // << k << "\t"
+ // << loc_haplotypes[j] << "\t"
+ // << loc_haplotypes[k] << "\t"
// << hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]] << "\n";
}
}
@@ -3433,8 +3433,8 @@ amova_ssd_total(vector<string> &loc_haplotypes, map<string, int> &loc_hap_index,
}
double
-amova_ssd_wp(vector<int> &grps, map<int, vector<int> > &grp_members,
- map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
+amova_ssd_wp(vector<int> &grps, map<int, vector<int> > &grp_members,
+ map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
double **hdists)
{
//
@@ -3454,9 +3454,9 @@ amova_ssd_wp(vector<int> &grps, map<int, vector<int> > &grp_members,
ssd += hdists[loc_hap_index[pop_haplotypes[pop_id][j]]][loc_hap_index[pop_haplotypes[pop_id][k]]];
// cerr << pop_id << "\t"
// << j << "\t"
- // << k << "\t"
- // << loc_haplotypes[j] << "\t"
- // << loc_haplotypes[k] << "\t"
+ // << k << "\t"
+ // << loc_haplotypes[j] << "\t"
+ // << loc_haplotypes[k] << "\t"
// << hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]] << "\n";
}
}
@@ -3471,8 +3471,8 @@ amova_ssd_wp(vector<int> &grps, map<int, vector<int> > &grp_members,
}
double
-amova_ssd_ap_wg(vector<int> &grps, map<int, vector<int> > &grp_members,
- map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
+amova_ssd_ap_wg(vector<int> &grps, map<int, vector<int> > &grp_members,
+ map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
double **hdists_1, double **hdists_2)
{
//
@@ -3537,8 +3537,8 @@ amova_ssd_ap_wg(vector<int> &grps, map<int, vector<int> > &grp_members,
}
double
-amova_ssd_ag(vector<int> &grps, map<int, vector<int> > &grp_members,
- map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
+amova_ssd_ag(vector<int> &grps, map<int, vector<int> > &grp_members,
+ map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
double **hdists, double ssd_total)
{
//
@@ -3588,7 +3588,7 @@ double
haplotype_d_est(Datum **d, LocSum **s, vector<int> &pop_ids)
{
//
- // Calculate D_est, fixation index, as described by
+ // Calculate D_est, fixation index, as described by
// Bird, et al., 2011, Detecting and measuring genetic differentiation
// +-Equation 11
// and
@@ -3658,7 +3658,7 @@ haplotype_d_est(Datum **d, LocSum **s, vector<int> &pop_ids)
return d_est;
}
-int
+int
calculate_summary_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
@@ -3897,7 +3897,7 @@ calculate_summary_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, Pop
for (int i = 0; i < len; i++) {
- //
+ //
// If this site is fixed in all populations, DON'T output it. If it is variable,
// or fixed within populations but variable among, DO output it.
//
@@ -4197,11 +4197,11 @@ calculate_summary_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, Pop
return 0;
}
-int
+int
write_fst_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum, ofstream &log_fh)
{
//
- // We want to iterate over each pair of populations and calculate Fst at each
+ // We want to iterate over each pair of populations and calculate Fst at each
// nucleotide of each locus.
//
if (mpopi.pops().size() == 1)
@@ -4362,7 +4362,7 @@ write_fst_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLo
// If bootstrap resampling method is approximate, generate our single, empirical distribution.
//
map<int, vector<double> > approx_fst_dist;
- // if (bootstrap_fst && bootstrap_type == bs_approx)
+ // if (bootstrap_fst && bootstrap_type == bs_approx)
// bootstrap_fst_approximate_dist(fst_samples, allele_depth_samples, weights, snp_dist, approx_fst_dist);
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
@@ -4517,12 +4517,12 @@ correct_fst_bonferroni_win(vector<PopPair *> &pairs)
if (pairs[pos_l] == NULL) {
pos_l++;
} else {
- if (pairs[pos_l]->bp < limit_l)
+ if (pairs[pos_l]->bp < limit_l)
pos_l++;
else
break;
}
- }
+ }
while (pos_u < pairs.size()) {
if (pairs[pos_u] == NULL) {
pos_u++;
@@ -4547,8 +4547,8 @@ correct_fst_bonferroni_win(vector<PopPair *> &pairs)
return 0;
}
-int
-kernel_smoothed_popstats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum, int pop_id, ofstream &log_fh)
+int
+kernel_smoothed_popstats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum, int pop_id, ofstream &log_fh)
{
// int snp_dist[max_snp_dist] = {0};
// int sites_per_snp = 0;
@@ -4603,8 +4603,8 @@ kernel_smoothed_popstats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, Po
// // cerr << "Sites per snp: " << sites_per_snp << "\n";
-// bootstrap_popstats_approximate_dist(fis_samples, pi_samples, allele_depth_samples,
-// weights, snp_dist, sites_per_snp,
+// bootstrap_popstats_approximate_dist(fis_samples, pi_samples, allele_depth_samples,
+// weights, snp_dist, sites_per_snp,
// approx_fis_dist, approx_pi_dist);
// for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
@@ -4671,7 +4671,7 @@ bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
// #pragma omp parallel private(poss, pos, index_1, index_2, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
#pragma omp parallel private(poss, pos, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
- {
+ {
BSample *bs = new BSample[win_size];
//
@@ -4714,7 +4714,7 @@ bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
//
// Randomly select the positions and values for each SNP to populate the window
- //
+ //
for (int k = 0; k < i - 1; k++) {
pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
// index_1 = (int) (fis_samples.size() * (random() / (RAND_MAX + 1.0)));
@@ -4828,7 +4828,7 @@ bootstrap_fst_approximate_dist(vector<double> &fst_samples,
// #pragma omp parallel private(poss, pos, index_1, index_2, dist, sum, weighted_fst, final_weight)
#pragma omp parallel private(poss, pos, index_2, dist, sum, weighted_fst, final_weight)
- {
+ {
BSample *bs = new BSample[win_size];
//
@@ -4857,7 +4857,7 @@ bootstrap_fst_approximate_dist(vector<double> &fst_samples,
//
// Randomly select the positions and values for each SNP to populate the window
- //
+ //
for (int k = 0; k < i - 1; k++) {
pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
// index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
@@ -4925,7 +4925,7 @@ bootstrap_approximate_pval(int snp_cnt, double stat, map<int, vector<double> > &
pos = 1;
else if (up == dist.end())
pos = dist.size();
- else
+ else
pos = up - dist.begin() + 1;
double res = 1.0 - (pos / (double) dist.size());
@@ -4934,8 +4934,8 @@ bootstrap_approximate_pval(int snp_cnt, double stat, map<int, vector<double> > &
// for (uint n = 0; n < dist.size(); n++)
// cerr << " n: " << n << "; Fst: " << dist[n] << "\n";
- // cerr << "Comparing Fst value: " << stat
- // << " at position " << (up - dist.begin()) << " out of "
+ // cerr << "Comparing Fst value: " << stat
+ // << " at position " << (up - dist.begin()) << " out of "
// << dist.size() << " positions (converted position: " << pos << "); pvalue: " << res << ".\n";
return res;
@@ -4974,7 +4974,7 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, bool write_gt
for (int i = 0; i < pmap->sample_cnt(); i++) {
fh << mpopi.samples()[i].name;
- if (i < pmap->sample_cnt() - 1)
+ if (i < pmap->sample_cnt() - 1)
fh << "\t";
}
fh << "\n";
@@ -4986,7 +4986,7 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, bool write_gt
loc = it->second;
stringstream id;
- loc->annotation.length() > 0 ?
+ loc->annotation.length() > 0 ?
id << loc->id << "|" << loc->annotation : id << loc->id;
fh << id.str();
@@ -5043,24 +5043,39 @@ int load_marker_list(string path, set<int> &list) {
exit(1);
}
- int marker;
- char *p, *e;
-
- while (fh.good()) {
- fh.getline(line, id_len);
-
- if (strlen(line) == 0) continue;
+ size_t line_num = 0;
+ while (fh.getline(line, id_len)) {
+ ++line_num;
//
- // Skip commented lines.
+ // Skip blank & commented lines ; correct windows-style line ends.
//
- for (p = line; isspace(*p) && *p != '\0'; p++);
- if (*p == '#') continue;
-
- marker = (int) strtol(line, &e, 10);
+ size_t len = strlen(line);
+ if (len == 0) {
+ continue;
+ } else if (line[len-1] == '\r') {
+ line[len-1] = '\0';
+ --len;
+ if (len == 0)
+ continue;
+ }
+ char* p = line;
+ while (isspace(*p) && *p != '\0')
+ ++p;
+ if (*p == '#')
+ continue;
- if (*e == '\0')
+ //
+ // Parse the blacklist
+ //
+ char* e;
+ int marker = (int) strtol(line, &e, 10);
+ if (*e == '\0') {
list.insert(marker);
+ } else {
+ cerr << "Error: Unable to parse blacklist '" << path << "' at line " << line_num << ".\n";
+ throw exception();
+ }
}
fh.close();
@@ -5083,20 +5098,29 @@ int load_marker_column_list(string path, map<int, set<int> > &list) {
}
vector<string> parts;
- uint marker, col;
- char *p, *e;
+ uint col;
+ char *e;
uint line_num = 1;
- while (fh.good()) {
- fh.getline(line, id_len);
-
- if (strlen(line) == 0) continue;
+ while (fh.getline(line, id_len)) {
//
- // Skip commented lines.
+ // Skip blank & commented lines ; correct windows-style line ends.
//
- for (p = line; isspace(*p) && *p != '\0'; p++);
- if (*p == '#') continue;
+ size_t len = strlen(line);
+ if (len == 0) {
+ continue;
+ } else if (line[len-1] == '\r') {
+ line[len-1] = '\0';
+ --len;
+ if (len == 0)
+ continue;
+ }
+ char* p = line;
+ while (isspace(*p) && *p != '\0')
+ ++p;
+ if (*p == '#')
+ continue;
//
// Parse the whitelist, we expect:
@@ -5109,7 +5133,7 @@ int load_marker_column_list(string path, map<int, set<int> > &list) {
exit(1);
} else if (parts.size() == 2) {
- marker = (int) strtol(parts[0].c_str(), &e, 10);
+ int marker = (int) strtol(parts[0].c_str(), &e, 10);
if (*e != '\0') {
cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
exit(1);
@@ -5122,9 +5146,9 @@ int load_marker_column_list(string path, map<int, set<int> > &list) {
list[marker].insert(col);
} else {
- marker = (int) strtol(parts[0].c_str(), &e, 10);
+ int marker = (int) strtol(parts[0].c_str(), &e, 10);
if (*e != '\0') {
- cerr << "Unable to parse whitelist, '" << path << "' at line " << line << "\n";
+ cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
exit(1);
}
list.insert(make_pair(marker, std::set<int>()));
@@ -5210,14 +5234,14 @@ int parse_command_line(int argc, char* argv[]) {
{"debug_flags", required_argument, NULL, 1000},
{0, 0, 0, 0}
};
-
+
// getopt_long stores the option index here.
- int c = getopt_long(argc, argv, "ACDEFGHJKLNSTUVYZ123456dghjklnsva:b:c:e:f:i:m:o:p:q:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, NULL);
+ int c = getopt_long(argc, argv, "ACDEFGHJKLNSTUV:YZ123456dghjklnsva:b:c:e:f:i:m:o:p:q:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, NULL);
// Detect the end of the options.
if (c == -1)
break;
-
+
switch (c) {
case 'h':
help();
@@ -5541,7 +5565,7 @@ int parse_command_line(int argc, char* argv[]) {
cerr << "You must specify the restriction enzyme associated with this data set to merge overlaping cutsites.\n";
help();
}
-
+
return 0;
}
@@ -5553,12 +5577,12 @@ void help() {
cerr << "populations " << VERSION << "\n"
<< "Usage:\n"
<< "populations -P dir -b batch_id [-O dir] [-M popmap] (filters) [--fstats] [-k [--window_size=150000] [--bootstrap [-N 100]]] (output formats)\n"
- << "populations -V vcf -O dir [-M popmap] (filters) [--fstats] [-k [--window_size=150000] [--bootstrap [-N 100]]] (output formats)\n"
+ << "populations -V vcf -O dir [-M popmap] (filters) [--fstats] [-k [--sigma=150000] [--bootstrap [-N 100]]] (output formats)\n"
<< "\n"
<< " -P,--in_path: path to the directory containing the Stacks files.\n"
<< " -b,--batch_id: Batch ID to examine when exporting from the catalog (required by -P).\n"
<< " -V,--in_vcf: path to an input VCF file.\n"
- << " -O,--out_path: path to a directory where to white the output files. (Required by -V; otherwise defaults to value of -P.)\n"
+ << " -O,--out_path: path to a directory where to write the output files. (Required by -V; otherwise defaults to value of -P.)\n"
<< " -M,--popmap: path to a population map. (Format is 'SAMPLE1\tPOP1\\n...'.)\n"
<< " -t,--threads: number of threads to run in parallel sections of code.\n"
<< " -s,--sql_out: output a file to import results into an SQL database.\n"
diff --git a/src/process_radtags.cc b/src/process_radtags.cc
index afca39f..f94953f 100644
--- a/src/process_radtags.cc
+++ b/src/process_radtags.cc
@@ -20,10 +20,12 @@
//
// process_radtags -- clean raw reads using a sliding window approach;
-// split reads by barcode, check RAD cutsite is intact, correct barcodes/cutsites
+// split reads by barcode, check RAD cutsite is intact, correct barcodes/cutsites
// within one basepair, truncate reads on request.
//
+#include <iomanip>
+
#include "process_radtags.h"
//
@@ -100,33 +102,33 @@ int main (int argc, char* argv[]) {
// If input files are gzipped, output gziped files, unless the user chooses an output type.
//
if (out_file_type == FileT::unknown) {
- if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
- out_file_type = FileT::gzfastq;
- else
- out_file_type = FileT::fastq;
+ if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
+ out_file_type = FileT::gzfastq;
+ else
+ out_file_type = FileT::fastq;
}
if (paired)
- cerr << "Processing paired-end data.\n";
+ cerr << "Processing paired-end data.\n";
else
- cerr << "Processing single-end data.\n";
+ cerr << "Processing single-end data.\n";
cerr << "Using Phred+" << qual_offset << " encoding for quality scores.\n";
if (truncate_seq > 0)
- cerr << "Reads will be truncated to " << truncate_seq << "bp\n";
+ cerr << "Reads will be truncated to " << truncate_seq << "bp\n";
if (filter_illumina)
- cerr << "Discarding reads marked as 'failed' by Illumina's chastity/purity filters.\n";
+ cerr << "Discarding reads marked as 'failed' by Illumina's chastity/purity filters.\n";
if (filter_adapter) {
- cerr << "Filtering reads for adapter sequence:\n";
- if (adapter_1 != NULL) {
- cerr << " " << adapter_1 << "\n";
- init_adapter_seq(kmer_size, adapter_1, adp_1_len, adp_1_kmers);
- }
- if (adapter_2 != NULL) {
- cerr << " " << adapter_2 << "\n";
- init_adapter_seq(kmer_size, adapter_2, adp_2_len, adp_2_kmers);
- }
-
- cerr << " " << distance << " mismatches allowed to adapter sequence.\n";
+ cerr << "Filtering reads for adapter sequence:\n";
+ if (adapter_1 != NULL) {
+ cerr << " " << adapter_1 << "\n";
+ init_adapter_seq(kmer_size, adapter_1, adp_1_len, adp_1_kmers);
+ }
+ if (adapter_2 != NULL) {
+ cerr << " " << adapter_2 << "\n";
+ init_adapter_seq(kmer_size, adapter_2, adp_2_len, adp_2_kmers);
+ }
+
+ cerr << " " << distance << " mismatches allowed to adapter sequence.\n";
}
vector<pair<string, string> > files;
@@ -140,89 +142,89 @@ int main (int argc, char* argv[]) {
build_file_list(files);
load_barcodes(barcode_file, barcodes, se_bc, pe_bc, min_bc_size_1, max_bc_size_1, min_bc_size_2, max_bc_size_2);
if (recover && barcode_type != null_null) {
- if (barcode_type == index_null || barcode_type == inline_null)
- cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " mismatches.\n";
- else
- cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " / " << barcode_dist_2 << " mismatches.\n";
+ if (barcode_type == index_null || barcode_type == inline_null)
+ cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " mismatches.\n";
+ else
+ cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " / " << barcode_dist_2 << " mismatches.\n";
}
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
- open_files(files, barcodes, pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs, counters);
+ open_files(files, barcodes, pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs, counters);
else
- open_files(files, barcodes, pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs, counters);
+ open_files(files, barcodes, pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs, counters);
int result = 1;
for (uint i = 0; i < files.size(); i++) {
- cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
-
- counters[files[i].first]["total"] = 0;
- counters[files[i].first]["ill_filtered"] = 0;
- counters[files[i].first]["adapter"] = 0;
- counters[files[i].first]["low_quality"] = 0;
- counters[files[i].first]["noradtag"] = 0;
- counters[files[i].first]["ambiguous"] = 0;
- counters[files[i].first]["retained"] = 0;
- counters[files[i].first]["recovered"] = 0;
-
- if (paired) {
- if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
- result = process_paired_reads(files[i].first, files[i].second,
- se_bc, pe_bc,
- pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs,
- counters[files[i].first], barcode_log);
- else
- result = process_paired_reads(files[i].first, files[i].second,
- se_bc, pe_bc,
- pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs,
- counters[files[i].first], barcode_log);
- } else {
- if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
- result = process_reads(files[i].first,
- se_bc, pe_bc,
- pair_1_gzfhs,
- counters[files[i].first], barcode_log);
- else
- result = process_reads(files[i].first,
- se_bc, pe_bc,
- pair_1_fhs,
- counters[files[i].first], barcode_log);
- }
-
- cerr << " "
- << counters[files[i].first]["total"] << " total reads; ";
- if (filter_illumina)
- cerr << "-" << counters[files[i].first]["ill_filtered"] << " failed Illumina reads; ";
- cerr << "-" << counters[files[i].first]["ambiguous"] << " ambiguous barcodes; "
- << "-" << counters[files[i].first]["noradtag"] << " ambiguous RAD-Tags; "
- << "+" << counters[files[i].first]["recovered"] << " recovered; "
- << "-" << counters[files[i].first]["low_quality"] << " low quality reads; "
- << counters[files[i].first]["retained"] << " retained reads.\n";
- if (filter_adapter)
- cerr << " "
- << counters[files[i].first]["adapter"] << " reads with adapter sequence.\n";
-
- if (!result) {
- cerr << "Error processing reads.\n";
- break;
- }
+ cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
+
+ counters[files[i].first]["total"] = 0;
+ counters[files[i].first]["ill_filtered"] = 0;
+ counters[files[i].first]["adapter"] = 0;
+ counters[files[i].first]["low_quality"] = 0;
+ counters[files[i].first]["noradtag"] = 0;
+ counters[files[i].first]["ambiguous"] = 0;
+ counters[files[i].first]["retained"] = 0;
+ counters[files[i].first]["recovered"] = 0;
+
+ if (paired) {
+ if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
+ result = process_paired_reads(files[i].first, files[i].second,
+ se_bc, pe_bc,
+ pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs,
+ counters[files[i].first], barcode_log);
+ else
+ result = process_paired_reads(files[i].first, files[i].second,
+ se_bc, pe_bc,
+ pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs,
+ counters[files[i].first], barcode_log);
+ } else {
+ if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
+ result = process_reads(files[i].first,
+ se_bc, pe_bc,
+ pair_1_gzfhs,
+ counters[files[i].first], barcode_log);
+ else
+ result = process_reads(files[i].first,
+ se_bc, pe_bc,
+ pair_1_fhs,
+ counters[files[i].first], barcode_log);
+ }
+
+ cerr << " "
+ << counters[files[i].first]["total"] << " total reads; ";
+ if (filter_illumina)
+ cerr << "-" << counters[files[i].first]["ill_filtered"] << " failed Illumina reads; ";
+ cerr << "-" << counters[files[i].first]["ambiguous"] << " ambiguous barcodes; "
+ << "-" << counters[files[i].first]["noradtag"] << " ambiguous RAD-Tags; "
+ << "+" << counters[files[i].first]["recovered"] << " recovered; "
+ << "-" << counters[files[i].first]["low_quality"] << " low quality reads; "
+ << counters[files[i].first]["retained"] << " retained reads.\n";
+ if (filter_adapter)
+ cerr << " "
+ << counters[files[i].first]["adapter"] << " reads with adapter sequence.\n";
+
+ if (!result) {
+ cerr << "Error processing reads.\n";
+ break;
+ }
}
cerr << "Closing files, flushing buffers...\n";
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
- close_file_handles(pair_1_gzfhs);
- if (paired) {
- close_file_handles(rem_1_gzfhs);
- close_file_handles(rem_2_gzfhs);
- close_file_handles(pair_2_gzfhs);
- }
+ close_file_handles(pair_1_gzfhs);
+ if (paired) {
+ close_file_handles(rem_1_gzfhs);
+ close_file_handles(rem_2_gzfhs);
+ close_file_handles(pair_2_gzfhs);
+ }
} else {
- close_file_handles(pair_1_fhs);
- if (paired) {
- close_file_handles(rem_1_fhs);
- close_file_handles(rem_2_fhs);
- close_file_handles(pair_2_fhs);
- }
+ close_file_handles(pair_1_fhs);
+ if (paired) {
+ close_file_handles(rem_1_fhs);
+ close_file_handles(rem_2_fhs);
+ close_file_handles(pair_2_fhs);
+ }
}
print_results(argc, argv, barcodes, counters, barcode_log);
@@ -231,39 +233,39 @@ int main (int argc, char* argv[]) {
}
template <typename fhType>
-int
-process_paired_reads(string prefix_1,
- string prefix_2,
- set<string> &se_bc, set<string> &pe_bc,
- map<BarcodePair, fhType *> &pair_1_fhs,
- map<BarcodePair, fhType *> &pair_2_fhs,
- map<BarcodePair, fhType *> &rem_1_fhs,
- map<BarcodePair, fhType *> &rem_2_fhs,
- map<string, long> &counter,
- map<BarcodePair, map<string, long> > &barcode_log) {
+int
+process_paired_reads(string prefix_1,
+ string prefix_2,
+ set<string> &se_bc, set<string> &pe_bc,
+ map<BarcodePair, fhType *> &pair_1_fhs,
+ map<BarcodePair, fhType *> &pair_2_fhs,
+ map<BarcodePair, fhType *> &rem_1_fhs,
+ map<BarcodePair, fhType *> &rem_2_fhs,
+ map<string, long> &counter,
+ map<BarcodePair, map<string, long> > &barcode_log) {
Input *fh_1, *fh_2;
Read *r_1, *r_2;
ofstream *discard_fh_1, *discard_fh_2;
int return_val = 1;
-
+
string path_1 = in_path_1 + prefix_1;
string path_2 = in_path_2 + prefix_2;
if (interleaved)
- cerr << " Reading data from:\n " << path_1 << "\n";
+ cerr << " Reading data from:\n " << path_1 << "\n";
else
- cerr << " Reading data from:\n " << path_1 << " and\n " << path_2 << "\n";
+ cerr << " Reading data from:\n " << path_1 << " and\n " << path_2 << "\n";
if (in_file_type == FileT::fastq) {
fh_1 = new Fastq(path_1.c_str());
- fh_2 = interleaved ? fh_1 : new Fastq(path_2.c_str());
+ fh_2 = interleaved ? fh_1 : new Fastq(path_2.c_str());
} else if (in_file_type == FileT::gzfastq) {
fh_1 = new GzFastq(path_1.c_str());
- fh_2 = interleaved ? fh_1 : new GzFastq(path_2.c_str());
+ fh_2 = interleaved ? fh_1 : new GzFastq(path_2.c_str());
} else if (in_file_type == FileT::bam) {
fh_1 = new BamUnAln(path_1.c_str());
- fh_2 = fh_1;
+ fh_2 = fh_1;
} else if (in_file_type == FileT::bustard) {
fh_1 = new Bustard(path_1.c_str());
fh_2 = interleaved ? fh_1 : new Bustard(path_2.c_str());
@@ -273,33 +275,33 @@ process_paired_reads(string prefix_1,
// Open a file for recording discarded reads
//
if (discards) {
- path_1 = out_path + prefix_1 + ".discards";
- discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
+ path_1 = out_path + prefix_1 + ".discards";
+ discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
- if (discard_fh_1->fail()) {
- cerr << "Error opening discard output file '" << path_1 << "'\n";
- exit(1);
- }
+ if (discard_fh_1->fail()) {
+ cerr << "Error opening discard output file '" << path_1 << "'\n";
+ exit(1);
+ }
- path_2 = out_path + prefix_2 + ".discards";
- discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
+ path_2 = out_path + prefix_2 + ".discards";
+ discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
- if (discard_fh_1->fail()) {
- cerr << "Error opening discard output file '" << path_2 << "'\n";
- exit(1);
- }
+ if (discard_fh_1->fail()) {
+ cerr << "Error opening discard output file '" << path_2 << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s_1 = fh_1->next_seq();
Seq *s_2 = fh_2->next_seq();
if (s_1 == NULL || s_2 == NULL) {
- cerr << "Attempting to read first pair of input records, unable to allocate "
- << "Seq object (Was the correct input type specified?).\n";
- exit(1);
+ cerr << "Attempting to read first pair of input records, unable to allocate "
+ << "Seq object (Was the correct input type specified?).\n";
+ exit(1);
}
r_1 = new Read(strlen(s_1->seq), 1, min_bc_size_1, win_size);
@@ -310,133 +312,136 @@ process_paired_reads(string prefix_1,
// they will be discarded.
//
if (truncate_seq > 0)
- len_limit = truncate_seq;
+ len_limit = truncate_seq;
BarcodePair bc;
//
// If no barcodes were specified, set the barcode object to be the input file names.
//
if (max_bc_size_1 == 0)
- bc.set(prefix_1, prefix_2);
-
+ bc.set(prefix_1, prefix_2);
+
long i = 1;
+ cerr << " Processing RAD-Tags...";
do {
- if (i % 10000 == 0) cerr << " Processing RAD-Tag " << i << " \r";
-
- parse_input_record(s_1, r_1);
- parse_input_record(s_2, r_2);
- counter["total"] += 2;
-
- if (barcode_type != null_null &&
- barcode_type != inline_null &&
- barcode_type != index_null)
- bc.set(r_1->se_bc, r_2->pe_bc);
- else if (barcode_type != null_null)
- bc.set(r_1->se_bc);
-
- process_barcode(r_1, r_2, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
-
- //
- // Adjust the size of the read to accommodate truncating the sequence and variable
- // barcode lengths. With standard Illumina data we want to output constant length
- // reads even as the barcode size may change. Other technologies, like IonTorrent
- // need to be truncated uniformly.
- //
- if (truncate_seq > 0) {
- if (truncate_seq + r_1->inline_bc_len <= r_1->len)
- r_1->set_len(truncate_seq + r_1->inline_bc_len);
- if (truncate_seq + r_2->inline_bc_len <= r_2->len)
- r_2->set_len(truncate_seq + r_2->inline_bc_len);
- } else {
- if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
- r_1->set_len(r_1->len - (max_bc_size_1 - r_1->inline_bc_len));
- if (barcode_type == index_inline || barcode_type == inline_inline)
- r_2->set_len(r_2->len - (max_bc_size_2 - r_2->inline_bc_len));
- }
-
- if (r_1->retain)
- process_singlet(r_1, renz_1, false, barcode_log[bc], counter);
- if (r_2->retain)
- process_singlet(r_2, renz_2, true, barcode_log[bc], counter);
-
- int result_1 = 1;
- int result_2 = 1;
-
- if (r_1->retain && r_2->retain) {
- if (retain_header) {
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], s_1, r_1) :
- write_fasta(pair_1_fhs[bc], s_1, r_1);
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_2_fhs[bc], s_2, r_2) :
- write_fasta(pair_2_fhs[bc], s_2, r_2);
- } else {
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], r_1, overhang) :
- write_fasta(pair_1_fhs[bc], r_1, overhang);
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_2_fhs[bc], r_2, overhang) :
- write_fasta(pair_2_fhs[bc], r_2, overhang);
- }
- } else if (r_1->retain && !r_2->retain) {
- //
- // Write to the remainder file.
- //
- if (retain_header)
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_1_fhs[bc], s_1, r_1) :
- write_fasta(rem_1_fhs[bc], s_1, r_1);
- else
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_1_fhs[bc], r_1, overhang) :
- write_fasta(rem_1_fhs[bc], r_1, overhang);
-
- } else if (!r_1->retain && r_2->retain) {
- //
- // Write to the remainder file.
- //
- if (retain_header)
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_2_fhs[bc], s_2, r_2) :
- write_fasta(rem_2_fhs[bc], s_2, r_2);
- else
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_2_fhs[bc], r_2, overhang) :
- write_fasta(rem_2_fhs[bc], r_2, overhang);
- }
-
- if (!result_1 || !result_2) {
- cerr << "Error writing to output file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- if (discards && !r_1->retain)
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_1, s_1) :
- write_fasta(discard_fh_1, s_1);
- if (discards && !r_2->retain)
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_2, s_2) :
- write_fasta(discard_fh_2, s_2);
-
- if (!result_1 || !result_2) {
- cerr << "Error writing to discard file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- delete s_1;
- delete s_2;
-
- i++;
- } while ((s_1 = fh_1->next_seq()) != NULL &&
- (s_2 = fh_2->next_seq()) != NULL);
+ if (i % 1000000 == 0)
+ cerr << i/1000000 << "M...";
+
+ parse_input_record(s_1, r_1);
+ parse_input_record(s_2, r_2);
+ counter["total"] += 2;
+
+ if (barcode_type != null_null &&
+ barcode_type != inline_null &&
+ barcode_type != index_null)
+ bc.set(r_1->se_bc, r_2->pe_bc);
+ else if (barcode_type != null_null)
+ bc.set(r_1->se_bc);
+
+ process_barcode(r_1, r_2, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
+
+ //
+ // Adjust the size of the read to accommodate truncating the sequence and variable
+ // barcode lengths. With standard Illumina data we want to output constant length
+ // reads even as the barcode size may change. Other technologies, like IonTorrent
+ // need to be truncated uniformly.
+ //
+ if (truncate_seq > 0) {
+ if (truncate_seq + r_1->inline_bc_len <= r_1->len)
+ r_1->set_len(truncate_seq + r_1->inline_bc_len);
+ if (truncate_seq + r_2->inline_bc_len <= r_2->len)
+ r_2->set_len(truncate_seq + r_2->inline_bc_len);
+ } else {
+ if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
+ r_1->set_len(r_1->len - (max_bc_size_1 - r_1->inline_bc_len));
+ if (barcode_type == index_inline || barcode_type == inline_inline)
+ r_2->set_len(r_2->len - (max_bc_size_2 - r_2->inline_bc_len));
+ }
+
+ if (r_1->retain)
+ process_singlet(r_1, renz_1, false, barcode_log[bc], counter);
+ if (r_2->retain)
+ process_singlet(r_2, renz_2, true, barcode_log[bc], counter);
+
+ int result_1 = 1;
+ int result_2 = 1;
+
+ if (r_1->retain && r_2->retain) {
+ if (retain_header) {
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], s_1, r_1) :
+ write_fasta(pair_1_fhs[bc], s_1, r_1);
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_2_fhs[bc], s_2, r_2) :
+ write_fasta(pair_2_fhs[bc], s_2, r_2);
+ } else {
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], r_1, overhang) :
+ write_fasta(pair_1_fhs[bc], r_1, overhang);
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_2_fhs[bc], r_2, overhang) :
+ write_fasta(pair_2_fhs[bc], r_2, overhang);
+ }
+ } else if (r_1->retain && !r_2->retain) {
+ //
+ // Write to the remainder file.
+ //
+ if (retain_header)
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_1_fhs[bc], s_1, r_1) :
+ write_fasta(rem_1_fhs[bc], s_1, r_1);
+ else
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_1_fhs[bc], r_1, overhang) :
+ write_fasta(rem_1_fhs[bc], r_1, overhang);
+
+ } else if (!r_1->retain && r_2->retain) {
+ //
+ // Write to the remainder file.
+ //
+ if (retain_header)
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_2_fhs[bc], s_2, r_2) :
+ write_fasta(rem_2_fhs[bc], s_2, r_2);
+ else
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_2_fhs[bc], r_2, overhang) :
+ write_fasta(rem_2_fhs[bc], r_2, overhang);
+ }
+
+ if (!result_1 || !result_2) {
+ cerr << "Error writing to output file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ if (discards && !r_1->retain)
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(discard_fh_1, s_1) :
+ write_fasta(discard_fh_1, s_1);
+ if (discards && !r_2->retain)
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(discard_fh_2, s_2) :
+ write_fasta(discard_fh_2, s_2);
+
+ if (!result_1 || !result_2) {
+ cerr << "Error writing to discard file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ delete s_1;
+ delete s_2;
+
+ i++;
+ } while ((s_1 = fh_1->next_seq()) != NULL &&
+ (s_2 = fh_2->next_seq()) != NULL);
+ cerr << "\n";
if (discards) {
- delete discard_fh_1;
- delete discard_fh_2;
+ delete discard_fh_1;
+ delete discard_fh_2;
}
delete fh_1;
@@ -449,12 +454,12 @@ process_paired_reads(string prefix_1,
}
template <typename fhType>
-int
-process_reads(string prefix,
- set<string> &se_bc, set<string> &pe_bc,
- map<BarcodePair, fhType *> &pair_1_fhs,
- map<string, long> &counter,
- map<BarcodePair, map<string, long> > &barcode_log) {
+int
+process_reads(string prefix,
+ set<string> &se_bc, set<string> &pe_bc,
+ map<BarcodePair, fhType *> &pair_1_fhs,
+ map<string, long> &counter,
+ map<BarcodePair, map<string, long> > &barcode_log) {
Input *fh;
Read *r;
ofstream *discard_fh;
@@ -476,24 +481,24 @@ process_reads(string prefix,
// Open a file for recording discarded reads
//
if (discards) {
- path = out_path + prefix + ".discards";
- discard_fh = new ofstream(path.c_str(), ifstream::out);
+ path = out_path + prefix + ".discards";
+ discard_fh = new ofstream(path.c_str(), ifstream::out);
- if (discard_fh->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
- }
+ if (discard_fh->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s = fh->next_seq();
if (s == NULL) {
- cerr << "Attempting to read first input record, unable to allocate "
- << "Seq object (Was the correct input type specified?).\n";
- exit(1);
+ cerr << "Attempting to read first input record, unable to allocate "
+ << "Seq object (Was the correct input type specified?).\n";
+ exit(1);
}
//
@@ -501,7 +506,7 @@ process_reads(string prefix,
// they will be discarded.
//
if (truncate_seq > 0)
- len_limit = truncate_seq;
+ len_limit = truncate_seq;
r = new Read(strlen(s->seq), 1, min_bc_size_1, win_size);
@@ -511,77 +516,81 @@ process_reads(string prefix,
// that reads are written to an output file of the same name as the input file.
//
if (max_bc_size_1 == 0)
- bc.set(prefix);
+ bc.set(prefix);
//cerr << "Length: " << r->len << "; Window length: " << r->win_len << "; Stop position: " << r->stop_pos << "\n";
long i = 1;
+ cerr << " Processing RAD-Tags...";
do {
- if (i % 10000 == 0) cerr << " Processing RAD-Tag " << i << " \r";
- counter["total"]++;
-
- parse_input_record(s, r);
-
- if (barcode_type == inline_null ||
- barcode_type == index_null)
- bc.set(r->se_bc);
- else if (barcode_type == index_inline ||
- barcode_type == inline_index)
- bc.set(r->se_bc, r->pe_bc);
-
- process_barcode(r, NULL, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
-
- //
- // Adjust the size of the read to accommodate truncating the sequence and variable
- // barcode lengths. With standard Illumina data we want to output constant length
- // reads even as the barcode size may change. Other technologies, like IonTorrent
- // need to be truncated uniformly.
- //
- if (truncate_seq > 0) {
- if (truncate_seq + r->inline_bc_len <= r->len)
- r->set_len(truncate_seq + r->inline_bc_len);
- } else {
- if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
- r->set_len(r->len - (max_bc_size_1 - r->inline_bc_len));
- }
-
- if (r->retain)
- process_singlet(r, renz_1, false, barcode_log[bc], counter);
-
- int result = 1;
-
- if (r->retain) {
- if (retain_header)
- result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], s, r) :
- write_fasta(pair_1_fhs[bc], s, r);
- else
- result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], r, overhang) :
- write_fasta(pair_1_fhs[bc], r, overhang);
- }
-
- if (!result) {
- cerr << "Error writing to output file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- if (discards && !r->retain)
- result = out_file_type == FileT::fastq ?
- write_fastq(discard_fh, s) :
- write_fasta(discard_fh, s);
-
- if (!result) {
- cerr << "Error writing to discard file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- delete s;
-
- i++;
+ if (i % 1000000 == 0)
+ cerr << i/1000000 << "M...";
+
+ counter["total"]++;
+
+ parse_input_record(s, r);
+
+ if (barcode_type == inline_null ||
+ barcode_type == index_null)
+ bc.set(r->se_bc);
+ else if (barcode_type == index_inline ||
+ barcode_type == inline_index)
+ bc.set(r->se_bc, r->pe_bc);
+
+ process_barcode(r, NULL, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
+
+ //
+ // Adjust the size of the read to accommodate truncating the sequence and variable
+ // barcode lengths. With standard Illumina data we want to output constant length
+ // reads even as the barcode size may change. Other technologies, like IonTorrent
+ // need to be truncated uniformly.
+ //
+ if (truncate_seq > 0) {
+ if (truncate_seq + r->inline_bc_len <= r->len)
+ r->set_len(truncate_seq + r->inline_bc_len);
+ } else {
+ if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
+ r->set_len(r->len - (max_bc_size_1 - r->inline_bc_len));
+ }
+
+ if (r->retain)
+ process_singlet(r, renz_1, false, barcode_log[bc], counter);
+
+ int result = 1;
+
+ if (r->retain) {
+ if (retain_header)
+ result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], s, r) :
+ write_fasta(pair_1_fhs[bc], s, r);
+ else
+ result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], r, overhang) :
+ write_fasta(pair_1_fhs[bc], r, overhang);
+ }
+
+ if (!result) {
+ cerr << "Error writing to output file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ if (discards && !r->retain)
+ result = out_file_type == FileT::fastq ?
+ write_fastq(discard_fh, s) :
+ write_fasta(discard_fh, s);
+
+ if (!result) {
+ cerr << "Error writing to discard file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ delete s;
+
+ i++;
} while ((s = fh->next_seq()) != NULL);
+ cerr << "\n";
if (discards) delete discard_fh;
@@ -596,126 +605,126 @@ process_reads(string prefix,
}
inline
-int
-process_singlet(Read *href,
- string res_enz, bool paired_end,
- map<string, long> &bc_log, map<string, long> &counter)
+int
+process_singlet(Read *href,
+ string res_enz, bool paired_end,
+ map<string, long> &bc_log, map<string, long> &counter)
{
char *p;
if (filter_illumina && href->filter) {
- counter["ill_filtered"]++;
- href->retain = 0;
- return 0;
+ counter["ill_filtered"]++;
+ href->retain = 0;
+ return 0;
}
//
// If this read is already shorter than our length limit, discard it.
//
if (len_limit > 0 && (href->len - href->inline_bc_len) < len_limit) {
- counter["low_quality"]++;
- if (barcode_type != null_null)
- bc_log["low_qual"]++;
- href->retain = 0;
- return 0;
+ counter["low_quality"]++;
+ if (barcode_type != null_null)
+ bc_log["low_qual"]++;
+ href->retain = 0;
+ return 0;
}
//
// Is the RADTAG intact?
//
if (check_radtag && res_enz.length() > 0) {
- bool rad_cor = false;
-
- for (int i = 0; i < renz_cnt[res_enz]; i++) {
- p = href->seq + href->inline_bc_len;
-
- if (strncmp(p, renz[res_enz][i], renz_len[res_enz]) == 0)
- rad_cor = true;
- }
- if (rad_cor == false) {
- //
- // Try to correct the RAD-Tag.
- //
- if (!correct_radtag(href, res_enz, counter)) {
- if (barcode_type != null_null) bc_log["noradtag"]++;
- counter["noradtag"]++;
- href->retain = 0;
- return 0;
- }
- }
+ bool rad_cor = false;
+
+ for (int i = 0; i < renz_cnt[res_enz]; i++) {
+ p = href->seq + href->inline_bc_len;
+
+ if (strncmp(p, renz[res_enz][i], renz_len[res_enz]) == 0)
+ rad_cor = true;
+ }
+ if (rad_cor == false) {
+ //
+ // Try to correct the RAD-Tag.
+ //
+ if (!correct_radtag(href, res_enz, counter)) {
+ if (barcode_type != null_null) bc_log["noradtag"]++;
+ counter["noradtag"]++;
+ href->retain = 0;
+ return 0;
+ }
+ }
}
//
// Drop this sequence if it has any uncalled nucleotides.
//
if (clean) {
- for (char *p = href->seq + href->inline_bc_len; *p != '\0'; p++)
- if (*p == '.' || *p == 'N') {
- counter["low_quality"]++;
- href->retain = 0;
- if (barcode_type != null_null)
- bc_log["low_qual"]++;
- return 0;
- }
+ for (char *p = href->seq + href->inline_bc_len; *p != '\0'; p++)
+ if (*p == '.' || *p == 'N') {
+ counter["low_quality"]++;
+ href->retain = 0;
+ if (barcode_type != null_null)
+ bc_log["low_qual"]++;
+ return 0;
+ }
}
//
// Drop this sequence if it has low quality scores.
//
- if (quality &&
- check_quality_scores(href, qual_offset, score_limit, len_limit, href->inline_bc_len) <= 0) {
- counter["low_quality"]++;
- if (barcode_type != null_null)
- bc_log["low_qual"]++;
- href->retain = 0;
- return 0;
+ if (quality &&
+ check_quality_scores(href, qual_offset, score_limit, len_limit, href->inline_bc_len) <= 0) {
+ counter["low_quality"]++;
+ if (barcode_type != null_null)
+ bc_log["low_qual"]++;
+ href->retain = 0;
+ return 0;
}
//
// Drop this sequence if it contains adapter sequence.
//
if (filter_adapter) {
- int res = 1;
- if (paired_end == true && adp_2_len > 0)
- res = filter_adapter_seq(href, adapter_2, adp_2_len, adp_2_kmers,
- kmer_size, distance, len_limit);
- if (paired_end == false && adp_1_len > 0)
- res = filter_adapter_seq(href, adapter_1, adp_1_len, adp_1_kmers,
- kmer_size, distance, len_limit);
- if (res <= 0) {
- // cerr << "Sequence " << href->seq << " contains adapter.\n";
- counter["adapter"]++;
- href->retain = 0;
- return 0;
- }
+ int res = 1;
+ if (paired_end == true && adp_2_len > 0)
+ res = filter_adapter_seq(href, adapter_2, adp_2_len, adp_2_kmers,
+ kmer_size, distance, len_limit);
+ if (paired_end == false && adp_1_len > 0)
+ res = filter_adapter_seq(href, adapter_1, adp_1_len, adp_1_kmers,
+ kmer_size, distance, len_limit);
+ if (res <= 0) {
+ // cerr << "Sequence " << href->seq << " contains adapter.\n";
+ counter["adapter"]++;
+ href->retain = 0;
+ return 0;
+ }
}
if (barcode_type != null_null)
- bc_log["retained"]++;
+ bc_log["retained"]++;
counter["retained"]++;
return 0;
}
-int
-correct_radtag(Read *href, string res_enz, map<string, long> &counter)
+int
+correct_radtag(Read *href, string res_enz, map<string, long> &counter)
{
if (recover == false)
- return 0;
+ return 0;
//
// If the RAD-Tag sequence is off by no more than a single nucleotide, correct it.
//
int d = 0;
for (int i = 0; i < renz_cnt[res_enz]; i++) {
-
+
d = dist(renz[res_enz][i], href->seq + href->inline_bc_len);
if (d <= 1) {
//
// Correct the read.
//
- strncpy(href->seq + href->inline_bc_len, renz[res_enz][i], renz_len[res_enz]);
+ strncpy(href->seq + href->inline_bc_len, renz[res_enz][i], renz_len[res_enz]);
counter["recovered"]++;
return 1;
@@ -727,20 +736,20 @@ correct_radtag(Read *href, string res_enz, map<string, long> &counter)
int dist(const char *res_enz, char *seq) {
const char *p; char *q;
-
+
int dist = 0;
for (p = res_enz, q = seq; *p != '\0'; p++, q++)
- if (*p != *q) dist++;
+ if (*p != *q) dist++;
return dist;
}
-int
-print_results(int argc, char **argv,
- vector<BarcodePair> &barcodes,
- map<string, map<string, long> > &counters,
- map<BarcodePair, map<string, long> > &barcode_log)
+int
+print_results(int argc, char **argv,
+ vector<BarcodePair> &barcodes,
+ map<string, map<string, long> > &counters,
+ map<BarcodePair, map<string, long> > &barcode_log)
{
map<string, map<string, long> >::iterator it;
@@ -748,8 +757,8 @@ print_results(int argc, char **argv,
ofstream log(log_path.c_str());
if (log.fail()) {
- cerr << "Unable to open log file '" << log_path << "'\n";
- return 0;
+ cerr << "Unable to open log file '" << log_path << "'\n";
+ return 0;
}
cerr << "Outputing details to log: '" << log_path << "'\n\n";
@@ -757,27 +766,27 @@ print_results(int argc, char **argv,
init_log(log, argc, argv);
log << "File\t"
- << "Retained Reads\t";
+ << "Retained Reads\t";
if (filter_illumina)
- log << "Illumina Filtered\t";
+ log << "Illumina Filtered\t";
if (filter_adapter)
- log << "Adapter Seq" << "\t";
+ log << "Adapter Seq" << "\t";
log << "Low Quality\t"
- << "Ambiguous Barcodes\t"
- << "Ambiguous RAD-Tag\t"
- << "Total\n";
+ << "Ambiguous Barcodes\t"
+ << "Ambiguous RAD-Tag\t"
+ << "Total\n";
for (it = counters.begin(); it != counters.end(); it++) {
- log << it->first << "\t"
- << it->second["retained"] << "\t";
- if (filter_illumina)
- log << it->second["ill_filtered"] << "\t";
- if (filter_adapter)
- log << it->second["adapter"] << "\t";
- log << it->second["low_quality"] << "\t"
- << it->second["ambiguous"] << "\t"
- << it->second["noradtag"] << "\t"
- << it->second["total"] << "\n";
+ log << it->first << "\t"
+ << it->second["retained"] << "\t";
+ if (filter_illumina)
+ log << it->second["ill_filtered"] << "\t";
+ if (filter_adapter)
+ log << it->second["adapter"] << "\t";
+ log << it->second["low_quality"] << "\t"
+ << it->second["ambiguous"] << "\t"
+ << it->second["noradtag"] << "\t"
+ << it->second["total"] << "\n";
}
map<string, long> c;
@@ -792,35 +801,45 @@ print_results(int argc, char **argv,
// Total up the individual counters
//
for (it = counters.begin(); it != counters.end(); it++) {
- c["total"] += it->second["total"];
- c["ill_filtered"] += it->second["ill_filtered"];
- c["low_quality"] += it->second["low_quality"];
- c["adapter"] += it->second["adapter"];
- c["ambiguous"] += it->second["ambiguous"];
- c["noradtag"] += it->second["noradtag"];
- c["retained"] += it->second["retained"];
+ c["total"] += it->second["total"];
+ c["ill_filtered"] += it->second["ill_filtered"];
+ c["low_quality"] += it->second["low_quality"];
+ c["adapter"] += it->second["adapter"];
+ c["ambiguous"] += it->second["ambiguous"];
+ c["noradtag"] += it->second["noradtag"];
+ c["retained"] += it->second["retained"];
}
- cerr << c["total"] << " total sequences;\n";
+ std::ostream cerr_bis (cerr.rdbuf());
+ cerr_bis << std::fixed << std::setprecision(1);
+
+ auto print_nreads = [&cerr_bis,&c] (long n, const string& legend) {
+ size_t nspaces = std::to_string(c["total"]).length() - std::to_string(n).length();
+ cerr_bis << string(nspaces, ' ')
+ << n << " " << legend
+ << " (" << (double) n / c["total"] * 100 << "%)\n";
+ };
+
+ cerr_bis << c["total"] << " total sequences\n";
if (filter_illumina)
- cerr << " " << c["ill_filtered"] << " failed Illumina filtered reads;\n";
+ print_nreads(c["ill_filtered"], "failed Illumina filtered reads");
if (filter_adapter)
- cerr << " " << c["adapter"] << " reads contained adapter sequence;\n";
- cerr << " " << c["ambiguous"] << " ambiguous barcode drops;\n"
- << " " << c["low_quality"] << " low quality read drops;\n"
- << " " << c["noradtag"] << " ambiguous RAD-Tag drops;\n"
- << c["retained"] << " retained reads.\n";
-
- log << "\n"
- << "Total Sequences\t" << c["total"] << "\n";
+ print_nreads(c["adapter"], "reads contained adapter sequence");
+ print_nreads(c["ambiguous"], "ambiguous barcode drops");
+ print_nreads(c["low_quality"], "low quality read drops");
+ print_nreads(c["noradtag"], "ambiguous RAD-Tag drops");
+ print_nreads(c["retained"], "retained reads");
+
+ log << "\n"
+ << "Total Sequences\t" << c["total"] << "\n";
if (filter_illumina)
- log << "Failed Illumina filtered reads\t" << c["ill_filtered"] << "\n";
+ log << "Failed Illumina filtered reads\t" << c["ill_filtered"] << "\n";
if (filter_adapter)
- log << "Reads containing adapter sequence\t" << c["adapter"] << "\n";
+ log << "Reads containing adapter sequence\t" << c["adapter"] << "\n";
log << "Ambiguous Barcodes\t" << c["ambiguous"] << "\n"
- << "Low Quality\t" << c["low_quality"] << "\n"
- << "Ambiguous RAD-Tag\t" << c["noradtag"] << "\n"
- << "Retained Reads\t" << c["retained"] << "\n";
+ << "Low Quality\t" << c["low_quality"] << "\n"
+ << "Ambiguous RAD-Tag\t" << c["noradtag"] << "\n"
+ << "Retained Reads\t" << c["retained"] << "\n";
if (max_bc_size_1 == 0) return 0;
@@ -829,45 +848,45 @@ print_results(int argc, char **argv,
//
bool bc_names = false;
for (uint i = 0; i < barcodes.size(); i++)
- if (barcodes[i].name_exists()) {
- bc_names = true;
- break;
- }
+ if (barcodes[i].name_exists()) {
+ bc_names = true;
+ break;
+ }
//
// Print out barcode information.
//
log << "\n"
- << "Barcode\t";
+ << "Barcode\t";
if (bc_names)
- log << "Filename\t";
+ log << "Filename\t";
log << "Total\t"
- << "No RadTag\t"
- << "Low Quality\t"
- << "Retained\n";
+ << "No RadTag\t"
+ << "Low Quality\t"
+ << "Retained\n";
set<BarcodePair> barcode_list;
for (uint i = 0; i < barcodes.size(); i++) {
- barcode_list.insert(barcodes[i]);
+ barcode_list.insert(barcodes[i]);
- log << barcodes[i] << "\t";
- if (bc_names)
- log << barcodes[i].name << "\t";
+ log << barcodes[i] << "\t";
+ if (bc_names)
+ log << barcodes[i].name << "\t";
if (barcode_log.count(barcodes[i]) == 0)
log << "0\t" << "0\t" << "0\t" << "0\n";
else
- log << barcode_log[barcodes[i]]["total"] << "\t"
- << barcode_log[barcodes[i]]["noradtag"] << "\t"
- << barcode_log[barcodes[i]]["low_qual"] << "\t"
+ log << barcode_log[barcodes[i]]["total"] << "\t"
+ << barcode_log[barcodes[i]]["noradtag"] << "\t"
+ << barcode_log[barcodes[i]]["low_qual"] << "\t"
<< barcode_log[barcodes[i]]["retained"] << "\n";
}
log << "\n"
- << "Sequences not recorded\n"
- << "Barcode\t"
- << "Total\n";
+ << "Sequences not recorded\n"
+ << "Barcode\t"
+ << "Total\n";
//
// Sort unused barcodes by number of occurances.
@@ -875,15 +894,15 @@ print_results(int argc, char **argv,
map<BarcodePair, map<string, long> >::iterator bit;
vector<pair<BarcodePair, int> > bcs;
for (bit = barcode_log.begin(); bit != barcode_log.end(); bit++)
- bcs.push_back(make_pair(bit->first, bit->second["total"]));
+ bcs.push_back(make_pair(bit->first, bit->second["total"]));
sort(bcs.begin(), bcs.end(), compare_barcodes);
for (uint i = 0; i < bcs.size(); i++) {
- if (barcode_list.count(bcs[i].first)) continue;
- if (bcs[i].second == 0) continue;
+ if (barcode_list.count(bcs[i].first)) continue;
+ if (bcs[i].second == 0) continue;
- log << bcs[i].first << "\t"
- << bcs[i].second << "\n";
+ log << bcs[i].first << "\t"
+ << bcs[i].second << "\n";
}
log.close();
@@ -898,310 +917,310 @@ int compare_barcodes(pair<BarcodePair, int> a, pair<BarcodePair, int> b) {
int parse_command_line(int argc, char* argv[]) {
FileT ftype;
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
{"quality", no_argument, NULL, 'q'},
{"clean", no_argument, NULL, 'c'},
{"recover", no_argument, NULL, 'r'},
- {"discards", no_argument, NULL, 'D'},
- {"paired", no_argument, NULL, 'P'},
- {"interleaved", no_argument, NULL, 'I'},
- {"merge", no_argument, NULL, 'm'},
- {"disable_rad_check", no_argument, NULL, 'R'},
- {"filter_illumina", no_argument, NULL, 'F'},
- {"retain_header", no_argument, NULL, 'H'},
- {"null_index", no_argument, NULL, 'U'},
- {"index_null", no_argument, NULL, 'u'},
- {"inline_null", no_argument, NULL, 'V'},
- {"index_index", no_argument, NULL, 'W'},
- {"inline_inline", no_argument, NULL, 'x'},
- {"index_inline", no_argument, NULL, 'Y'},
- {"inline_index", no_argument, NULL, 'Z'},
- {"barcode_dist_1", required_argument, NULL, 'B'},
- {"barcode_dist_2", required_argument, NULL, 'C'},
- {"infile_type", required_argument, NULL, 'i'},
- {"outfile_type", required_argument, NULL, 'y'},
- {"file", required_argument, NULL, 'f'},
- {"file_p1", required_argument, NULL, '1'},
- {"file_p2", required_argument, NULL, '2'},
- {"path", required_argument, NULL, 'p'},
- {"outpath", required_argument, NULL, 'o'},
- {"truncate", required_argument, NULL, 't'},
- {"renz_1", required_argument, NULL, 'e'},
- {"renz_2", required_argument, NULL, 'z'},
- {"barcodes", required_argument, NULL, 'b'},
- {"window_size", required_argument, NULL, 'w'},
- {"score_limit", required_argument, NULL, 's'},
- {"encoding", required_argument, NULL, 'E'},
- {"len_limit", required_argument, NULL, 'L'},
- {"adapter_1", required_argument, NULL, 'A'},
- {"adapter_2", required_argument, NULL, 'G'},
- {"adapter_mm", required_argument, NULL, 'T'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "HuUVWxYZhvRFIcqrDPmB:C:i:y:f:o:t:e:z:b:1:2:p:s:w:E:L:A:G:T:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'i':
+ {"discards", no_argument, NULL, 'D'},
+ {"paired", no_argument, NULL, 'P'},
+ {"interleaved", no_argument, NULL, 'I'},
+ {"merge", no_argument, NULL, 'm'},
+ {"disable_rad_check", no_argument, NULL, 'R'},
+ {"filter_illumina", no_argument, NULL, 'F'},
+ {"retain_header", no_argument, NULL, 'H'},
+ {"null_index", no_argument, NULL, 'U'},
+ {"index_null", no_argument, NULL, 'u'},
+ {"inline_null", no_argument, NULL, 'V'},
+ {"index_index", no_argument, NULL, 'W'},
+ {"inline_inline", no_argument, NULL, 'x'},
+ {"index_inline", no_argument, NULL, 'Y'},
+ {"inline_index", no_argument, NULL, 'Z'},
+ {"barcode_dist_1", required_argument, NULL, 'B'},
+ {"barcode_dist_2", required_argument, NULL, 'C'},
+ {"infile_type", required_argument, NULL, 'i'},
+ {"outfile_type", required_argument, NULL, 'y'},
+ {"file", required_argument, NULL, 'f'},
+ {"file_p1", required_argument, NULL, '1'},
+ {"file_p2", required_argument, NULL, '2'},
+ {"path", required_argument, NULL, 'p'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"truncate", required_argument, NULL, 't'},
+ {"renz_1", required_argument, NULL, 'e'},
+ {"renz_2", required_argument, NULL, 'z'},
+ {"barcodes", required_argument, NULL, 'b'},
+ {"window_size", required_argument, NULL, 'w'},
+ {"score_limit", required_argument, NULL, 's'},
+ {"encoding", required_argument, NULL, 'E'},
+ {"len_limit", required_argument, NULL, 'L'},
+ {"adapter_1", required_argument, NULL, 'A'},
+ {"adapter_2", required_argument, NULL, 'G'},
+ {"adapter_mm", required_argument, NULL, 'T'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "HuUVWxYZhvRFIcqrDPmB:C:i:y:f:o:t:e:z:b:1:2:p:s:w:E:L:A:G:T:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'i':
if (strcasecmp(optarg, "bustard") == 0)
in_file_type = FileT::bustard;
- else if (strcasecmp(optarg, "bam") == 0)
+ else if (strcasecmp(optarg, "bam") == 0)
in_file_type = FileT::bam;
- else if (strcasecmp(optarg, "gzfastq") == 0)
+ else if (strcasecmp(optarg, "gzfastq") == 0)
in_file_type = FileT::gzfastq;
- else
+ else
in_file_type = FileT::fastq;
- break;
- case 'y':
- if (strcasecmp(optarg, "fastq") == 0)
+ break;
+ case 'y':
+ if (strcasecmp(optarg, "fastq") == 0)
out_file_type = FileT::fastq;
- else if (strcasecmp(optarg, "gzfastq") == 0)
+ else if (strcasecmp(optarg, "gzfastq") == 0)
out_file_type = FileT::gzfastq;
- else if (strcasecmp(optarg, "fasta") == 0)
+ else if (strcasecmp(optarg, "fasta") == 0)
out_file_type = FileT::fasta;
- else if (strcasecmp(optarg, "gzfasta") == 0)
+ else if (strcasecmp(optarg, "gzfasta") == 0)
out_file_type = FileT::gzfasta;
- break;
- case 'E':
+ break;
+ case 'E':
if (strcasecmp(optarg, "phred64") == 0)
qual_offset = 64;
- else if (strcasecmp(optarg, "phred33") == 0)
- qual_offset = 33;
- else {
- cerr << "Unknown quality score encoding, '" << optarg << "'\n";
- help();
- }
- break;
- case 'f':
- in_file = optarg;
- ftype = FileT::fastq;
- break;
- case 'p':
- in_path_1 = optarg;
- in_path_2 = in_path_1;
- ftype = FileT::fastq;
- break;
- case '1':
- paired = true;
- in_file_p1 = optarg;
- ftype = FileT::fastq;
- break;
- case '2':
- paired = true;
- in_file_p2 = optarg;
- ftype = FileT::fastq;
- break;
- case 'P':
- paired = true;
- break;
- case 'I':
- interleaved = true;
- break;
- case 'B':
- barcode_dist_1 = is_integer(optarg);
- break;
- case 'C':
- barcode_dist_2 = is_integer(optarg);
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'q':
- quality = true;
- break;
- case 'c':
- clean = true;
- break;
- case 'r':
- recover = true;
- break;
- case 't':
- truncate_seq = is_integer(optarg);
- break;
- case 'e':
- renz_1 = optarg;
- break;
- case 'z':
- renz_2 = optarg;
- break;
- case 'b':
- barcode_file = optarg;
- if (barcode_type == null_null)
- barcode_type = inline_null;
- break;
- case 'm':
- merge = true;
- break;
- case 'D':
- discards = true;
- break;
- case 'R':
- check_radtag = false;
- break;
- case 'F':
- filter_illumina = true;
- break;
- case 'U':
- barcode_type = null_index;
- break;
- case 'u':
- barcode_type = index_null;
- break;
- case 'V':
- barcode_type = inline_null;
- break;
- case 'W':
- barcode_type = index_index;
- break;
- case 'x':
- barcode_type = inline_inline;
- break;
- case 'Y':
- barcode_type = index_inline;
- break;
- case 'Z':
- barcode_type = inline_index;
- break;
- case 'A':
- adapter_1 = new char[strlen(optarg) + 1];
- strcpy(adapter_1, optarg);
- filter_adapter = true;
- break;
- case 'G':
- adapter_2 = new char[strlen(optarg) + 1];
- strcpy(adapter_2, optarg);
- filter_adapter = true;
- break;
- case 'T':
- distance = is_integer(optarg);
- break;
- case 'H':
- retain_header = true;
- break;
- case 'L':
- len_limit = is_integer(optarg);
- break;
- case 'w':
- win_size = is_double(optarg);
- break;
- case 's':
- score_limit = is_integer(optarg);
- break;
+ else if (strcasecmp(optarg, "phred33") == 0)
+ qual_offset = 33;
+ else {
+ cerr << "Unknown quality score encoding, '" << optarg << "'\n";
+ help();
+ }
+ break;
+ case 'f':
+ in_file = optarg;
+ ftype = FileT::fastq;
+ break;
+ case 'p':
+ in_path_1 = optarg;
+ in_path_2 = in_path_1;
+ ftype = FileT::fastq;
+ break;
+ case '1':
+ paired = true;
+ in_file_p1 = optarg;
+ ftype = FileT::fastq;
+ break;
+ case '2':
+ paired = true;
+ in_file_p2 = optarg;
+ ftype = FileT::fastq;
+ break;
+ case 'P':
+ paired = true;
+ break;
+ case 'I':
+ interleaved = true;
+ break;
+ case 'B':
+ barcode_dist_1 = is_integer(optarg);
+ break;
+ case 'C':
+ barcode_dist_2 = is_integer(optarg);
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'q':
+ quality = true;
+ break;
+ case 'c':
+ clean = true;
+ break;
+ case 'r':
+ recover = true;
+ break;
+ case 't':
+ truncate_seq = is_integer(optarg);
+ break;
+ case 'e':
+ renz_1 = optarg;
+ break;
+ case 'z':
+ renz_2 = optarg;
+ break;
+ case 'b':
+ barcode_file = optarg;
+ if (barcode_type == null_null)
+ barcode_type = inline_null;
+ break;
+ case 'm':
+ merge = true;
+ break;
+ case 'D':
+ discards = true;
+ break;
+ case 'R':
+ check_radtag = false;
+ break;
+ case 'F':
+ filter_illumina = true;
+ break;
+ case 'U':
+ barcode_type = null_index;
+ break;
+ case 'u':
+ barcode_type = index_null;
+ break;
+ case 'V':
+ barcode_type = inline_null;
+ break;
+ case 'W':
+ barcode_type = index_index;
+ break;
+ case 'x':
+ barcode_type = inline_inline;
+ break;
+ case 'Y':
+ barcode_type = index_inline;
+ break;
+ case 'Z':
+ barcode_type = inline_index;
+ break;
+ case 'A':
+ adapter_1 = new char[strlen(optarg) + 1];
+ strcpy(adapter_1, optarg);
+ filter_adapter = true;
+ break;
+ case 'G':
+ adapter_2 = new char[strlen(optarg) + 1];
+ strcpy(adapter_2, optarg);
+ filter_adapter = true;
+ break;
+ case 'T':
+ distance = is_integer(optarg);
+ break;
+ case 'H':
+ retain_header = true;
+ break;
+ case 'L':
+ len_limit = is_integer(optarg);
+ break;
+ case 'w':
+ win_size = is_double(optarg);
+ break;
+ case 's':
+ score_limit = is_integer(optarg);
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
-
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (in_file.length() == 0 && in_path_1.length() == 0 && in_file_p1.length() == 0) {
- cerr << "You must specify an input file of a directory path to a set of input files.\n";
- help();
+ cerr << "You must specify an input file of a directory path to a set of input files.\n";
+ help();
}
if (in_file.length() > 0 && in_path_1.length() > 0) {
- cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
- help();
+ cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
+ help();
}
if (in_file.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
- cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
- help();
+ cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
+ help();
}
if (in_path_1.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
- cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
- help();
+ cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
+ help();
}
- if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
- in_path_1 += "/";
+ if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
+ in_path_1 += "/";
- if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
- in_path_2 += "/";
+ if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
+ in_path_2 += "/";
- if (out_path.length() == 0)
- out_path = ".";
+ if (out_path.length() == 0)
+ out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
if (in_file_type == FileT::unknown)
- in_file_type = ftype;
+ in_file_type = ftype;
if (in_file_type == FileT::bam && paired == true && interleaved == false) {
- cerr << "You may only specify a BAM input file for paired-end data if the read pairs are interleaved.\n";
- help();
+ cerr << "You may only specify a BAM input file for paired-end data if the read pairs are interleaved.\n";
+ help();
}
if (in_file_type == FileT::bam && (barcode_type != inline_null && barcode_type != inline_inline && barcode_type != null_null)) {
- cerr << "For BAM input files only inline or unbarcoded data can be processed.\n";
- help();
+ cerr << "For BAM input files only inline or unbarcoded data can be processed.\n";
+ help();
}
if (barcode_file.length() == 0 && barcode_type != null_null) {
- cerr << "You specified a barcode type without providing a file containing barcodes.\n";
- help();
+ cerr << "You specified a barcode type without providing a file containing barcodes.\n";
+ help();
}
if (barcode_file.length() == 0)
- cerr << "No barcodes specified, files will not be demultiplexed.\n";
+ cerr << "No barcodes specified, files will not be demultiplexed.\n";
if (barcode_file.length() > 0 && merge) {
- cerr << "You may specify a set of barcodes, or that all files should be merged, not both.\n";
- help();
+ cerr << "You may specify a set of barcodes, or that all files should be merged, not both.\n";
+ help();
}
if (check_radtag && renz_1.length() == 0) {
- cerr << "You must specify the restriction enzyme used.\n";
- help();
+ cerr << "You must specify the restriction enzyme used.\n";
+ help();
}
if (check_radtag && renz.count(renz_1) == 0) {
- cerr << "Unrecognized restriction enzyme specified: '" << renz_1.c_str() << "'.\n";
- help();
+ cerr << "Unrecognized restriction enzyme specified: '" << renz_1.c_str() << "'.\n";
+ help();
}
if (check_radtag && renz_2.length() > 0 && renz.count(renz_2) == 0) {
- cerr << "Unrecognized restriction enzyme specified: '" << renz_2.c_str() << "'.\n";
- help();
+ cerr << "Unrecognized restriction enzyme specified: '" << renz_2.c_str() << "'.\n";
+ help();
}
if (score_limit > 40) {
- cerr << "Score limit must be between 0 and 40.\n";
- help();
+ cerr << "Score limit must be between 0 and 40.\n";
+ help();
}
if (win_size < 0 || win_size >= 1) {
- cerr << "Window size is a fraction between 0 and 1.\n";
- help();
+ cerr << "Window size is a fraction between 0 and 1.\n";
+ help();
}
if (recover && barcode_type != null_null) {
- if (barcode_type != index_null && barcode_type != inline_null && barcode_dist_2 < 0)
- barcode_dist_2 = barcode_dist_1;
+ if (barcode_type != index_null && barcode_type != inline_null && barcode_dist_2 < 0)
+ barcode_dist_2 = barcode_dist_1;
}
return 0;
@@ -1216,69 +1235,69 @@ void version() {
void help() {
std::cerr << "process_radtags " << VERSION << "\n"
<< "process_radtags [-f in_file | -p in_dir [-P] [-I] | -1 pair_1 -2 pair_2] -b barcode_file -o out_dir -e enz [-c] [-q] [-r] [-t len] [-D] [-w size] [-s lim] [-h]\n"
- << " f: path to the input file if processing single-end sequences.\n"
- << " i: input file type, either 'bustard' for the Illumina BUSTARD format, 'bam', 'fastq' (default), or 'gzfastq' for gzipped FASTQ.\n"
- << " y: output type, either 'fastq', 'gzfastq', 'fasta', or 'gzfasta' (default is to match the input file type).\n"
- << " p: path to a directory of files.\n"
- << " P: files contained within directory specified by '-p' are paired.\n"
- << " I: specify that the paired-end reads are interleaved in single files.\n"
- << " 1: first input file in a set of paired-end sequences.\n"
- << " 2: second input file in a set of paired-end sequences.\n"
- << " o: path to output the processed files.\n"
- << " b: path to a file containing barcodes for this run.\n"
- << " c: clean data, remove any read with an uncalled base.\n"
- << " q: discard reads with low quality scores.\n"
- << " r: rescue barcodes and RAD-Tags.\n"
- << " t: truncate final read length to this value.\n"
- << " E: specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger, default) or 'phred64' (Illumina 1.3 - 1.5).\n"
- << " D: capture discarded reads to a file.\n"
- << " w: set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).\n"
- << " s: set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).\n"
- << " h: display this help messsage." << "\n\n"
- << " Barcode options:\n"
- << " --inline_null: barcode is inline with sequence, occurs only on single-end read (default).\n"
- << " --index_null: barcode is provded in FASTQ header (Illumina i5 or i7 read).\n"
- << " --null_index: barcode is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
- << " --inline_inline: barcode is inline with sequence, occurs on single and paired-end read.\n"
- << " --index_index: barcode is provded in FASTQ header (Illumina i5 and i7 reads).\n"
- << " --inline_index: barcode is inline with sequence on single-end read and occurs in FASTQ header (from either i5 or i7 read).\n"
- << " --index_inline: barcode occurs in FASTQ header (Illumina i5 or i7 read) and is inline with single-end sequence (for single-end data) on paired-end read (for paired-end data).\n\n"
- << " Restriction enzyme options:\n"
- << " -e <enz>, --renz_1 <enz>: provide the restriction enzyme used (cut site occurs on single-end read)\n"
- << " --renz_2 <enz>: if a double digest was used, provide the second restriction enzyme used (cut site occurs on the paired-end read).\n"
- << " Currently supported enzymes include:\n"
- << " ";
+ << " f: path to the input file if processing single-end sequences.\n"
+ << " i: input file type, either 'bustard' for the Illumina BUSTARD format, 'bam', 'fastq' (default), or 'gzfastq' for gzipped FASTQ.\n"
+ << " y: output type, either 'fastq', 'gzfastq', 'fasta', or 'gzfasta' (default is to match the input file type).\n"
+ << " p: path to a directory of files.\n"
+ << " P: files contained within directory specified by '-p' are paired.\n"
+ << " I: specify that the paired-end reads are interleaved in single files.\n"
+ << " 1: first input file in a set of paired-end sequences.\n"
+ << " 2: second input file in a set of paired-end sequences.\n"
+ << " o: path to output the processed files.\n"
+ << " b: path to a file containing barcodes for this run.\n"
+ << " c: clean data, remove any read with an uncalled base.\n"
+ << " q: discard reads with low quality scores.\n"
+ << " r: rescue barcodes and RAD-Tags.\n"
+ << " t: truncate final read length to this value.\n"
+ << " E: specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger, default) or 'phred64' (Illumina 1.3 - 1.5).\n"
+ << " D: capture discarded reads to a file.\n"
+ << " w: set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).\n"
+ << " s: set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).\n"
+ << " h: display this help messsage." << "\n\n"
+ << " Barcode options:\n"
+ << " --inline_null: barcode is inline with sequence, occurs only on single-end read (default).\n"
+ << " --index_null: barcode is provded in FASTQ header (Illumina i5 or i7 read).\n"
+ << " --null_index: barcode is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
+ << " --inline_inline: barcode is inline with sequence, occurs on single and paired-end read.\n"
+ << " --index_index: barcode is provded in FASTQ header (Illumina i5 and i7 reads).\n"
+ << " --inline_index: barcode is inline with sequence on single-end read and occurs in FASTQ header (from either i5 or i7 read).\n"
+ << " --index_inline: barcode occurs in FASTQ header (Illumina i5 or i7 read) and is inline with single-end sequence (for single-end data) on paired-end read (for paired-end data).\n\n"
+ << " Restriction enzyme options:\n"
+ << " -e <enz>, --renz_1 <enz>: provide the restriction enzyme used (cut site occurs on single-end read)\n"
+ << " --renz_2 <enz>: if a double digest was used, provide the second restriction enzyme used (cut site occurs on the paired-end read).\n"
+ << " Currently supported enzymes include:\n"
+ << " ";
map<string, int>::iterator it;
uint cnt = renz_cnt.size();
it = renz_cnt.begin();
for (uint i = 1; i <= cnt; i++) {
- std::cerr << "'" << it->first << "'";
- if (i < cnt - 1)
- std::cerr << ", ";
- else if (i == cnt - 1)
- std::cerr << ", or ";
+ std::cerr << "'" << it->first << "'";
+ if (i < cnt - 1)
+ std::cerr << ", ";
+ else if (i == cnt - 1)
+ std::cerr << ", or ";
- if (i % 8 == 0)
- std::cerr << "\n ";
+ if (i % 8 == 0)
+ std::cerr << "\n ";
- it++;
+ it++;
}
- std::cerr << "\n"
- << " Adapter options:\n"
- << " --adapter_1 <sequence>: provide adaptor sequence that may occur on the single-end read for filtering.\n"
- << " --adapter_2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
- << " --adapter_mm <mismatches>: number of mismatches allowed in the adapter sequence.\n\n"
- << " Output options:\n"
- << " --retain_header: retain unmodified FASTQ headers in the output.\n"
- << " --merge: if no barcodes are specified, merge all input files into a single output file.\n\n"
- << " Advanced options:\n"
- << " --filter_illumina: discard reads that have been marked by Illumina's chastity/purity filter as failing.\n"
- << " --disable_rad_check: disable checking if the RAD site is intact.\n"
- << " --len_limit <limit>: specify a minimum sequence length (useful if your data has already been trimmed).\n"
- << " --barcode_dist_1: the number of allowed mismatches when rescuing single-end barcodes (default 1).\n"
- << " --barcode_dist_2: the number of allowed mismatches when rescuing paired-end barcodes (defaults to --barcode_dist_1).\n";
+ std::cerr << "\n"
+ << " Adapter options:\n"
+ << " --adapter_1 <sequence>: provide adaptor sequence that may occur on the single-end read for filtering.\n"
+ << " --adapter_2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
+ << " --adapter_mm <mismatches>: number of mismatches allowed in the adapter sequence.\n\n"
+ << " Output options:\n"
+ << " --retain_header: retain unmodified FASTQ headers in the output.\n"
+ << " --merge: if no barcodes are specified, merge all input files into a single output file.\n\n"
+ << " Advanced options:\n"
+ << " --filter_illumina: discard reads that have been marked by Illumina's chastity/purity filter as failing.\n"
+ << " --disable_rad_check: disable checking if the RAD site is intact.\n"
+ << " --len_limit <limit>: specify a minimum sequence length (useful if your data has already been trimmed).\n"
+ << " --barcode_dist_1: the number of allowed mismatches when rescuing single-end barcodes (default 1).\n"
+ << " --barcode_dist_2: the number of allowed mismatches when rescuing paired-end barcodes (defaults to --barcode_dist_1).\n";
exit(0);
}
diff --git a/src/process_radtags.h b/src/process_radtags.h
index acfb49e..67d57f9 100644
--- a/src/process_radtags.h
+++ b/src/process_radtags.h
@@ -47,7 +47,7 @@ using std::set;
#include <utility>
using std::pair;
-#include "constants.h"
+#include "constants.h"
#include "renz.h"
#include "clean.h"
#include "file_io.h"
@@ -63,21 +63,21 @@ void help( void );
void version( void );
int parse_command_line(int, char **);
template<typename fhType>
-int process_reads(string,
- set<string> &, set<string> &,
- map<BarcodePair, fhType *> &,
- map<string, long> &, map<BarcodePair, map<string, long> > &);
+int process_reads(string,
+ set<string> &, set<string> &,
+ map<BarcodePair, fhType *> &,
+ map<string, long> &, map<BarcodePair, map<string, long> > &);
template<typename fhType>
-int process_paired_reads(string, string,
- set<string> &, set<string> &,
- map<BarcodePair, fhType *> &,
- map<BarcodePair, fhType *> &,
- map<BarcodePair, fhType *> &,
- map<BarcodePair, fhType *> &,
- map<string, long> &, map<BarcodePair, map<string, long> > &);
-int process_singlet(Read *,
- string, bool,
- map<string, long> &, map<string, long> &);
+int process_paired_reads(string, string,
+ set<string> &, set<string> &,
+ map<BarcodePair, fhType *> &,
+ map<BarcodePair, fhType *> &,
+ map<BarcodePair, fhType *> &,
+ map<BarcodePair, fhType *> &,
+ map<string, long> &, map<BarcodePair, map<string, long> > &);
+int process_singlet(Read *,
+ string, bool,
+ map<string, long> &, map<string, long> &);
int correct_radtag(Read *, string, map<string, long> &);
int check_quality_scores(Read *, bool);
int dist(const char *, char *);
diff --git a/src/process_shortreads.cc b/src/process_shortreads.cc
index 31b2b54..cf84b25 100644
--- a/src/process_shortreads.cc
+++ b/src/process_shortreads.cc
@@ -93,29 +93,29 @@ int main (int argc, char* argv[]) {
// If input files are gzipped, output gziped files, unless the user chooses an output type.
//
if (out_file_type == FileT::unknown) {
- if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
- out_file_type = FileT::gzfastq;
- else
- out_file_type = FileT::fastq;
+ if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
+ out_file_type = FileT::gzfastq;
+ else
+ out_file_type = FileT::fastq;
}
cerr << "Using Phred+" << qual_offset << " encoding for quality scores.\n"
- << "Reads trimmed shorter than " << len_limit << " nucleotides will be discarded.\n";
+ << "Reads trimmed shorter than " << len_limit << " nucleotides will be discarded.\n";
if (truncate_seq > 0)
- cerr << "Reads will be truncated to " << truncate_seq << "bp\n";
+ cerr << "Reads will be truncated to " << truncate_seq << "bp\n";
if (filter_illumina)
- cerr << "Discarding reads marked as 'failed' by Illumina's chastity/purity filters.\n";
+ cerr << "Discarding reads marked as 'failed' by Illumina's chastity/purity filters.\n";
if (filter_adapter) {
- cerr << "Filtering reads for adapter sequence:\n";
- if (adapter_1 != NULL) {
- cerr << " " << adapter_1 << "\n";
- init_adapter_seq(kmer_size, adapter_1, adp_1_len, adp_1_kmers);
- }
- if (adapter_2 != NULL) {
- cerr << " " << adapter_2 << "\n";
- init_adapter_seq(kmer_size, adapter_2, adp_2_len, adp_2_kmers);
- }
- cerr << " " << distance << " mismatches allowed to adapter sequence.\n";
+ cerr << "Filtering reads for adapter sequence:\n";
+ if (adapter_1 != NULL) {
+ cerr << " " << adapter_1 << "\n";
+ init_adapter_seq(kmer_size, adapter_1, adp_1_len, adp_1_kmers);
+ }
+ if (adapter_2 != NULL) {
+ cerr << " " << adapter_2 << "\n";
+ init_adapter_seq(kmer_size, adapter_2, adp_2_len, adp_2_kmers);
+ }
+ cerr << " " << distance << " mismatches allowed to adapter sequence.\n";
}
vector<pair<string, string> > files;
@@ -129,91 +129,91 @@ int main (int argc, char* argv[]) {
build_file_list(files);
load_barcodes(barcode_file, barcodes, se_bc, pe_bc, min_bc_size_1, max_bc_size_1, min_bc_size_2, max_bc_size_2);
if (recover && barcode_type != null_null) {
- if (barcode_type == index_null || barcode_type == inline_null)
- cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " mismatches.\n";
- else
- cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " / " << barcode_dist_2 << " mismatches.\n";
+ if (barcode_type == index_null || barcode_type == inline_null)
+ cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " mismatches.\n";
+ else
+ cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " / " << barcode_dist_2 << " mismatches.\n";
}
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
- open_files(files, barcodes, pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs, counters);
+ open_files(files, barcodes, pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs, counters);
else
- open_files(files, barcodes, pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs, counters);
+ open_files(files, barcodes, pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs, counters);
int result = 1;
for (uint i = 0; i < files.size(); i++) {
- cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
-
- counters[files[i].first]["total"] = 0;
- counters[files[i].first]["ill_filtered"] = 0;
- counters[files[i].first]["low_quality"] = 0;
- counters[files[i].first]["trimmed"] = 0;
- counters[files[i].first]["adapter"] = 0;
- counters[files[i].first]["ambiguous"] = 0;
- counters[files[i].first]["retained"] = 0;
- counters[files[i].first]["orphaned"] = 0;
- counters[files[i].first]["recovered"] = 0;
-
- if (paired) {
- if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
- result = process_paired_reads(files[i].first, files[i].second,
- se_bc, pe_bc,
- pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs,
- counters[files[i].first], barcode_log);
- else
- result = process_paired_reads(files[i].first, files[i].second,
- se_bc, pe_bc,
- pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs,
- counters[files[i].first], barcode_log);
- } else {
- if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
- result = process_reads(files[i].first,
- se_bc, pe_bc,
- pair_1_gzfhs,
- counters[files[i].first], barcode_log);
- else
- result = process_reads(files[i].first,
- se_bc, pe_bc,
- pair_1_fhs,
- counters[files[i].first], barcode_log);
- }
- cerr << " "
- << counters[files[i].first]["total"] << " total reads; ";
- if (filter_illumina)
- cerr << "-" << counters[files[i].first]["ill_filtered"] << " failed Illumina reads; ";
- cerr
- << "-" << counters[files[i].first]["ambiguous"] << " ambiguous barcodes; "
- << "+" << counters[files[i].first]["recovered"] << " recovered; "
- << "-" << counters[files[i].first]["low_quality"] << " low quality reads; "
- << counters[files[i].first]["retained"] << " retained reads.\n"
- << " ";
- if (filter_adapter)
- cerr << counters[files[i].first]["adapter"] << " reads with adapter sequence; ";
- cerr << counters[files[i].first]["trimmed"] << " trimmed reads; "
- << counters[files[i].first]["orphaned"] << " orphaned paired-ends.\n";
-
- if (!result) {
- cerr << "Error processing reads.\n";
- break;
- }
+ cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
+
+ counters[files[i].first]["total"] = 0;
+ counters[files[i].first]["ill_filtered"] = 0;
+ counters[files[i].first]["low_quality"] = 0;
+ counters[files[i].first]["trimmed"] = 0;
+ counters[files[i].first]["adapter"] = 0;
+ counters[files[i].first]["ambiguous"] = 0;
+ counters[files[i].first]["retained"] = 0;
+ counters[files[i].first]["orphaned"] = 0;
+ counters[files[i].first]["recovered"] = 0;
+
+ if (paired) {
+ if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
+ result = process_paired_reads(files[i].first, files[i].second,
+ se_bc, pe_bc,
+ pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs,
+ counters[files[i].first], barcode_log);
+ else
+ result = process_paired_reads(files[i].first, files[i].second,
+ se_bc, pe_bc,
+ pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs,
+ counters[files[i].first], barcode_log);
+ } else {
+ if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
+ result = process_reads(files[i].first,
+ se_bc, pe_bc,
+ pair_1_gzfhs,
+ counters[files[i].first], barcode_log);
+ else
+ result = process_reads(files[i].first,
+ se_bc, pe_bc,
+ pair_1_fhs,
+ counters[files[i].first], barcode_log);
+ }
+ cerr << " "
+ << counters[files[i].first]["total"] << " total reads; ";
+ if (filter_illumina)
+ cerr << "-" << counters[files[i].first]["ill_filtered"] << " failed Illumina reads; ";
+ cerr
+ << "-" << counters[files[i].first]["ambiguous"] << " ambiguous barcodes; "
+ << "+" << counters[files[i].first]["recovered"] << " recovered; "
+ << "-" << counters[files[i].first]["low_quality"] << " low quality reads; "
+ << counters[files[i].first]["retained"] << " retained reads.\n"
+ << " ";
+ if (filter_adapter)
+ cerr << counters[files[i].first]["adapter"] << " reads with adapter sequence; ";
+ cerr << counters[files[i].first]["trimmed"] << " trimmed reads; "
+ << counters[files[i].first]["orphaned"] << " orphaned paired-ends.\n";
+
+ if (!result) {
+ cerr << "Error processing reads.\n";
+ break;
+ }
}
cerr << "Closing files, flushing buffers...\n";
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
- close_file_handles(pair_1_gzfhs);
- if (paired) {
- close_file_handles(rem_1_gzfhs);
- close_file_handles(rem_2_gzfhs);
- close_file_handles(pair_2_gzfhs);
- }
+ close_file_handles(pair_1_gzfhs);
+ if (paired) {
+ close_file_handles(rem_1_gzfhs);
+ close_file_handles(rem_2_gzfhs);
+ close_file_handles(pair_2_gzfhs);
+ }
} else {
- close_file_handles(pair_1_fhs);
- if (paired) {
- close_file_handles(rem_1_fhs);
- close_file_handles(rem_2_fhs);
- close_file_handles(pair_2_fhs);
- }
+ close_file_handles(pair_1_fhs);
+ if (paired) {
+ close_file_handles(rem_1_fhs);
+ close_file_handles(rem_2_fhs);
+ close_file_handles(pair_2_fhs);
+ }
}
print_results(argc, argv, barcodes, counters, barcode_log);
@@ -222,16 +222,16 @@ int main (int argc, char* argv[]) {
}
template<typename fhType>
-int
+int
process_paired_reads(string prefix_1,
- string prefix_2,
- set<string> &se_bc, set<string> &pe_bc,
- map<BarcodePair, fhType *> &pair_1_fhs,
- map<BarcodePair, fhType *> &pair_2_fhs,
- map<BarcodePair, fhType *> &rem_1_fhs,
- map<BarcodePair, fhType *> &rem_2_fhs,
- map<string, long> &counter,
- map<BarcodePair, map<string, long> > &barcode_log) {
+ string prefix_2,
+ set<string> &se_bc, set<string> &pe_bc,
+ map<BarcodePair, fhType *> &pair_1_fhs,
+ map<BarcodePair, fhType *> &pair_2_fhs,
+ map<BarcodePair, fhType *> &rem_1_fhs,
+ map<BarcodePair, fhType *> &rem_2_fhs,
+ map<string, long> &counter,
+ map<BarcodePair, map<string, long> > &barcode_log) {
Input *fh_1, *fh_2;
Read *r_1, *r_2;
ofstream *discard_fh_1, *discard_fh_2;
@@ -242,19 +242,19 @@ process_paired_reads(string prefix_1,
string path_2 = in_path_2 + prefix_2;
if (interleaved)
- cerr << " Reading data from:\n " << path_1 << "\n";
+ cerr << " Reading data from:\n " << path_1 << "\n";
else
- cerr << " Reading data from:\n " << path_1 << " and\n " << path_2 << "\n";
+ cerr << " Reading data from:\n " << path_1 << " and\n " << path_2 << "\n";
if (in_file_type == FileT::fastq) {
fh_1 = new Fastq(path_1.c_str());
- fh_2 = interleaved ? fh_1 : new Fastq(path_2.c_str());
+ fh_2 = interleaved ? fh_1 : new Fastq(path_2.c_str());
} else if (in_file_type == FileT::gzfastq) {
fh_1 = new GzFastq(path_1.c_str());
- fh_2 = interleaved ? fh_1 : new GzFastq(path_2.c_str());
+ fh_2 = interleaved ? fh_1 : new GzFastq(path_2.c_str());
} else if (in_file_type == FileT::bam) {
fh_1 = new BamUnAln(path_1.c_str());
- fh_2 = fh_1;
+ fh_2 = fh_1;
} else if (in_file_type == FileT::bustard) {
fh_1 = new Bustard(path_1.c_str());
fh_2 = interleaved ? fh_1 : new Bustard(path_2.c_str());
@@ -264,33 +264,33 @@ process_paired_reads(string prefix_1,
// Open a file for recording discarded reads
//
if (discards) {
- path_1 = out_path + prefix_1 + ".discards";
- discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
+ path_1 = out_path + prefix_1 + ".discards";
+ discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
- if (discard_fh_1->fail()) {
- cerr << "Error opening discard output file '" << path_1 << "'\n";
- exit(1);
- }
+ if (discard_fh_1->fail()) {
+ cerr << "Error opening discard output file '" << path_1 << "'\n";
+ exit(1);
+ }
- path_2 = out_path + prefix_2 + ".discards";
- discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
+ path_2 = out_path + prefix_2 + ".discards";
+ discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
- if (discard_fh_1->fail()) {
- cerr << "Error opening discard output file '" << path_2 << "'\n";
- exit(1);
- }
+ if (discard_fh_1->fail()) {
+ cerr << "Error opening discard output file '" << path_2 << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s_1 = fh_1->next_seq();
Seq *s_2 = fh_2->next_seq();
if (s_1 == NULL || s_2 == NULL) {
- cerr << "Attempting to read first pair of input records, unable to allocate "
- << "Seq object (Was the correct input type specified?).\n";
- exit(1);
+ cerr << "Attempting to read first pair of input records, unable to allocate "
+ << "Seq object (Was the correct input type specified?).\n";
+ exit(1);
}
r_1 = new Read(strlen(s_1->seq), 1, min_bc_size_1, win_size);
@@ -301,131 +301,131 @@ process_paired_reads(string prefix_1,
// If no barcodes were specified, set the barcode object to be the input file names.
//
if (max_bc_size_1 == 0)
- bc.set(prefix_1, prefix_2);
+ bc.set(prefix_1, prefix_2);
long i = 1;
do {
if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
- parse_input_record(s_1, r_1);
- parse_input_record(s_2, r_2);
- counter["total"] += 2;
-
- if (barcode_type != null_null &&
- barcode_type != inline_null &&
- barcode_type != index_null)
- bc.set(r_1->se_bc, r_2->pe_bc);
- else if (barcode_type != null_null)
- bc.set(r_1->se_bc);
-
- process_barcode(r_1, r_2, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
-
- //
- // Adjust the size of the read to accommodate truncating the sequence and variable
- // barcode lengths. With standard Illumina data we want to output constant length
- // reads even as the barcode size may change. Other technologies, like IonTorrent
- // need to be truncated uniformly.
- //
- if (truncate_seq > 0) {
- if (truncate_seq + r_1->inline_bc_len <= r_1->len)
- r_1->set_len(truncate_seq + r_1->inline_bc_len);
- if (truncate_seq + r_2->inline_bc_len <= r_2->len)
- r_2->set_len(truncate_seq + r_2->inline_bc_len);
- } else {
- if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
- r_1->set_len(r_1->len - (max_bc_size_1 - r_1->inline_bc_len));
- if (barcode_type == inline_index || barcode_type == index_index)
- r_2->set_len(r_2->len - (max_bc_size_2 - r_2->inline_bc_len));
- }
-
- if (r_1->retain)
- process_singlet(r_1, false, barcode_log[bc], counter);
- if (r_2->retain)
- process_singlet(r_2, true, barcode_log[bc], counter);
-
- if (matepair) {
- rev_complement(r_1->seq, r_1->inline_bc_len, overhang);
- reverse_qual(r_1->phred, r_1->inline_bc_len, overhang);
- }
-
- int result_1 = 1;
- int result_2 = 1;
-
- if (r_1->retain && r_2->retain) {
- if (retain_header) {
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], s_1, r_1) :
- write_fasta(pair_1_fhs[bc], s_1, r_1);
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_2_fhs[bc], s_2, r_2) :
- write_fasta(pair_2_fhs[bc], s_2, r_2);
- } else {
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], r_1, overhang) :
- write_fasta(pair_1_fhs[bc], r_1, overhang);
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_2_fhs[bc], r_2, overhang) :
- write_fasta(pair_2_fhs[bc], r_2, overhang);
- }
-
- } else if (r_1->retain && !r_2->retain) {
- //
- // Write to a remainder file.
- //
- if (retain_header)
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_1_fhs[bc], s_1, r_1) :
- write_fasta(rem_1_fhs[bc], s_1, r_1);
- else
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_1_fhs[bc], r_1, overhang) :
- write_fasta(rem_1_fhs[bc], r_1, overhang);
-
- } else if (!r_1->retain && r_2->retain) {
- // Write to a remainder file.
- if (retain_header)
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_2_fhs[bc], s_2, r_2) :
- write_fasta(rem_2_fhs[bc], s_2, r_2);
- else
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(rem_2_fhs[bc], r_2, overhang) :
- write_fasta(rem_2_fhs[bc], r_2, overhang);
- }
-
- if (!result_1 || !result_2) {
- cerr << "Error writing to output file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- if (discards && !r_1->retain)
- result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_1, s_1) :
- write_fasta(discard_fh_1, s_1);
- if (discards && !r_2->retain)
- result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_2, s_2) :
- write_fasta(discard_fh_2, s_2);
-
- delete s_1;
- delete s_2;
-
- if (!result_1 || !result_2) {
- cerr << "Error writing to discard file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- i++;
- } while ((s_1 = fh_1->next_seq()) != NULL &&
- (s_2 = fh_2->next_seq()) != NULL);
+ parse_input_record(s_1, r_1);
+ parse_input_record(s_2, r_2);
+ counter["total"] += 2;
+
+ if (barcode_type != null_null &&
+ barcode_type != inline_null &&
+ barcode_type != index_null)
+ bc.set(r_1->se_bc, r_2->pe_bc);
+ else if (barcode_type != null_null)
+ bc.set(r_1->se_bc);
+
+ process_barcode(r_1, r_2, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
+
+ //
+ // Adjust the size of the read to accommodate truncating the sequence and variable
+ // barcode lengths. With standard Illumina data we want to output constant length
+ // reads even as the barcode size may change. Other technologies, like IonTorrent
+ // need to be truncated uniformly.
+ //
+ if (truncate_seq > 0) {
+ if (truncate_seq + r_1->inline_bc_len <= r_1->len)
+ r_1->set_len(truncate_seq + r_1->inline_bc_len);
+ if (truncate_seq + r_2->inline_bc_len <= r_2->len)
+ r_2->set_len(truncate_seq + r_2->inline_bc_len);
+ } else {
+ if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
+ r_1->set_len(r_1->len - (max_bc_size_1 - r_1->inline_bc_len));
+ if (barcode_type == inline_index || barcode_type == index_index)
+ r_2->set_len(r_2->len - (max_bc_size_2 - r_2->inline_bc_len));
+ }
+
+ if (r_1->retain)
+ process_singlet(r_1, false, barcode_log[bc], counter);
+ if (r_2->retain)
+ process_singlet(r_2, true, barcode_log[bc], counter);
+
+ if (matepair) {
+ rev_complement(r_1->seq, r_1->inline_bc_len, overhang);
+ reverse_qual(r_1->phred, r_1->inline_bc_len, overhang);
+ }
+
+ int result_1 = 1;
+ int result_2 = 1;
+
+ if (r_1->retain && r_2->retain) {
+ if (retain_header) {
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], s_1, r_1) :
+ write_fasta(pair_1_fhs[bc], s_1, r_1);
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_2_fhs[bc], s_2, r_2) :
+ write_fasta(pair_2_fhs[bc], s_2, r_2);
+ } else {
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], r_1, overhang) :
+ write_fasta(pair_1_fhs[bc], r_1, overhang);
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_2_fhs[bc], r_2, overhang) :
+ write_fasta(pair_2_fhs[bc], r_2, overhang);
+ }
+
+ } else if (r_1->retain && !r_2->retain) {
+ //
+ // Write to a remainder file.
+ //
+ if (retain_header)
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_1_fhs[bc], s_1, r_1) :
+ write_fasta(rem_1_fhs[bc], s_1, r_1);
+ else
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_1_fhs[bc], r_1, overhang) :
+ write_fasta(rem_1_fhs[bc], r_1, overhang);
+
+ } else if (!r_1->retain && r_2->retain) {
+ // Write to a remainder file.
+ if (retain_header)
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_2_fhs[bc], s_2, r_2) :
+ write_fasta(rem_2_fhs[bc], s_2, r_2);
+ else
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(rem_2_fhs[bc], r_2, overhang) :
+ write_fasta(rem_2_fhs[bc], r_2, overhang);
+ }
+
+ if (!result_1 || !result_2) {
+ cerr << "Error writing to output file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ if (discards && !r_1->retain)
+ result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(discard_fh_1, s_1) :
+ write_fasta(discard_fh_1, s_1);
+ if (discards && !r_2->retain)
+ result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(discard_fh_2, s_2) :
+ write_fasta(discard_fh_2, s_2);
+
+ delete s_1;
+ delete s_2;
+
+ if (!result_1 || !result_2) {
+ cerr << "Error writing to discard file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ i++;
+ } while ((s_1 = fh_1->next_seq()) != NULL &&
+ (s_2 = fh_2->next_seq()) != NULL);
if (discards) {
- delete discard_fh_1;
- delete discard_fh_2;
+ delete discard_fh_1;
+ delete discard_fh_2;
}
delete fh_1;
@@ -435,12 +435,12 @@ process_paired_reads(string prefix_1,
}
template<typename fhType>
-int
-process_reads(string prefix,
- set<string> &se_bc, set<string> &pe_bc,
- map<BarcodePair, fhType *> &pair_1_fhs,
- map<string, long> &counter,
- map<BarcodePair, map<string, long> > &barcode_log) {
+int
+process_reads(string prefix,
+ set<string> &se_bc, set<string> &pe_bc,
+ map<BarcodePair, fhType *> &pair_1_fhs,
+ map<string, long> &counter,
+ map<BarcodePair, map<string, long> > &barcode_log) {
Input *fh;
Read *r;
ofstream *discard_fh;
@@ -462,24 +462,24 @@ process_reads(string prefix,
// Open a file for recording discarded reads
//
if (discards) {
- path = path + ".discards";
- discard_fh = new ofstream(path.c_str(), ifstream::out);
+ path = path + ".discards";
+ discard_fh = new ofstream(path.c_str(), ifstream::out);
- if (discard_fh->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
- }
+ if (discard_fh->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
}
//
- // Read in the first record, initializing the Seq object s. Then
+ // Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s = fh->next_seq();
if (s == NULL) {
- cerr << "Attempting to read first input record, unable to allocate "
- << "Seq object (Was the correct input type specified?).\n";
- exit(1);
+ cerr << "Attempting to read first input record, unable to allocate "
+ << "Seq object (Was the correct input type specified?).\n";
+ exit(1);
}
r = new Read(strlen(s->seq), 1, min_bc_size_1, win_size);
@@ -490,76 +490,76 @@ process_reads(string prefix,
// that reads are written to an output file of the same name as the input file.
//
if (max_bc_size_1 == 0)
- bc.set(prefix);
+ bc.set(prefix);
//cerr << "Length: " << r->len << "; Window length: " << r->win_len << "; Stop position: " << r->stop_pos << "\n";
long i = 1;
do {
- if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
- counter["total"]++;
-
- parse_input_record(s, r);
-
- if (barcode_type == inline_null ||
- barcode_type == index_null)
- bc.set(r->se_bc);
- else if (barcode_type == index_inline ||
- barcode_type == inline_index)
- bc.set(r->se_bc, r->pe_bc);
-
- process_barcode(r, NULL, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
-
- //
- // Adjust the size of the read to accommodate truncating the sequence and variable
- // barcode lengths. With standard Illumina data we want to output constant length
- // reads even as the barcode size may change. Other technologies, like IonTorrent
- // need to be truncated uniformly.
- //
- if (truncate_seq > 0) {
- if (truncate_seq + r->inline_bc_len <= r->len)
- r->set_len(truncate_seq + r->inline_bc_len);
- } else {
- if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
- r->set_len(r->len - (max_bc_size_1 - r->inline_bc_len));
- }
-
- if (r->retain)
- process_singlet(r, false, barcode_log[bc], counter);
-
- int result = 1;
-
- if (r->retain) {
- if (retain_header)
- result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], s, r) :
- write_fasta(pair_1_fhs[bc], s, r);
- else
- result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(pair_1_fhs[bc], r, overhang) :
- write_fasta(pair_1_fhs[bc], r, overhang);
- }
-
- if (!result) {
- cerr << "Error writing to output file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- if (discards && !r->retain)
- result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh, s) :
- write_fasta(discard_fh, s);
-
- if (!result) {
- cerr << "Error writing to discard file for '" << bc.str() << "'\n";
- return_val = -1;
- break;
- }
-
- delete s;
- i++;
+ if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
+ counter["total"]++;
+
+ parse_input_record(s, r);
+
+ if (barcode_type == inline_null ||
+ barcode_type == index_null)
+ bc.set(r->se_bc);
+ else if (barcode_type == index_inline ||
+ barcode_type == inline_index)
+ bc.set(r->se_bc, r->pe_bc);
+
+ process_barcode(r, NULL, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
+
+ //
+ // Adjust the size of the read to accommodate truncating the sequence and variable
+ // barcode lengths. With standard Illumina data we want to output constant length
+ // reads even as the barcode size may change. Other technologies, like IonTorrent
+ // need to be truncated uniformly.
+ //
+ if (truncate_seq > 0) {
+ if (truncate_seq + r->inline_bc_len <= r->len)
+ r->set_len(truncate_seq + r->inline_bc_len);
+ } else {
+ if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
+ r->set_len(r->len - (max_bc_size_1 - r->inline_bc_len));
+ }
+
+ if (r->retain)
+ process_singlet(r, false, barcode_log[bc], counter);
+
+ int result = 1;
+
+ if (r->retain) {
+ if (retain_header)
+ result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], s, r) :
+ write_fasta(pair_1_fhs[bc], s, r);
+ else
+ result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(pair_1_fhs[bc], r, overhang) :
+ write_fasta(pair_1_fhs[bc], r, overhang);
+ }
+
+ if (!result) {
+ cerr << "Error writing to output file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ if (discards && !r->retain)
+ result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
+ write_fastq(discard_fh, s) :
+ write_fasta(discard_fh, s);
+
+ if (!result) {
+ cerr << "Error writing to discard file for '" << bc.str() << "'\n";
+ return_val = -1;
+ break;
+ }
+
+ delete s;
+ i++;
} while ((s = fh->next_seq()) != NULL);
if (discards) delete discard_fh;
@@ -572,27 +572,27 @@ process_reads(string prefix,
return return_val;
}
-inline int
+inline int
process_singlet(Read *href,
- bool paired_end,
- map<string, long> &bc_log, map<string, long> &counter)
+ bool paired_end,
+ map<string, long> &bc_log, map<string, long> &counter)
{
if (filter_illumina && href->filter) {
- counter["ill_filtered"]++;
- href->retain = 0;
- return 0;
+ counter["ill_filtered"]++;
+ href->retain = 0;
+ return 0;
}
//
// Drop this sequence if it has any uncalled nucleotides
//
if (clean) {
- for (char *p = href->seq + href->inline_bc_len; *p != '\0'; p++)
- if (*p == '.' || *p == 'N') {
- counter["low_quality"]++;
- href->retain = 0;
- return 0;
- }
+ for (char *p = href->seq + href->inline_bc_len; *p != '\0'; p++)
+ if (*p == '.' || *p == 'N') {
+ counter["low_quality"]++;
+ href->retain = 0;
+ return 0;
+ }
}
bool adapter_trim = false;
@@ -602,52 +602,52 @@ process_singlet(Read *href,
// Drop or trim this sequence if it has low quality scores
//
if (quality) {
- int res = check_quality_scores(href, qual_offset, score_limit, len_limit, href->inline_bc_len);
-
- if (trim_reads) {
- if (res == 0) {
- counter["low_quality"]++;
- href->retain = 0;
- return 0;
- } else if (res < 0) {
- quality_trim = true;
- }
- } else {
- if (res <= 0) {
- counter["low_quality"]++;
- href->retain = 0;
- return 0;
- }
- }
+ int res = check_quality_scores(href, qual_offset, score_limit, len_limit, href->inline_bc_len);
+
+ if (trim_reads) {
+ if (res == 0) {
+ counter["low_quality"]++;
+ href->retain = 0;
+ return 0;
+ } else if (res < 0) {
+ quality_trim = true;
+ }
+ } else {
+ if (res <= 0) {
+ counter["low_quality"]++;
+ href->retain = 0;
+ return 0;
+ }
+ }
}
//
// Drop or trim this sequence if it contains adapter sequence.
//
if (filter_adapter) {
- int res = 1;
- if (paired_end == true && adp_2_len > 0)
- res = filter_adapter_seq(href, adapter_2, adp_2_len, adp_2_kmers,
- kmer_size, distance, len_limit);
- if (paired_end == false && adp_1_len > 0)
- res = filter_adapter_seq(href, adapter_1, adp_1_len, adp_1_kmers,
- kmer_size, distance, len_limit);
- if (res == 0) {
- counter["adapter"]++;
- href->retain = 0;
- return 0;
-
- } else if (res < 0) {
- counter["adapter"]++;
- adapter_trim = true;
- }
+ int res = 1;
+ if (paired_end == true && adp_2_len > 0)
+ res = filter_adapter_seq(href, adapter_2, adp_2_len, adp_2_kmers,
+ kmer_size, distance, len_limit);
+ if (paired_end == false && adp_1_len > 0)
+ res = filter_adapter_seq(href, adapter_1, adp_1_len, adp_1_kmers,
+ kmer_size, distance, len_limit);
+ if (res == 0) {
+ counter["adapter"]++;
+ href->retain = 0;
+ return 0;
+
+ } else if (res < 0) {
+ counter["adapter"]++;
+ adapter_trim = true;
+ }
}
if (adapter_trim || quality_trim)
- counter["trimmed"]++;
+ counter["trimmed"]++;
- if (barcode_type != null_null)
- bc_log["retained"]++;
+ if (barcode_type != null_null)
+ bc_log["retained"]++;
counter["retained"]++;
return 0;
@@ -655,20 +655,20 @@ process_singlet(Read *href,
int dist(const char *res_enz, char *seq) {
const char *p; char *q;
-
+
int dist = 0;
for (p = res_enz, q = seq; *p != '\0'; p++, q++)
- if (*p != *q) dist++;
+ if (*p != *q) dist++;
return dist;
}
-int
-print_results(int argc, char **argv,
- vector<BarcodePair> &barcodes,
- map<string, map<string, long> > &counters,
- map<BarcodePair, map<string, long> > &barcode_log)
+int
+print_results(int argc, char **argv,
+ vector<BarcodePair> &barcodes,
+ map<string, map<string, long> > &counters,
+ map<BarcodePair, map<string, long> > &barcode_log)
{
map<string, map<string, long> >::iterator it;
@@ -676,8 +676,8 @@ print_results(int argc, char **argv,
ofstream log(log_path.c_str());
if (log.fail()) {
- cerr << "Unable to open log file '" << log_path << "'\n";
- return 0;
+ cerr << "Unable to open log file '" << log_path << "'\n";
+ return 0;
}
cerr << "Outputing details to log: '" << log_path << "'\n\n";
@@ -685,29 +685,29 @@ print_results(int argc, char **argv,
init_log(log, argc, argv);
log << "File\t"
- << "Retained Reads\t";
+ << "Retained Reads\t";
if (filter_illumina)
- log << "Illumina Filtered\t";
+ log << "Illumina Filtered\t";
if (filter_adapter)
- log << "Adapter Seq" << "\t";
+ log << "Adapter Seq" << "\t";
log << "Low Quality\t"
- << "Ambiguous Barcodes\t"
- << "Trimmed Reads\t"
- << "Orphaned paired-end reads\t"
- << "Total\n";
+ << "Ambiguous Barcodes\t"
+ << "Trimmed Reads\t"
+ << "Orphaned paired-end reads\t"
+ << "Total\n";
for (it = counters.begin(); it != counters.end(); it++) {
- log << it->first << "\t"
- << it->second["retained"] << "\t";
- if (filter_illumina)
- log << it->second["ill_filtered"] << "\t";
- if (filter_adapter)
- log << it->second["adapter"] << "\t";
- log << it->second["low_quality"] << "\t"
- << it->second["ambiguous"] << "\t"
- << it->second["trimmed"] << "\t"
- << it->second["orphaned"] << "\t"
- << it->second["total"] << "\n";
+ log << it->first << "\t"
+ << it->second["retained"] << "\t";
+ if (filter_illumina)
+ log << it->second["ill_filtered"] << "\t";
+ if (filter_adapter)
+ log << it->second["adapter"] << "\t";
+ log << it->second["low_quality"] << "\t"
+ << it->second["ambiguous"] << "\t"
+ << it->second["trimmed"] << "\t"
+ << it->second["orphaned"] << "\t"
+ << it->second["total"] << "\n";
}
map<string, long> c;
@@ -723,39 +723,39 @@ print_results(int argc, char **argv,
// Total up the individual counters
//
for (it = counters.begin(); it != counters.end(); it++) {
- c["total"] += it->second["total"];
- c["ill_filtered"] += it->second["ill_filtered"];
- c["adapter"] += it->second["adapter"];
- c["low_quality"] += it->second["low_quality"];
- c["ambiguous"] += it->second["ambiguous"];
- c["trimmed"] += it->second["trimmed"];
- c["orphaned"] += it->second["orphaned"];
- c["retained"] += it->second["retained"];
+ c["total"] += it->second["total"];
+ c["ill_filtered"] += it->second["ill_filtered"];
+ c["adapter"] += it->second["adapter"];
+ c["low_quality"] += it->second["low_quality"];
+ c["ambiguous"] += it->second["ambiguous"];
+ c["trimmed"] += it->second["trimmed"];
+ c["orphaned"] += it->second["orphaned"];
+ c["retained"] += it->second["retained"];
}
cerr << c["total"] << " total sequences;\n";
if (filter_illumina)
- cerr << " " << c["ill_filtered"] << " failed Illumina filtered reads;\n";
+ cerr << " " << c["ill_filtered"] << " failed Illumina filtered reads;\n";
if (filter_adapter)
- cerr << " " << c["adapter"] << " reads contained adapter sequence;\n";
+ cerr << " " << c["adapter"] << " reads contained adapter sequence;\n";
cerr << " " << c["ambiguous"] << " ambiguous barcode drops;\n"
- << " " << c["low_quality"] << " low quality read drops;\n"
- << " " << c["trimmed"] << " trimmed reads;\n"
- << " " << c["orphaned"] << " orphaned paired-end reads;\n"
- << c["retained"] << " retained reads.\n";
+ << " " << c["low_quality"] << " low quality read drops;\n"
+ << " " << c["trimmed"] << " trimmed reads;\n"
+ << " " << c["orphaned"] << " orphaned paired-end reads;\n"
+ << c["retained"] << " retained reads.\n";
- log << "\n"
- << "Total Sequences\t" << c["total"] << "\n";
+ log << "\n"
+ << "Total Sequences\t" << c["total"] << "\n";
if (filter_illumina)
- log << "Failed Illumina filtered reads\t" << c["ill_filtered"] << "\n";
+ log << "Failed Illumina filtered reads\t" << c["ill_filtered"] << "\n";
if (filter_adapter)
- log << "Reads containing adapter sequence\t" << c["adapter"] << "\n";
- log
- << "Ambiguous Barcodes\t" << c["ambiguous"] << "\n"
- << "Low Quality\t" << c["low_quality"] << "\n"
- << "Trimmed Reads\t" << c["trimmed"] << "\n"
- << "Orphaned Paired-ends\t" << c["orphaned"] << "\n"
- << "Retained Reads\t" << c["retained"] << "\n";
+ log << "Reads containing adapter sequence\t" << c["adapter"] << "\n";
+ log
+ << "Ambiguous Barcodes\t" << c["ambiguous"] << "\n"
+ << "Low Quality\t" << c["low_quality"] << "\n"
+ << "Trimmed Reads\t" << c["trimmed"] << "\n"
+ << "Orphaned Paired-ends\t" << c["orphaned"] << "\n"
+ << "Retained Reads\t" << c["retained"] << "\n";
if (max_bc_size_1 == 0) return 0;
@@ -764,40 +764,40 @@ print_results(int argc, char **argv,
//
bool bc_names = false;
for (uint i = 0; i < barcodes.size(); i++)
- if (barcodes[i].name_exists()) {
- bc_names = true;
- break;
- }
+ if (barcodes[i].name_exists()) {
+ bc_names = true;
+ break;
+ }
//
// Print out barcode information.
//
log << "\n"
- << "Barcode\t";
+ << "Barcode\t";
if (bc_names)
- log << "Filename\t";
+ log << "Filename\t";
log << "Total\t"
- << "Retained\n";
+ << "Retained\n";
set<BarcodePair> barcode_list;
for (uint i = 0; i < barcodes.size(); i++) {
- barcode_list.insert(barcodes[i]);
-
- log << barcodes[i] << "\t";
- if (bc_names)
- log << barcodes[i].name << "\t";
- if (barcode_log.count(barcodes[i]) == 0)
- log << "0\t" << "0\t" << "0\n";
- else
- log << barcode_log[barcodes[i]]["total"] << "\t"
- << barcode_log[barcodes[i]]["retained"] << "\n";
+ barcode_list.insert(barcodes[i]);
+
+ log << barcodes[i] << "\t";
+ if (bc_names)
+ log << barcodes[i].name << "\t";
+ if (barcode_log.count(barcodes[i]) == 0)
+ log << "0\t" << "0\t" << "0\n";
+ else
+ log << barcode_log[barcodes[i]]["total"] << "\t"
+ << barcode_log[barcodes[i]]["retained"] << "\n";
}
log << "\n"
- << "Sequences not recorded\n"
- << "Barcode\t"
- << "Total\n";
+ << "Sequences not recorded\n"
+ << "Barcode\t"
+ << "Total\n";
//
// Sort unused barcodes by number of occurances.
@@ -805,15 +805,15 @@ print_results(int argc, char **argv,
map<BarcodePair, map<string, long> >::iterator bit;
vector<pair<BarcodePair, int> > bcs;
for (bit = barcode_log.begin(); bit != barcode_log.end(); bit++)
- bcs.push_back(make_pair(bit->first, bit->second["total"]));
+ bcs.push_back(make_pair(bit->first, bit->second["total"]));
sort(bcs.begin(), bcs.end(), compare_barcodes);
for (uint i = 0; i < bcs.size(); i++) {
- if (barcode_list.count(bcs[i].first)) continue;
- if (bcs[i].second == 0) continue;
+ if (barcode_list.count(bcs[i].first)) continue;
+ if (bcs[i].second == 0) continue;
- log << bcs[i].first << "\t"
- << bcs[i].second << "\n";
+ log << bcs[i].first << "\t"
+ << bcs[i].second << "\n";
}
log.close();
@@ -828,288 +828,288 @@ int compare_barcodes(pair<BarcodePair, int> a, pair<BarcodePair, int> b) {
int parse_command_line(int argc, char* argv[]) {
FileT ftype;
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
{"quality", no_argument, NULL, 'q'},
{"clean", no_argument, NULL, 'c'},
{"recover", no_argument, NULL, 'r'},
- {"discards", no_argument, NULL, 'D'},
- {"paired", no_argument, NULL, 'P'},
- {"interleaved", no_argument, NULL, 'I'},
- {"merge", no_argument, NULL, 'm'},
- {"mate-pair", no_argument, NULL, 'M'},
- {"no_overhang", no_argument, NULL, 'O'},
- {"filter_illumina", no_argument, NULL, 'F'},
- {"retain_header", no_argument, NULL, 'H'},
- {"no_read_trimming", no_argument, NULL, 'N'},
- {"null_index", no_argument, NULL, 'U'},
- {"index_null", no_argument, NULL, 'u'},
- {"inline_null", no_argument, NULL, 'V'},
- {"index_index", no_argument, NULL, 'W'},
- {"inline_inline", no_argument, NULL, 'x'},
- {"index_inline", no_argument, NULL, 'Y'},
- {"inline_index", no_argument, NULL, 'Z'},
- {"barcode_dist_1", required_argument, NULL, 'B'},
- {"barcode_dist_2", required_argument, NULL, 'C'},
- {"infile_type", required_argument, NULL, 'i'},
- {"outfile_type", required_argument, NULL, 'y'},
- {"file", required_argument, NULL, 'f'},
- {"file_p1", required_argument, NULL, '1'},
- {"file_p2", required_argument, NULL, '2'},
- {"path", required_argument, NULL, 'p'},
- {"outpath", required_argument, NULL, 'o'},
- {"truncate", required_argument, NULL, 't'},
- {"barcodes", required_argument, NULL, 'b'},
- {"window_size", required_argument, NULL, 'w'},
- {"score_limit", required_argument, NULL, 's'},
- {"encoding", required_argument, NULL, 'E'},
- {"len_limit", required_argument, NULL, 'L'},
- {"adapter_1", required_argument, NULL, 'A'},
- {"adapter_2", required_argument, NULL, 'G'},
- {"adapter_mm", required_argument, NULL, 'T'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hHvcqrINFuVWxYZOPmDi:y:f:o:t:B:C:b:1:2:p:s:w:E:L:A:G:T:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'i':
+ {"discards", no_argument, NULL, 'D'},
+ {"paired", no_argument, NULL, 'P'},
+ {"interleaved", no_argument, NULL, 'I'},
+ {"merge", no_argument, NULL, 'm'},
+ {"mate-pair", no_argument, NULL, 'M'},
+ {"no_overhang", no_argument, NULL, 'O'},
+ {"filter_illumina", no_argument, NULL, 'F'},
+ {"retain_header", no_argument, NULL, 'H'},
+ {"no_read_trimming", no_argument, NULL, 'N'},
+ {"null_index", no_argument, NULL, 'U'},
+ {"index_null", no_argument, NULL, 'u'},
+ {"inline_null", no_argument, NULL, 'V'},
+ {"index_index", no_argument, NULL, 'W'},
+ {"inline_inline", no_argument, NULL, 'x'},
+ {"index_inline", no_argument, NULL, 'Y'},
+ {"inline_index", no_argument, NULL, 'Z'},
+ {"barcode_dist_1", required_argument, NULL, 'B'},
+ {"barcode_dist_2", required_argument, NULL, 'C'},
+ {"infile_type", required_argument, NULL, 'i'},
+ {"outfile_type", required_argument, NULL, 'y'},
+ {"file", required_argument, NULL, 'f'},
+ {"file_p1", required_argument, NULL, '1'},
+ {"file_p2", required_argument, NULL, '2'},
+ {"path", required_argument, NULL, 'p'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"truncate", required_argument, NULL, 't'},
+ {"barcodes", required_argument, NULL, 'b'},
+ {"window_size", required_argument, NULL, 'w'},
+ {"score_limit", required_argument, NULL, 's'},
+ {"encoding", required_argument, NULL, 'E'},
+ {"len_limit", required_argument, NULL, 'L'},
+ {"adapter_1", required_argument, NULL, 'A'},
+ {"adapter_2", required_argument, NULL, 'G'},
+ {"adapter_mm", required_argument, NULL, 'T'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hHvcqrINFuVWxYZOPmDi:y:f:o:t:B:C:b:1:2:p:s:w:E:L:A:G:T:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'i':
if (strcasecmp(optarg, "bustard") == 0)
in_file_type = FileT::bustard;
- else if (strcasecmp(optarg, "bam") == 0)
+ else if (strcasecmp(optarg, "bam") == 0)
in_file_type = FileT::bam;
- else if (strcasecmp(optarg, "gzfastq") == 0)
+ else if (strcasecmp(optarg, "gzfastq") == 0)
in_file_type = FileT::gzfastq;
else
in_file_type = FileT::fastq;
- break;
- case 'y':
- if (strcasecmp(optarg, "fastq") == 0)
+ break;
+ case 'y':
+ if (strcasecmp(optarg, "fastq") == 0)
out_file_type = FileT::fastq;
- else if (strcasecmp(optarg, "gzfastq") == 0)
+ else if (strcasecmp(optarg, "gzfastq") == 0)
out_file_type = FileT::gzfastq;
- else if (strcasecmp(optarg, "fasta") == 0)
+ else if (strcasecmp(optarg, "fasta") == 0)
out_file_type = FileT::fasta;
- else if (strcasecmp(optarg, "gzfasta") == 0)
+ else if (strcasecmp(optarg, "gzfasta") == 0)
out_file_type = FileT::gzfasta;
- break;
- case 'E':
+ break;
+ case 'E':
if (strcasecmp(optarg, "phred64") == 0)
qual_offset = 64;
- else if (strcasecmp(optarg, "phred33") == 0)
- qual_offset = 33;
- break;
- case 'f':
- in_file = optarg;
- ftype = FileT::fastq;
- break;
- case 'p':
- in_path_1 = optarg;
- in_path_2 = in_path_1;
- ftype = FileT::fastq;
- break;
- case '1':
- paired = true;
- in_file_p1 = optarg;
- ftype = FileT::fastq;
- break;
- case '2':
- paired = true;
- in_file_p2 = optarg;
- ftype = FileT::fastq;
- break;
- case 'P':
- paired = true;
- break;
- case 'I':
- interleaved = true;
- break;
- case 'B':
- barcode_dist_1 = is_integer(optarg);
- break;
- case 'C':
- barcode_dist_2 = is_integer(optarg);
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'm':
- merge = true;
- break;
- case 'M':
- matepair = true;
- break;
- case 'D':
- discards = true;
- break;
- case 'q':
- quality = true;
- break;
- case 'c':
- clean = true;
- break;
- case 'r':
- recover = true;
- break;
- case 'O':
- overhang = false;
- break;
- case 'F':
- filter_illumina = true;
- break;
- case 'H':
- retain_header = true;
- break;
- case 'N':
- trim_reads = false;
- break;
- case 't':
- truncate_seq = is_integer(optarg);
- break;
- case 'b':
- barcode_file = optarg;
- if (barcode_type == null_null)
- barcode_type = inline_null;
- break;
- case 'U':
- barcode_type = null_index;
- break;
- case 'u':
- barcode_type = index_null;
- break;
- case 'V':
- barcode_type = inline_null;
- break;
- case 'W':
- barcode_type = index_index;
- break;
- case 'x':
- barcode_type = inline_inline;
- break;
- case 'Y':
- barcode_type = index_inline;
- break;
- case 'Z':
- barcode_type = inline_index;
- break;
- case 'A':
- adapter_1 = new char[strlen(optarg) + 1];
- strcpy(adapter_1, optarg);
- filter_adapter = true;
- break;
- case 'G':
- adapter_2 = new char[strlen(optarg) + 1];
- strcpy(adapter_2, optarg);
- filter_adapter = true;
- break;
- case 'T':
- distance = is_integer(optarg);
- break;
- case 'L':
- len_limit = is_integer(optarg);
- break;
- case 'w':
- win_size = is_double(optarg);
- break;
- case 's':
- score_limit = is_integer(optarg);
- break;
+ else if (strcasecmp(optarg, "phred33") == 0)
+ qual_offset = 33;
+ break;
+ case 'f':
+ in_file = optarg;
+ ftype = FileT::fastq;
+ break;
+ case 'p':
+ in_path_1 = optarg;
+ in_path_2 = in_path_1;
+ ftype = FileT::fastq;
+ break;
+ case '1':
+ paired = true;
+ in_file_p1 = optarg;
+ ftype = FileT::fastq;
+ break;
+ case '2':
+ paired = true;
+ in_file_p2 = optarg;
+ ftype = FileT::fastq;
+ break;
+ case 'P':
+ paired = true;
+ break;
+ case 'I':
+ interleaved = true;
+ break;
+ case 'B':
+ barcode_dist_1 = is_integer(optarg);
+ break;
+ case 'C':
+ barcode_dist_2 = is_integer(optarg);
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'm':
+ merge = true;
+ break;
+ case 'M':
+ matepair = true;
+ break;
+ case 'D':
+ discards = true;
+ break;
+ case 'q':
+ quality = true;
+ break;
+ case 'c':
+ clean = true;
+ break;
+ case 'r':
+ recover = true;
+ break;
+ case 'O':
+ overhang = false;
+ break;
+ case 'F':
+ filter_illumina = true;
+ break;
+ case 'H':
+ retain_header = true;
+ break;
+ case 'N':
+ trim_reads = false;
+ break;
+ case 't':
+ truncate_seq = is_integer(optarg);
+ break;
+ case 'b':
+ barcode_file = optarg;
+ if (barcode_type == null_null)
+ barcode_type = inline_null;
+ break;
+ case 'U':
+ barcode_type = null_index;
+ break;
+ case 'u':
+ barcode_type = index_null;
+ break;
+ case 'V':
+ barcode_type = inline_null;
+ break;
+ case 'W':
+ barcode_type = index_index;
+ break;
+ case 'x':
+ barcode_type = inline_inline;
+ break;
+ case 'Y':
+ barcode_type = index_inline;
+ break;
+ case 'Z':
+ barcode_type = inline_index;
+ break;
+ case 'A':
+ adapter_1 = new char[strlen(optarg) + 1];
+ strcpy(adapter_1, optarg);
+ filter_adapter = true;
+ break;
+ case 'G':
+ adapter_2 = new char[strlen(optarg) + 1];
+ strcpy(adapter_2, optarg);
+ filter_adapter = true;
+ break;
+ case 'T':
+ distance = is_integer(optarg);
+ break;
+ case 'L':
+ len_limit = is_integer(optarg);
+ break;
+ case 'w':
+ win_size = is_double(optarg);
+ break;
+ case 's':
+ score_limit = is_integer(optarg);
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
-
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (in_file.length() == 0 && in_path_1.length() == 0 && in_file_p1.length() == 0) {
- cerr << "You must specify an input file of a directory path to a set of input files.\n";
- help();
+ cerr << "You must specify an input file of a directory path to a set of input files.\n";
+ help();
}
if (in_file.length() > 0 && in_path_1.length() > 0) {
- cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
- help();
+ cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
+ help();
}
if (in_file.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
- cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
- help();
+ cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
+ help();
}
if (in_path_1.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
- cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
- help();
+ cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
+ help();
}
- if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
- in_path_1 += "/";
+ if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
+ in_path_1 += "/";
- if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
- in_path_2 += "/";
+ if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
+ in_path_2 += "/";
- if (out_path.length() == 0)
- out_path = ".";
+ if (out_path.length() == 0)
+ out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
if (barcode_file.length() == 0) {
- overhang = false;
- cerr << "No barcodes specified, files will not be demultiplexed.\n";
+ overhang = false;
+ cerr << "No barcodes specified, files will not be demultiplexed.\n";
}
if (barcode_file.length() > 0 && merge) {
- cerr << "You may specify a set of barcodes, or that all files should be merged, not both.\n";
- help();
+ cerr << "You may specify a set of barcodes, or that all files should be merged, not both.\n";
+ help();
}
if (in_file_type == FileT::unknown)
- in_file_type = ftype;
+ in_file_type = ftype;
if (in_file_type == FileT::bam && paired == true && interleaved == false) {
- cerr << "You may only specify a BAM input file for paired-end data if the read pairs are interleaved.\n";
- help();
+ cerr << "You may only specify a BAM input file for paired-end data if the read pairs are interleaved.\n";
+ help();
}
if (in_file_type == FileT::bam && (barcode_type != inline_null && barcode_type != inline_inline && barcode_type != null_null)) {
- cerr << "For BAM input files only inline or unbarcoded data can be processed.\n";
- help();
+ cerr << "For BAM input files only inline or unbarcoded data can be processed.\n";
+ help();
}
if (score_limit < 0 || score_limit > 40) {
- cerr << "Score limit must be between 0 and 40.\n";
- help();
+ cerr << "Score limit must be between 0 and 40.\n";
+ help();
}
if (win_size < 0 || win_size >= 1) {
- cerr << "Window size is a fraction between 0 and 1.\n";
- help();
+ cerr << "Window size is a fraction between 0 and 1.\n";
+ help();
}
if (recover && barcode_type != null_null) {
- if (barcode_type != index_null && barcode_type != inline_null && barcode_dist_2 < 0)
- barcode_dist_2 = barcode_dist_1;
+ if (barcode_type != index_null && barcode_type != inline_null && barcode_dist_2 < 0)
+ barcode_dist_2 = barcode_dist_1;
}
return 0;
@@ -1124,47 +1124,47 @@ void version() {
void help() {
std::cerr << "process_shortreads " << VERSION << "\n"
<< "process_shortreads [-f in_file | -p in_dir [-P] [-I] | -1 pair_1 -2 pair_2] -b barcode_file -o out_dir [-i type] [-y type] [-c] [-q] [-r] [-E encoding] [-t len] [-D] [-w size] [-s lim] [-h]\n"
- << " f: path to the input file if processing single-end seqeunces.\n"
- << " i: input file type, either 'bustard' for the Illumina BUSTARD format, 'bam', 'fastq' (default), or 'gzfastq' for gzipped FASTQ.\n"
- << " p: path to a directory of single-end Illumina files.\n"
- << " 1: first input file in a set of paired-end sequences.\n"
- << " 2: second input file in a set of paired-end sequences.\n"
- << " P: specify that input is paired (for use with '-p').\n"
- << " I: specify that the paired-end reads are interleaved in single files.\n"
- << " o: path to output the processed files.\n"
- << " y: output type, either 'fastq' or 'fasta' (default fastq).\n"
- << " b: a list of barcodes for this run.\n"
- << " c: clean data, remove any read with an uncalled base.\n"
- << " q: discard reads with low quality scores.\n"
- << " r: rescue barcodes.\n"
- << " t: truncate final read length to this value.\n"
- << " E: specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger) or 'phred64' (Illumina 1.3 - 1.5, default).\n"
- << " D: capture discarded reads to a file.\n"
- << " w: set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).\n"
- << " s: set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).\n"
- << " h: display this help messsage.\n\n"
- << " Barcode options:\n"
- << " --inline_null: barcode is inline with sequence, occurs only on single-end read (default).\n"
- << " --index_null: barcode is provded in FASTQ header (Illumina i5 or i7 read).\n"
- << " --null_index: barcode is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
- << " --inline_inline: barcode is inline with sequence, occurs on single and paired-end read.\n"
- << " --index_index: barcode is provded in FASTQ header (Illumina i5 and i7 reads).\n"
- << " --inline_index: barcode is inline with sequence on single-end read and occurs in FASTQ header (from either i5 or i7 read).\n"
- << " --index_inline: barcode occurs in FASTQ header (Illumina i5 or i7 read) and is inline with single-end sequence (for single-end data) on paired-end read (for paired-end data).\n\n"
- << " Adapter options:\n"
- << " --adapter_1 <sequence>: provide adaptor sequence that may occur on the first read for filtering.\n"
- << " --adapter_2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
- << " --adapter_mm <mismatches>: number of mismatches allowed in the adapter sequence.\n\n"
- << " Output options:\n"
- << " --retain_header: retain unmodified FASTQ headers in the output.\n"
- << " --merge: if no barcodes are specified, merge all input files into a single output file (or single pair of files).\n\n"
- << " Advanced options:\n"
- << " --no_read_trimming: do not trim low quality reads, just discard them.\n"
- << " --len_limit <limit>: when trimming sequences, specify the minimum length a sequence must be to keep it (default 31bp).\n"
- << " --filter_illumina: discard reads that have been marked by Illumina's chastity/purity filter as failing.\n"
- << " --barcode_dist: provide the distace between barcodes to allow for barcode rescue (default 2)\n"
- << " --mate-pair: raw reads are circularized mate-pair data, first read will be reverse complemented.\n"
- << " --no_overhang: data does not contain an overhang nucleotide between barcode and seqeunce.\n";
+ << " f: path to the input file if processing single-end seqeunces.\n"
+ << " i: input file type, either 'bustard' for the Illumina BUSTARD format, 'bam', 'fastq' (default), or 'gzfastq' for gzipped FASTQ.\n"
+ << " p: path to a directory of single-end Illumina files.\n"
+ << " 1: first input file in a set of paired-end sequences.\n"
+ << " 2: second input file in a set of paired-end sequences.\n"
+ << " P: specify that input is paired (for use with '-p').\n"
+ << " I: specify that the paired-end reads are interleaved in single files.\n"
+ << " o: path to output the processed files.\n"
+ << " y: output type, either 'fastq' or 'fasta' (default fastq).\n"
+ << " b: a list of barcodes for this run.\n"
+ << " c: clean data, remove any read with an uncalled base.\n"
+ << " q: discard reads with low quality scores.\n"
+ << " r: rescue barcodes.\n"
+ << " t: truncate final read length to this value.\n"
+ << " E: specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger) or 'phred64' (Illumina 1.3 - 1.5, default).\n"
+ << " D: capture discarded reads to a file.\n"
+ << " w: set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).\n"
+ << " s: set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).\n"
+ << " h: display this help messsage.\n\n"
+ << " Barcode options:\n"
+ << " --inline_null: barcode is inline with sequence, occurs only on single-end read (default).\n"
+ << " --index_null: barcode is provded in FASTQ header (Illumina i5 or i7 read).\n"
+ << " --null_index: barcode is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
+ << " --inline_inline: barcode is inline with sequence, occurs on single and paired-end read.\n"
+ << " --index_index: barcode is provded in FASTQ header (Illumina i5 and i7 reads).\n"
+ << " --inline_index: barcode is inline with sequence on single-end read and occurs in FASTQ header (from either i5 or i7 read).\n"
+ << " --index_inline: barcode occurs in FASTQ header (Illumina i5 or i7 read) and is inline with single-end sequence (for single-end data) on paired-end read (for paired-end data).\n\n"
+ << " Adapter options:\n"
+ << " --adapter_1 <sequence>: provide adaptor sequence that may occur on the first read for filtering.\n"
+ << " --adapter_2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
+ << " --adapter_mm <mismatches>: number of mismatches allowed in the adapter sequence.\n\n"
+ << " Output options:\n"
+ << " --retain_header: retain unmodified FASTQ headers in the output.\n"
+ << " --merge: if no barcodes are specified, merge all input files into a single output file (or single pair of files).\n\n"
+ << " Advanced options:\n"
+ << " --no_read_trimming: do not trim low quality reads, just discard them.\n"
+ << " --len_limit <limit>: when trimming sequences, specify the minimum length a sequence must be to keep it (default 31bp).\n"
+ << " --filter_illumina: discard reads that have been marked by Illumina's chastity/purity filter as failing.\n"
+ << " --barcode_dist: provide the distace between barcodes to allow for barcode rescue (default 2)\n"
+ << " --mate-pair: raw reads are circularized mate-pair data, first read will be reverse complemented.\n"
+ << " --no_overhang: data does not contain an overhang nucleotide between barcode and seqeunce.\n";
exit(0);
}
diff --git a/src/process_shortreads.h b/src/process_shortreads.h
index fcc3f4e..a128b71 100644
--- a/src/process_shortreads.h
+++ b/src/process_shortreads.h
@@ -47,7 +47,7 @@ using std::set;
#include <utility>
using std::pair;
-#include "constants.h"
+#include "constants.h"
#include "clean.h"
#include "file_io.h"
#include "utils.h"
@@ -62,21 +62,21 @@ void help( void );
void version( void );
int parse_command_line(int, char **);
template<typename fhType>
-int process_reads(string,
- set<string> &, set<string> &,
- map<BarcodePair, fhType *> &,
- map<string, long> &, map<BarcodePair, map<string, long> > &);
+int process_reads(string,
+ set<string> &, set<string> &,
+ map<BarcodePair, fhType *> &,
+ map<string, long> &, map<BarcodePair, map<string, long> > &);
template<typename fhType>
-int process_paired_reads(string, string,
- set<string> &, set<string> &,
- map<BarcodePair, fhType *> &,
- map<BarcodePair, fhType *> &,
- map<BarcodePair, fhType *> &,
- map<BarcodePair, fhType *> &,
- map<string, long> &, map<BarcodePair, map<string, long> > &);
-int process_singlet(Read *,
- bool,
- map<string, long> &, map<string, long> &);
+int process_paired_reads(string, string,
+ set<string> &, set<string> &,
+ map<BarcodePair, fhType *> &,
+ map<BarcodePair, fhType *> &,
+ map<BarcodePair, fhType *> &,
+ map<BarcodePair, fhType *> &,
+ map<string, long> &, map<BarcodePair, map<string, long> > &);
+int process_singlet(Read *,
+ bool,
+ map<string, long> &, map<string, long> &);
int dist(const char *, char *);
int print_results(int, char **, vector<BarcodePair> &, map<string, map<string, long> > &, map<BarcodePair, map<string, long> > &);
diff --git a/src/pstacks.cc b/src/pstacks.cc
index 11bb13b..56a2566 100644
--- a/src/pstacks.cc
+++ b/src/pstacks.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -33,6 +33,8 @@ FileT out_file_type;
string out_path;
int sql_id = 0;
int min_stack_cov = 3;
+double req_pct_aln = 0.85;
+bool keep_sec_alns = false;
int num_threads = 1;
//
@@ -57,7 +59,7 @@ int main (int argc, char* argv[]) {
case snp:
cerr << "SNP\n";
break;
- case fixed:
+ case fixed:
cerr << "Fixed\n";
break;
case bounded:
@@ -67,7 +69,7 @@ int main (int argc, char* argv[]) {
cerr << "Alpha significance level for model: " << alpha << "\n";
//
- // Set limits to call het or homozygote according to chi-square distribution with one
+ // Set limits to call het or homozygote according to chi-square distribution with one
// degree of freedom:
// http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value
//
@@ -92,13 +94,17 @@ int main (int argc, char* argv[]) {
omp_set_num_threads(num_threads);
#endif
- HashMap radtags;
- set<int> merge_map;
+ HashMap* radtags = new HashMap();
map<int, PStack *> unique;
- load_radtags(in_file, radtags);
+ load_radtags(in_file, *radtags);
- reduce_radtags(radtags, unique);
+ reduce_radtags(*radtags, unique);
+
+ for (auto& stack : *radtags)
+ for (Seq* read : stack.second)
+ delete read;
+ delete radtags;
//dump_stacks(unique);
@@ -148,7 +154,7 @@ int call_alleles(MergedStack *mtag, vector<DNANSeq *> &reads) {
// Check to make sure the nucleotide at the location of this SNP is
// of one of the two possible states the multinomial model called.
//
- if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
+ if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
allele += base;
else
break;
@@ -168,13 +174,13 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, PStack *> &unique,
//
map<int, MergedStack *>::iterator it;
vector<int> keys;
- for (it = merged.begin(); it != merged.end(); it++)
+ for (it = merged.begin(); it != merged.end(); it++)
keys.push_back(it->first);
int i;
#pragma omp parallel private(i)
- {
- #pragma omp for schedule(dynamic)
+ {
+ #pragma omp for schedule(dynamic)
for (i = 0; i < (int) keys.size(); i++) {
MergedStack *mtag;
PStack *utag;
@@ -184,7 +190,7 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, PStack *> &unique,
//
// Create a two-dimensional array, each row containing one read. For
// each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
+ // that tag into our array as many times as it originally occurred.
//
vector<int>::iterator j;
vector<DNANSeq *> reads;
@@ -227,7 +233,7 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, PStack *> &unique,
max = nuc.end();
for (n = nuc.begin(); n != nuc.end(); n++) {
- if (n->first == 'N')
+ if (n->first == 'N')
continue;
if (max == nuc.end() || n->second > max->second)
max = n;
@@ -306,7 +312,7 @@ double calc_coverage_distribution(map<int, PStack *> &unique, map<int, MergedSta
if (depth < min_stack_cov)
continue;
- if (depth > max)
+ if (depth > max)
max = depth;
sum += depth;
@@ -378,7 +384,7 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
string snp_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".snps.tsv";
string all_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".alleles.tsv";
string mod_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".models.tsv";
-
+
if (gzip) {
tag_file += ".gz";
snp_file += ".gz";
@@ -459,7 +465,7 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
time(&rawtime);
timeinfo = localtime(&rawtime);
strftime(date, 32, "%F %T", timeinfo);
- log << "# pstacks version " << VERSION << "; generated on " << date << "\n";
+ log << "# pstacks version " << VERSION << "; generated on " << date << "\n";
if (gzip) {
gzputs(gz_tags, log.str().c_str());
gzputs(gz_mods, log.str().c_str());
@@ -502,15 +508,15 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
if (tag_1->blacklisted) blacklisted++;
// First write the consensus sequence
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
<< tag_1->loc.chr << "\t"
<< tag_1->loc.bp << "\t"
<< (tag_1->loc.strand == strand_plus ? "+" : "-") << "\t"
- << "consensus\t" << "\t\t"
- << tag_1->con << "\t"
- << tag_1->deleveraged << "\t"
+ << "consensus\t" << "\t\t"
+ << tag_1->con << "\t"
+ << tag_1->deleveraged << "\t"
<< tag_1->blacklisted << "\t"
<< tag_1->lumberjackstack << "\t"
<< tag_1->lnl << "\n";
@@ -518,9 +524,9 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
//
// Write a sequence recording the output of the SNP model for each nucleotide.
//
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
<< "\t"
<< "\t"
<< "\t"
@@ -544,7 +550,7 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
<< "\t"
<< "\t"
<< "\n";
-
+
if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
if (gzip) gzputs(gz_mods, sstr.str().c_str()); else mods << sstr.str();
sstr.str("");
@@ -568,9 +574,9 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
// Write out the model calls for each nucleotide in this locus.
//
for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
<< (*s)->col << "\t";
switch((*s)->type) {
@@ -586,8 +592,8 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
}
sstr << std::fixed << std::setprecision(2)
- << (*s)->lratio << "\t"
- << (*s)->rank_1 << "\t"
+ << (*s)->lratio << "\t"
+ << (*s)->rank_1 << "\t"
<< (*s)->rank_2 << "\t\t\n";
}
@@ -600,11 +606,11 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
char pct[id_len];
for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
sprintf(pct, "%.2f", ((t->second/total) * 100));
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- << t->first << "\t"
- << pct << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ << t->first << "\t"
+ << pct << "\t"
<< t->second << "\n";
}
if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
@@ -643,9 +649,9 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
// Create a map of each unique Stack that has been aligned to the same genomic location.
//
for (i = unique.begin(); i != unique.end(); i++) {
- snprintf(id, id_len - 1, "%s|%d|%s",
- i->second->loc.chr,
- i->second->loc.bp,
+ snprintf(id, id_len - 1, "%s|%d|%s",
+ i->second->loc.chr,
+ i->second->loc.bp,
i->second->loc.strand == strand_plus ? "+" : "-");
locations[id].insert(i->second->id);
}
@@ -656,7 +662,7 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
m = new MergedStack;
m->id = global_id;
- //
+ //
// Record the consensus and physical location for this stack.
//
s = k->second.begin();
@@ -694,7 +700,7 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
int reduce_radtags(HashMap &radtags, map<int, PStack *> &unique) {
HashMap::iterator it;
vector<Seq *>::iterator sit;
-
+
PStack *u;
int global_id = 1;
@@ -715,7 +721,7 @@ int reduce_radtags(HashMap &radtags, map<int, PStack *> &unique) {
u = new PStack;
u->id = global_id;
u->count = lit->second;
- u->add_seq(it->first);
+ u->add_seq(&it->first);
//
// Record the physical location of this stack.
@@ -733,7 +739,7 @@ int reduce_radtags(HashMap &radtags, map<int, PStack *> &unique) {
}
cerr << " " << radtags.size() << " unique stacks were aligned to " << unique.size() << " genomic locations.\n";
-
+
return 0;
}
@@ -743,7 +749,7 @@ int reduce_radtags(HashMap &radtags, map<int, PStack *> &unique) {
//
int load_radtags(string in_file, HashMap &radtags) {
Input *fh = NULL;
- Seq *c;
+ Seq c;
if (in_file_type == FileT::bowtie)
fh = new Bowtie(in_file.c_str());
@@ -756,23 +762,64 @@ int load_radtags(string in_file, HashMap &radtags) {
cerr << "Parsing " << in_file.c_str() << "\n";
- int i = 1;
- while ((c = fh->next_seq()) != NULL) {
- if (i % 10000 == 0) cerr << "Loading aligned sequence " << i << " \r";
- // cerr << "Loading aligned sequence " << i << " \n";
+ int secondary = 0;
+ int supplementary = 0;
+ int below_req_aln = 0;
+ int i = 0;
+ cerr << "Loading aligned sequences...";
+ while ((fh->next_seq(c)) != 0) {
+ if (i % 1000000 == 0 && i>0)
+ cerr << i/1000000 << "M...";
- radtags[c->seq].push_back(c);
i++;
+
+ switch (c.aln_type) {
+ case sec_aln:
+ secondary++;
+ if (!keep_sec_alns)
+ continue;
+ break;
+ case sup_aln:
+ supplementary++;
+ continue;
+ break;
+ case pri_aln:
+ default:
+ break;
+ }
+
+ if (c.pct_aln < req_pct_aln) {
+ below_req_aln++;
+ continue;
+ }
+
+ HashMap::iterator element = radtags.insert({DNANSeq(strlen(c.seq), c.seq), vector<Seq*>()}).first;
+ element->second.push_back(new Seq(c));
+ Seq& the_seq = *element->second.back();
+ if (the_seq.seq != NULL) {
+ delete[] the_seq.seq;
+ the_seq.seq = NULL;
+ }
+ if (the_seq.qual != NULL) {
+ delete[] the_seq.qual;
+ the_seq.qual = NULL;
+ }
}
+ cerr << "done\n";
if (i == 0) {
cerr << "Error: Unable to load data from '" << in_file.c_str() << "'.\n";
- exit(1);
+ exit(-1);
}
- cerr << " " <<
- "Analyzed " << i - 1 << " sequence reads; " <<
- "Identified " << radtags.size() << " unique stacks from those reads.\n";
+ cerr << "Loaded " << i << " sequence reads; "
+ << "identified " << radtags.size() << " unique stacks from those reads.\n"
+ << " Discarded " << below_req_aln << " reads where the aligned percentage of the read was too low.\n";
+ if (keep_sec_alns)
+ cerr << " Kept " << secondary << " secondarily aligned reads (reads may be present in the data set more than once).\n";
+ else
+ cerr << " Discarded " << secondary << " secondarily aligned reads (primary alignments were retained).\n";
+ cerr << " Discarded " << supplementary << " supplementary aligned (chimeric) reads.\n";
//
// Close the file and delete the Input object.
@@ -792,7 +839,7 @@ int dump_stacks(map<int, PStack *> &u) {
cerr << "Stack ID: " << (*it).second->id << "\n"
<< " Seq: " << (*it).second->seq->seq() << "\n"
- << " IDs: ";
+ << " IDs: ";
for (fit = (*it).second->map.begin(); fit != (*it).second->map.end(); fit++)
cerr << *fit << " ";
@@ -814,9 +861,9 @@ int dump_merged_stacks(map<int, MergedStack *> &m) {
<< " Consensus: ";
if (it->second->con != NULL)
cerr << it->second->con << "\n";
- else
+ else
cerr << "\n";
- cerr << " IDs: ";
+ cerr << " IDs: ";
for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
cerr << (*fit) << " ";
@@ -846,6 +893,8 @@ int parse_command_line(int argc, char* argv[]) {
{"outpath", required_argument, NULL, 'o'},
{"id", required_argument, NULL, 'i'},
{"min_cov", required_argument, NULL, 'm'},
+ {"pct_aln", required_argument, NULL, 'a'},
+ {"keep_sec_aln", required_argument, NULL, 'k'},
{"num_threads", required_argument, NULL, 'p'},
{"bc_err_freq", required_argument, NULL, 'e'},
{"model_type", required_argument, NULL, 'T'},
@@ -858,7 +907,7 @@ int parse_command_line(int argc, char* argv[]) {
// getopt_long stores the option index here.
int option_index = 0;
- c = getopt_long(argc, argv, "hvOT:A:L:U:f:o:i:e:p:m:s:f:t:y:", long_options, &option_index);
+ c = getopt_long(argc, argv, "hkvOT:a:A:L:U:f:o:i:e:p:m:s:f:t:y:", long_options, &option_index);
// Detect the end of the options.
if (c == -1)
@@ -902,6 +951,19 @@ int parse_command_line(int argc, char* argv[]) {
case 'm':
min_stack_cov = atoi(optarg);
break;
+ case 'a':
+ req_pct_aln = is_double(optarg);
+ if (req_pct_aln > 1)
+ req_pct_aln = req_pct_aln / 100;
+
+ if (req_pct_aln < 0 || req_pct_aln > 1.0) {
+ cerr << "Unable to parse the required alignment percentage.\n";
+ help();
+ }
+ break;
+ case 'k':
+ keep_sec_alns = true;
+ break;
case 'e':
barcode_err_freq = atof(optarg);
break;
@@ -935,7 +997,7 @@ int parse_command_line(int argc, char* argv[]) {
// getopt_long already printed an error message.
help();
break;
-
+
default:
cerr << "Unknown command line option '" << (char) c << "'\n";
help();
@@ -967,10 +1029,10 @@ int parse_command_line(int argc, char* argv[]) {
help();
}
- if (out_path.length() == 0)
+ if (out_path.length() == 0)
out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
+ if (out_path.at(out_path.length() - 1) != '/')
out_path += "/";
if (model_type == fixed && barcode_err_freq == 0) {
@@ -994,10 +1056,12 @@ void help() {
<< " f: input file path.\n"
<< " o: output path to write results.\n"
<< " i: SQL ID to insert into the output to identify this sample.\n"
- << " m: minimum depth of coverage to report a stack (default 1).\n"
+ << " m: minimum depth of coverage to report a stack (default 3).\n"
<< " p: enable parallel execution with num_threads threads.\n"
<< " h: display this help messsage.\n"
- << " Model options:\n"
+ << " --pct_aln <num>: require read alignments to use at least this percentage of the read (default 85%).\n"
+ << " --keep_sec_alns: keep secondary alignments (default: false, only keep primary alignments).\n"
+ << " Model options:\n"
<< " --model_type <type>: either 'snp' (default), 'bounded', or 'fixed'\n"
<< " For the SNP or Bounded SNP model:\n"
<< " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
diff --git a/src/pstacks.h b/src/pstacks.h
index 1fe1803..3a10cb2 100644
--- a/src/pstacks.h
+++ b/src/pstacks.h
@@ -52,7 +52,8 @@ using std::set;
#include <utility>
using std::pair;
-#include "constants.h"
+#include "config.h"
+#include "constants.h"
#include "stacks.h" // Major data structures for holding stacks
#include "mstack.h"
#include "kmers.h"
@@ -67,9 +68,9 @@ using std::pair;
const int barcode_size = 5;
#ifdef HAVE_SPARSEHASH
-typedef sparse_hash_map<const char *, vector<Seq *>, hash_charptr, eqstr> HashMap;
+typedef sparse_hash_map<DNANSeq, vector<Seq*> > HashMap;
#else
-typedef unordered_map<const char *, vector<Seq *>, hash_charptr, eqstr> HashMap;
+typedef unordered_map<DNANSeq, vector<Seq*> > HashMap;
#endif
void help( void );
diff --git a/src/rxstacks.cc b/src/rxstacks.cc
index b800df1..87c71a4 100644
--- a/src/rxstacks.cc
+++ b/src/rxstacks.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2013-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2013-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -62,27 +62,27 @@ int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
cerr
- << "Log liklihood filtering: " << (filter_lnl == true ? "on" : "off") << "; threshold: " << lnl_limit << "\n"
- << "Prune haplotypes: " << (prune_haplotypes == true ? "yes" : "no") << "\n"
- << "Filter confounded loci: " << (filter_confounded == true ? "yes" : "no") << "\n";
+ << "Log likelihood filtering: " << (filter_lnl == true ? "on" : "off") << "; threshold: " << lnl_limit << "\n"
+ << "Prune haplotypes: " << (prune_haplotypes == true ? "yes" : "no") << "\n"
+ << "Filter confounded loci: " << (filter_confounded == true ? "yes" : "no") << "\n";
//
- // Set limits to call het or homozygote according to chi-square distribution with one
+ // Set limits to call het or homozygote according to chi-square distribution with one
// degree of freedom:
// http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value
//
if (alpha == 0.1) {
- heterozygote_limit = -2.71;
- homozygote_limit = 2.71;
+ heterozygote_limit = -2.71;
+ homozygote_limit = 2.71;
} else if (alpha == 0.05) {
- heterozygote_limit = -3.84;
- homozygote_limit = 3.84;
+ heterozygote_limit = -3.84;
+ homozygote_limit = 3.84;
} else if (alpha == 0.01) {
- heterozygote_limit = -6.64;
- homozygote_limit = 6.64;
+ heterozygote_limit = -6.64;
+ homozygote_limit = 6.64;
} else if (alpha == 0.001) {
- heterozygote_limit = -10.83;
- homozygote_limit = 10.83;
+ heterozygote_limit = -10.83;
+ homozygote_limit = 10.83;
}
//
@@ -115,14 +115,14 @@ int main (int argc, char* argv[]) {
int res;
catalog_file << in_path << "batch_" << batch_id << ".catalog";
if ((res = load_loci(catalog_file.str(), catalog, false, false, compressed)) == 0) {
- cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
- return 0;
+ cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
+ return 0;
}
in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
//
- // Let's fill in the SNP model calls to include both hets and homozygotes to
+ // Let's fill in the SNP model calls to include both hets and homozygotes to
// make it easier to iterate over them later.
//
fill_catalog_snps(catalog);
@@ -167,7 +167,7 @@ int main (int argc, char* argv[]) {
//
// Create the population map
- //
+ //
cerr << "Populating observed haplotypes for " << mpopi.samples().size() << " samples, " << catalog.size() << " loci.\n";
PopMap<CSLocus> *pmap = new PopMap<CSLocus>(mpopi, catalog.size());
pmap->populate(catalog, catalog_matches);
@@ -190,62 +190,62 @@ int main (int argc, char* argv[]) {
for (uint i = 0; i < mpopi.samples().size(); i++) {
const Sample& sample = mpopi.samples()[i];
- cerr << "Loading stacks from sample " << sample.name << " [" << i+1 << " of " << mpopi.samples().size() << "]...\n";
-
- //////
- //////
- ////// if (sample_id != 176) continue;
- //////
- //////
-
- map<int, Locus *> stacks;
- int res;
- if ((res = load_loci(in_path + sample.name, stacks, true, true, compressed)) == 0) {
- cerr << "Unable to load sample file '" << sample.name << "'\n";
- continue;
- }
-
- cerr << "Making corrections to sample " << sample.name << "...";
-
- set<pair<int, int> > uniq_matches;
- set<pair<int, int> >::iterator it;
- vector<pair<int, int> > matches;
-
- //
- // There are multiple matches per stack, but we only need to process
- // each stack once to make corrections.
- //
- for (uint j = 0; j < catalog_matches[i].size(); j++) {
- catalog_id = catalog_matches[i][j]->cat_id;
- tag_id = catalog_matches[i][j]->tag_id;
-
- uniq_matches.insert(make_pair(catalog_id, tag_id));
- }
-
- //
- // Put the catalog/tag ID pairs into a vector for parallel processing.
- //
- for (it = uniq_matches.begin(); it != uniq_matches.end(); it++)
- matches.push_back(*it);
-
- unsigned long int nuc_cnt = 0;
- unsigned long int unk_hom_cnt = 0;
- unsigned long int unk_het_cnt = 0;
- unsigned long int het_unk_cnt = 0;
- unsigned long int hom_unk_cnt = 0;
- unsigned long int het_hom_cnt = 0;
- unsigned long int hom_het_cnt = 0;
- unsigned long int conf_loci_cnt = 0;
- unsigned long int pruned_hap_cnt = 0;
- unsigned long int pruned_mst_hap_cnt = 0;
- unsigned long int blacklist_cnt = 0;
- unsigned long int lnl_cnt = 0;
+ cerr << "Loading stacks from sample " << sample.name << " [" << i+1 << " of " << mpopi.samples().size() << "]...\n";
+
+ //////
+ //////
+ ////// if (sample_id != 176) continue;
+ //////
+ //////
+
+ map<int, Locus *> stacks;
+ int res;
+ if ((res = load_loci(in_path + sample.name, stacks, true, true, compressed)) == 0) {
+ cerr << "Unable to load sample file '" << sample.name << "'\n";
+ continue;
+ }
+
+ cerr << "Making corrections to sample " << sample.name << "...";
+
+ set<pair<int, int> > uniq_matches;
+ set<pair<int, int> >::iterator it;
+ vector<pair<int, int> > matches;
+
+ //
+ // There are multiple matches per stack, but we only need to process
+ // each stack once to make corrections.
+ //
+ for (uint j = 0; j < catalog_matches[i].size(); j++) {
+ catalog_id = catalog_matches[i][j]->cat_id;
+ tag_id = catalog_matches[i][j]->tag_id;
+
+ uniq_matches.insert(make_pair(catalog_id, tag_id));
+ }
+
+ //
+ // Put the catalog/tag ID pairs into a vector for parallel processing.
+ //
+ for (it = uniq_matches.begin(); it != uniq_matches.end(); it++)
+ matches.push_back(*it);
+
+ unsigned long int nuc_cnt = 0;
+ unsigned long int unk_hom_cnt = 0;
+ unsigned long int unk_het_cnt = 0;
+ unsigned long int het_unk_cnt = 0;
+ unsigned long int hom_unk_cnt = 0;
+ unsigned long int het_hom_cnt = 0;
+ unsigned long int hom_het_cnt = 0;
+ unsigned long int conf_loci_cnt = 0;
+ unsigned long int pruned_hap_cnt = 0;
+ unsigned long int pruned_mst_hap_cnt = 0;
+ unsigned long int blacklist_cnt = 0;
+ unsigned long int lnl_cnt = 0;
#pragma omp parallel private(catalog_id, tag_id)
- {
- Datum *d;
- Locus *loc;
- CSLocus *cloc;
+ {
+ Datum *d;
+ Locus *loc;
+ CSLocus *cloc;
uint seq_len;
string seq, model;
char *adj_seq;
@@ -253,42 +253,42 @@ int main (int argc, char* argv[]) {
vector<char *> reads;
#pragma omp for schedule(dynamic, 1) reduction(+:nuc_cnt) reduction(+:unk_hom_cnt) reduction(+:unk_het_cnt) \
- reduction(+:hom_unk_cnt) reduction(+:het_unk_cnt) reduction(+:hom_het_cnt) reduction(+:het_hom_cnt) \
- reduction(+:conf_loci_cnt) reduction(+:pruned_hap_cnt) reduction(+:pruned_mst_hap_cnt) reduction(+:blacklist_cnt) reduction(+:lnl_cnt)
- for (uint j = 0; j < matches.size(); j++) {
- catalog_id = matches[j].first;
- tag_id = matches[j].second;
-
- //if (tag_id == 10970) {
- // cerr << "Hit the tag.\n";
- //}
-
- //// if (catalog_id != 3080) continue;
-
- if (catalog.count(catalog_id) == 0) continue;
-
- cloc = catalog[catalog_id];
- loc = stacks[tag_id];
-
- if (filter_confounded &&
- ((double) cloc->confounded_cnt / (double) cloc->cnt > confounded_limit)) {
- // cerr << "Catalog locus " << cloc->id << " is confounded; confounded cnt: "
- // << cloc->confounded_cnt << "; total: " << cloc->cnt
- // << "; freq: " << (double) cloc->confounded_cnt / (double)cloc->cnt << "\n";
- loc->blacklisted = true;
- conf_loci_cnt++;
- continue;
- }
+ reduction(+:hom_unk_cnt) reduction(+:het_unk_cnt) reduction(+:hom_het_cnt) reduction(+:het_hom_cnt) \
+ reduction(+:conf_loci_cnt) reduction(+:pruned_hap_cnt) reduction(+:pruned_mst_hap_cnt) reduction(+:blacklist_cnt) reduction(+:lnl_cnt)
+ for (uint j = 0; j < matches.size(); j++) {
+ catalog_id = matches[j].first;
+ tag_id = matches[j].second;
+
+ //if (tag_id == 10970) {
+ // cerr << "Hit the tag.\n";
+ //}
+
+ //// if (catalog_id != 3080) continue;
+
+ if (catalog.count(catalog_id) == 0) continue;
+
+ cloc = catalog[catalog_id];
+ loc = stacks[tag_id];
+
+ if (filter_confounded &&
+ ((double) cloc->confounded_cnt / (double) cloc->cnt > confounded_limit)) {
+ // cerr << "Catalog locus " << cloc->id << " is confounded; confounded cnt: "
+ // << cloc->confounded_cnt << "; total: " << cloc->cnt
+ // << "; freq: " << (double) cloc->confounded_cnt / (double)cloc->cnt << "\n";
+ loc->blacklisted = true;
+ conf_loci_cnt++;
+ continue;
+ }
- d = pmap->datum(catalog_id, sample.id);
+ d = pmap->datum(catalog_id, sample.id);
- if (d == NULL) continue;
+ if (d == NULL) continue;
- if (filter_lnl && cloc->lnl < lnl_limit) {
- loc->blacklisted = true;
- lnl_cnt++;
- continue;
- }
+ if (filter_lnl && cloc->lnl < lnl_limit) {
+ loc->blacklisted = true;
+ lnl_cnt++;
+ continue;
+ }
//
// If this locus was matched to the catalog using a gapped alignment, adjust the sequence
@@ -311,23 +311,23 @@ int main (int argc, char* argv[]) {
}
}
- prune_nucleotides(cloc, loc, d, log_snp_fh,
- nuc_cnt,
- unk_hom_cnt, unk_het_cnt,
- hom_unk_cnt, het_unk_cnt,
- hom_het_cnt, het_hom_cnt);
+ prune_nucleotides(cloc, loc, d, log_snp_fh,
+ nuc_cnt,
+ unk_hom_cnt, unk_het_cnt,
+ hom_unk_cnt, het_unk_cnt,
+ hom_het_cnt, het_hom_cnt);
- //
- // Prune haplotypes from this locus.
- //
- if (prune_haplotypes) {
- prune_mst_haplotypes(cloc, d, loc, pruned_mst_hap_cnt, log_hap_fh);
+ //
+ // Prune haplotypes from this locus.
+ //
+ if (prune_haplotypes) {
+ prune_mst_haplotypes(cloc, d, loc, pruned_mst_hap_cnt, log_hap_fh);
- prune_locus_haplotypes(cloc, d, loc, pruned_hap_cnt, log_hap_fh);
+ prune_locus_haplotypes(cloc, d, loc, pruned_hap_cnt, log_hap_fh);
- if (loc->blacklisted)
- blacklist_cnt++;
- }
+ if (loc->blacklisted)
+ blacklist_cnt++;
+ }
//
// If this locus was matched to the catalog using a gapped alignment, de-adjust the sequence
@@ -347,70 +347,70 @@ int main (int argc, char* argv[]) {
}
reads.clear();
}
- }
+ }
}
- cerr << "done.\n";
-
- unsigned long int total = unk_hom_cnt + unk_het_cnt + hom_unk_cnt + het_unk_cnt + hom_het_cnt + het_hom_cnt;
-
- cerr << "Total nucleotides processed: " << nuc_cnt << "\n"
- << " Total nucleotides converted: " << total << "\n"
- << " Converted from unknown to homozygous: " << unk_hom_cnt << " nucleotides.\n"
- << " Converted from unknown to heterozygous: " << unk_het_cnt << " nucleotides.\n"
- << " Converted from homozygous to unknown: " << hom_unk_cnt << " nucleotides.\n"
- << " Converted from heterozygous to unknown: " << het_unk_cnt << " nucleotides.\n"
- << " Converted from homozygous to heterozygous: " << hom_het_cnt << " nucleotides.\n"
- << " Converted from heterozygous to homozygous: " << het_hom_cnt << " nucleotides.\n"
- << "Pruned: " << pruned_mst_hap_cnt << " haplotypes using a tree method.\n"
- << "Pruned: " << pruned_hap_cnt << " haplotypes using a rare haplotype method.\n"
- << "Blacklisted: " << blacklist_cnt << " loci due to inability to call haplotypes.\n"
- << "Blacklisted: " << lnl_cnt << " loci due to log likelihoods below threshold.\n"
- << "Blacklisted: " << conf_loci_cnt << " confounded loci.\n";
-
- log_fh << sample.name << "\t"
- << nuc_cnt << "\t"
- << total << "\t"
- << unk_hom_cnt << "\t"
- << unk_het_cnt << "\t"
- << hom_unk_cnt << "\t"
- << het_unk_cnt << "\t"
- << hom_het_cnt << "\t"
- << het_hom_cnt << "\t"
- << blacklist_cnt << "\t"
- << conf_loci_cnt << "\t"
- << lnl_cnt << "\t"
- << pruned_hap_cnt << "\t"
- << pruned_mst_hap_cnt << "\n";
-
- cerr << "Writing modified stacks, SNPs, alleles to '" << out_path << "'...";
-
- //
- // Rewrite stacks, model outputs, and haplotypes.
- //
- write_results(sample.name, stacks);
-
- //
- // Free up memory
- //
- cerr << "Freeing memory...";
- map<int, Locus *>::iterator stack_it;
- for (stack_it = stacks.begin(); stack_it != stacks.end(); stack_it++)
- delete stack_it->second;
- cerr << "done.\n";
+ cerr << "done.\n";
+
+ unsigned long int total = unk_hom_cnt + unk_het_cnt + hom_unk_cnt + het_unk_cnt + hom_het_cnt + het_hom_cnt;
+
+ cerr << "Total nucleotides processed: " << nuc_cnt << "\n"
+ << " Total nucleotides converted: " << total << "\n"
+ << " Converted from unknown to homozygous: " << unk_hom_cnt << " nucleotides.\n"
+ << " Converted from unknown to heterozygous: " << unk_het_cnt << " nucleotides.\n"
+ << " Converted from homozygous to unknown: " << hom_unk_cnt << " nucleotides.\n"
+ << " Converted from heterozygous to unknown: " << het_unk_cnt << " nucleotides.\n"
+ << " Converted from homozygous to heterozygous: " << hom_het_cnt << " nucleotides.\n"
+ << " Converted from heterozygous to homozygous: " << het_hom_cnt << " nucleotides.\n"
+ << "Pruned: " << pruned_mst_hap_cnt << " haplotypes using a tree method.\n"
+ << "Pruned: " << pruned_hap_cnt << " haplotypes using a rare haplotype method.\n"
+ << "Blacklisted: " << blacklist_cnt << " loci due to inability to call haplotypes.\n"
+ << "Blacklisted: " << lnl_cnt << " loci due to log likelihoods below threshold.\n"
+ << "Blacklisted: " << conf_loci_cnt << " confounded loci.\n";
+
+ log_fh << sample.name << "\t"
+ << nuc_cnt << "\t"
+ << total << "\t"
+ << unk_hom_cnt << "\t"
+ << unk_het_cnt << "\t"
+ << hom_unk_cnt << "\t"
+ << het_unk_cnt << "\t"
+ << hom_het_cnt << "\t"
+ << het_hom_cnt << "\t"
+ << blacklist_cnt << "\t"
+ << conf_loci_cnt << "\t"
+ << lnl_cnt << "\t"
+ << pruned_hap_cnt << "\t"
+ << pruned_mst_hap_cnt << "\n";
+
+ cerr << "Writing modified stacks, SNPs, alleles to '" << out_path << "'...";
+
+ //
+ // Rewrite stacks, model outputs, and haplotypes.
+ //
+ write_results(sample.name, stacks);
+
+ //
+ // Free up memory
+ //
+ cerr << "Freeing memory...";
+ map<int, Locus *>::iterator stack_it;
+ for (stack_it = stacks.begin(); stack_it != stacks.end(); stack_it++)
+ delete stack_it->second;
+ cerr << "done.\n";
}
log_fh.close();
if (verbose) {
- log_snp_fh.close();
- log_hap_fh.close();
+ log_snp_fh.close();
+ log_hap_fh.close();
}
return 0;
}
-int
+int
dist(string hap_1, string hap_2) {
int dist = 0;
const char *p = hap_1.c_str();
@@ -423,9 +423,9 @@ dist(string hap_1, string hap_2) {
// between the two sequences.
//
while (p < p_end && q < q_end) {
- dist += (*p == *q) ? 0 : 1;
- p++;
- q++;
+ dist += (*p == *q) ? 0 : 1;
+ p++;
+ q++;
}
return dist;
@@ -443,60 +443,60 @@ calc_lnl_means(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
ofstream log_fh;
if (lnl_dist) {
- stringstream log;
- log << "batch_" << batch_id << ".rxstacks_lnls.tsv";
- string log_path = out_path + log.str();
- log_fh.open(log_path.c_str(), ofstream::out);
-
- if (log_fh.fail()) {
- cerr << "Error opening log file '" << log_path << "'\n";
- exit(1);
- }
- log_fh << "# Catalog Locus\tMean\tMedian\n";
+ stringstream log;
+ log << "batch_" << batch_id << ".rxstacks_lnls.tsv";
+ string log_path = out_path + log.str();
+ log_fh.open(log_path.c_str(), ofstream::out);
+
+ if (log_fh.fail()) {
+ cerr << "Error opening log file '" << log_path << "'\n";
+ exit(1);
+ }
+ log_fh << "# Catalog Locus\tMean\tMedian\n";
}
tot = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- cloc = it->second;
+ cloc = it->second;
- d = pmap->locus(cloc->id);
- cnt = pmap->sample_cnt();
- mean = 0.0;
- lnls.clear();
+ d = pmap->locus(cloc->id);
+ cnt = pmap->sample_cnt();
+ mean = 0.0;
+ lnls.clear();
- for (uint i = 0; i < cnt; i++) {
- if (d[i] == NULL) continue;
+ for (uint i = 0; i < cnt; i++) {
+ if (d[i] == NULL) continue;
- lnls.push_back(d[i]->lnl);
- mean += d[i]->lnl;
- }
+ lnls.push_back(d[i]->lnl);
+ mean += d[i]->lnl;
+ }
- if (lnls.size() == 0) continue;
+ if (lnls.size() == 0) continue;
- sort(lnls.begin(), lnls.end());
+ sort(lnls.begin(), lnls.end());
- mid = lnls.size() / 2;
- median = lnls.size() % 2 == 0 ? lnls[mid] + lnls[mid+1] / 2.0 : lnls[mid+1];
- mean = mean / (double) lnls.size();
+ mid = lnls.size() / 2;
+ median = lnls.size() % 2 == 0 ? lnls[mid] + lnls[mid+1] / 2.0 : lnls[mid+1];
+ mean = mean / (double) lnls.size();
- cloc->lnl = mean;
+ cloc->lnl = mean;
- //
- // If the mean log likelihood for this catalog locus is below the threshold, count it as
- // its constituent components will be filtered as encountered later.
- //
- if (filter_lnl && cloc->lnl < lnl_limit)
- tot++;
+ //
+ // If the mean log likelihood for this catalog locus is below the threshold, count it as
+ // its constituent components will be filtered as encountered later.
+ //
+ if (filter_lnl && cloc->lnl < lnl_limit)
+ tot++;
- if (lnl_dist)
- log_fh << cloc->id << "\t"
- << mean << "\t"
- << median << "\n";
+ if (lnl_dist)
+ log_fh << cloc->id << "\t"
+ << mean << "\t"
+ << median << "\n";
}
if (lnl_dist)
- log_fh.close();
+ log_fh.close();
//
// Print number of catalog loci that are confounded and will be removed.
@@ -515,27 +515,27 @@ sum_haplotype_counts(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
uint cnt;
for (it = catalog.begin(); it != catalog.end(); it++) {
- cloc = it->second;
-
- d = pmap->locus(cloc->id);
- cnt = pmap->sample_cnt();
-
- for (uint i = 0; i < cnt; i++) {
- if (d[i] == NULL) continue;
-
- if (d[i]->obshap.size() == 1) {
- if (cloc->hap_cnts.count(d[i]->obshap[0]) == 0)
- cloc->hap_cnts[d[i]->obshap[0]] = 2;
- else
- cloc->hap_cnts[d[i]->obshap[0]] += 2;
- } else {
- for (uint j = 0; j < d[i]->obshap.size(); j++)
- if (cloc->hap_cnts.count(d[i]->obshap[j]) == 0)
- cloc->hap_cnts[d[i]->obshap[j]] = 1;
- else
- cloc->hap_cnts[d[i]->obshap[j]] += 1;
- }
- }
+ cloc = it->second;
+
+ d = pmap->locus(cloc->id);
+ cnt = pmap->sample_cnt();
+
+ for (uint i = 0; i < cnt; i++) {
+ if (d[i] == NULL) continue;
+
+ if (d[i]->obshap.size() == 1) {
+ if (cloc->hap_cnts.count(d[i]->obshap[0]) == 0)
+ cloc->hap_cnts[d[i]->obshap[0]] = 2;
+ else
+ cloc->hap_cnts[d[i]->obshap[0]] += 2;
+ } else {
+ for (uint j = 0; j < d[i]->obshap.size(); j++)
+ if (cloc->hap_cnts.count(d[i]->obshap[j]) == 0)
+ cloc->hap_cnts[d[i]->obshap[j]] = 1;
+ else
+ cloc->hap_cnts[d[i]->obshap[j]] += 1;
+ }
+ }
}
return 0;
@@ -557,8 +557,8 @@ prune_mst_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &pruned_
for (it = cloc->hap_cnts.begin(); it != cloc->hap_cnts.end(); it++) {
n = mst->add_node(it->first);
- haps.push_back(it->first);
- keys.push_back(n->id);
+ haps.push_back(it->first);
+ keys.push_back(n->id);
}
//
@@ -569,24 +569,24 @@ prune_mst_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &pruned_
uint snp_pos = 0;
for (uint i = 0; i < cloc->snps.size(); i++) {
- if (cloc->snps[i]->type != snp_type_het)
- continue;
-
- for (uint j = 0; j < haps.size(); j++) {
- for (uint k = j + 1; k < haps.size(); k++) {
- //
- // If these two haplotypes differ by this SNP (and only this SNP), connect them in the graph.
- //
- if (haps[j].at(snp_pos) != haps[k].at(snp_pos) &&
- dist(haps[j], haps[k]) == 1) {
- n_1 = mst->node(haps[j]);
- n_2 = mst->node(haps[k]);
- n_1->add_edge(n_2, 1);
- n_2->add_edge(n_1, 1);
- }
- }
- }
- snp_pos++;
+ if (cloc->snps[i]->type != snp_type_het)
+ continue;
+
+ for (uint j = 0; j < haps.size(); j++) {
+ for (uint k = j + 1; k < haps.size(); k++) {
+ //
+ // If these two haplotypes differ by this SNP (and only this SNP), connect them in the graph.
+ //
+ if (haps[j].at(snp_pos) != haps[k].at(snp_pos) &&
+ dist(haps[j], haps[k]) == 1) {
+ n_1 = mst->node(haps[j]);
+ n_2 = mst->node(haps[k]);
+ n_1->add_edge(n_2, 1);
+ n_2->add_edge(n_1, 1);
+ }
+ }
+ }
+ snp_pos++;
}
//
@@ -600,7 +600,7 @@ prune_mst_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &pruned_
vector<pair<string, double> > haplotypes;
for (uint i = 0; i < d->obshap.size(); i++)
- haplotypes.push_back(make_pair(string(d->obshap[i]), (double) d->depth[i]));
+ haplotypes.push_back(make_pair(string(d->obshap[i]), (double) d->depth[i]));
uint size = haplotypes.size();
@@ -610,7 +610,7 @@ prune_mst_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &pruned_
sort(haplotypes.begin(), haplotypes.end(), compare_pair_haplotype_rev);
if (size <= 2)
- return 0;
+ return 0;
//
// Pull out the two most frequently occuring haplotypes.
@@ -622,12 +622,12 @@ prune_mst_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &pruned_
haplotypes.pop_back();
if (haplotypes[size - 2].second > haplotypes[size - 3].second) {
- hap_2 = haplotypes[size - 2].first;
- hap_2_depth = haplotypes[size - 2].second;
- haplotypes.pop_back();
+ hap_2 = haplotypes[size - 2].first;
+ hap_2_depth = haplotypes[size - 2].second;
+ haplotypes.pop_back();
} else {
- hap_2 = "";
- hap_2_depth = 0.0;
+ hap_2 = "";
+ hap_2_depth = 0.0;
}
//
@@ -640,82 +640,82 @@ prune_mst_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &pruned_
for (uint i = 0; i < haplotypes.size(); i++) {
- //
- // Find the current haplotype in the MST.
- //
- n_1 = mst->node(haplotypes[i].first);
-
- max = 0.0;
- hap = "";
- weighted = 0.0;
- //
- // Check any potential edges in the graph for merging.
- //
- for (uint j = 0; j < n_1->edges.size(); j++) {
- label = n_1->edges[j]->child->label;
-
- if (label == hap_1) {
- weighted = (double) cloc->hap_cnts[label] * log(hap_1_depth);
- // cerr << "Cloc hap: " << label << "; popcnt: " << cloc->hap_cnts[label] << "; hap depth: " << hap_1_depth << "; weighted: " << weighted << "\n";
- } else if (label == hap_2) {
- weighted = (double) cloc->hap_cnts[label] * log(hap_2_depth);
- // cerr << "Cloc hap: " << label << "; popcnt: " << cloc->hap_cnts[label] << "; hap depth: " << hap_2_depth << "; weighted: " << weighted << "\n";
- } else
- continue;
-
- if (weighted == max) {
- //
- // There is more than one identical possibility, we can do no more.
- //
- hap = "";
- break;
-
- } else if (weighted > max) {
- max = weighted;
- hap = label;
- }
- }
-
- if (hap.length() == 0)
- continue;
-
- src_hap = convert_catalog_haplotype_to_sample(haplotypes[i].first, cloc, loc);
- dest_hap = convert_catalog_haplotype_to_sample(hap, cloc, loc);
-
- if (verbose) {
- #pragma omp critical
- log_fh << cloc->id << "\t"
- << loc->sample_id << "\t"
- << loc->id << "\t"
- << src_hap << "\t"
- << haplotypes[i].first << "\t"
- << dest_hap << "\t"
- << hap << "\t"
- << "mst" << "\n";
- }
- pruned_hap_cnt++;
-
- //
- // Remove the haplotype.
- //
- it = loc->alleles.find(src_hap);
-
- if (it != loc->alleles.end()) {
- loc->alleles.erase(it);
- }
-
- //
- // Add to the count of the merged-to haplotype.
- //
- if (loc->alleles.count(dest_hap) > 0) {
- loc->alleles[dest_hap]++;
- } else {
- cerr << "Error finding allele\n";
- }
+ //
+ // Find the current haplotype in the MST.
+ //
+ n_1 = mst->node(haplotypes[i].first);
+
+ max = 0.0;
+ hap = "";
+ weighted = 0.0;
+ //
+ // Check any potential edges in the graph for merging.
+ //
+ for (uint j = 0; j < n_1->edges.size(); j++) {
+ label = n_1->edges[j]->child->label;
+
+ if (label == hap_1) {
+ weighted = (double) cloc->hap_cnts[label] * log(hap_1_depth);
+ // cerr << "Cloc hap: " << label << "; popcnt: " << cloc->hap_cnts[label] << "; hap depth: " << hap_1_depth << "; weighted: " << weighted << "\n";
+ } else if (label == hap_2) {
+ weighted = (double) cloc->hap_cnts[label] * log(hap_2_depth);
+ // cerr << "Cloc hap: " << label << "; popcnt: " << cloc->hap_cnts[label] << "; hap depth: " << hap_2_depth << "; weighted: " << weighted << "\n";
+ } else
+ continue;
+
+ if (weighted == max) {
+ //
+ // There is more than one identical possibility, we can do no more.
+ //
+ hap = "";
+ break;
+
+ } else if (weighted > max) {
+ max = weighted;
+ hap = label;
+ }
+ }
+
+ if (hap.length() == 0)
+ continue;
+
+ src_hap = convert_catalog_haplotype_to_sample(haplotypes[i].first, cloc, loc);
+ dest_hap = convert_catalog_haplotype_to_sample(hap, cloc, loc);
+
+ if (verbose) {
+ #pragma omp critical
+ log_fh << cloc->id << "\t"
+ << loc->sample_id << "\t"
+ << loc->id << "\t"
+ << src_hap << "\t"
+ << haplotypes[i].first << "\t"
+ << dest_hap << "\t"
+ << hap << "\t"
+ << "mst" << "\n";
+ }
+ pruned_hap_cnt++;
+
+ //
+ // Remove the haplotype.
+ //
+ it = loc->alleles.find(src_hap);
+
+ if (it != loc->alleles.end()) {
+ loc->alleles.erase(it);
+ }
+
+ //
+ // Add to the count of the merged-to haplotype.
+ //
+ if (loc->alleles.count(dest_hap) > 0) {
+ loc->alleles[dest_hap]++;
+ } else {
+ cerr << "Error finding allele\n";
+ }
}
//
- // Update the matched haplotypes in the Datum object, so the haplotype pruner can
+ // Update the matched haplotypes in the Datum object, so the haplotype pruner can
// operate on newly generated, spurious haplotypes.
//
generate_matched_haplotypes(cloc, loc, d);
@@ -735,14 +735,14 @@ prune_locus_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &prune
double weighted_hap;
for (uint i = 0; i < d->obshap.size(); i++) {
- //
- // Lookup the number of occurrences of this haplotype in the
- // population as well as the depth of the haplotype in this indiviudal.
- // We will weight the occurrences of the haplotype in the population by the natural log
- // of the read depth of the haplotype in this individual, storing the result.
- //
- weighted_hap = (double) cloc->hap_cnts[d->obshap[i]] * log((double) d->depth[i]);
- haplotypes.push_back(make_pair(string(d->obshap[i]), weighted_hap));
+ //
+ // Lookup the number of occurrences of this haplotype in the
+ // population as well as the depth of the haplotype in this indiviudal.
+ // We will weight the occurrences of the haplotype in the population by the natural log
+ // of the read depth of the haplotype in this individual, storing the result.
+ //
+ weighted_hap = (double) cloc->hap_cnts[d->obshap[i]] * log((double) d->depth[i]);
+ haplotypes.push_back(make_pair(string(d->obshap[i]), weighted_hap));
}
//
@@ -751,51 +751,51 @@ prune_locus_haplotypes(CSLocus *cloc, Datum *d, Locus *loc, unsigned long &prune
sort(haplotypes.begin(), haplotypes.end(), compare_pair_haplotype);
if (haplotypes.size() == 0) {
- cerr << "Error processing catalog locus " << cloc->id << "\n";
- return -1;
+ cerr << "Error processing catalog locus " << cloc->id << "\n";
+ return -1;
}
//
// Prune out excess haplotypes.
//
for (uint i = 2; i < haplotypes.size(); i++) {
- //
- // Make sure that those haplotypes we want to discard occur at a frequency lower
- // than the second most frequent haplotype, instead of being tied for second.
- //
- if (haplotypes[i].second >= haplotypes[1].second ||
- (max_haplotype_cnt > 0 && haplotypes[i].second > max_haplotype_cnt))
- continue;
-
- remove_haplotype(cloc, loc, haplotypes[i].first, pruned_hap_cnt, log_fh, "rare_step_1");
- haplotypes.erase(haplotypes.begin() + i);
+ //
+ // Make sure that those haplotypes we want to discard occur at a frequency lower
+ // than the second most frequent haplotype, instead of being tied for second.
+ //
+ if (haplotypes[i].second >= haplotypes[1].second ||
+ (max_haplotype_cnt > 0 && haplotypes[i].second > max_haplotype_cnt))
+ continue;
+
+ remove_haplotype(cloc, loc, haplotypes[i].first, pruned_hap_cnt, log_fh, "rare_step_1");
+ haplotypes.erase(haplotypes.begin() + i);
}
//
- // If there are more than two haplotypes remaining and the second, third, etc
+ // If there are more than two haplotypes remaining and the second, third, etc
// haplotype exist only in this individual, prune them out.
//
if (haplotypes.size() > 2) {
- int stop_pos = haplotypes.size() - 1;
- int start_pos = stop_pos;
- double score = haplotypes[stop_pos].second;
- while (start_pos > 1) {
- if (cloc->hap_cnts[haplotypes[start_pos].first] == 1 &&
- haplotypes[start_pos].second == score)
- start_pos--;
- else
- break;
- }
-
- if (start_pos < stop_pos) {
- for (int i = start_pos; i <= stop_pos; i++)
- remove_haplotype(cloc, loc, haplotypes[i].first, pruned_hap_cnt, log_fh, "rare_step_1");
- }
+ int stop_pos = haplotypes.size() - 1;
+ int start_pos = stop_pos;
+ double score = haplotypes[stop_pos].second;
+ while (start_pos > 1) {
+ if (cloc->hap_cnts[haplotypes[start_pos].first] == 1 &&
+ haplotypes[start_pos].second == score)
+ start_pos--;
+ else
+ break;
+ }
+
+ if (start_pos < stop_pos) {
+ for (int i = start_pos; i <= stop_pos; i++)
+ remove_haplotype(cloc, loc, haplotypes[i].first, pruned_hap_cnt, log_fh, "rare_step_1");
+ }
}
//
- // Update the matched haplotypes in the Datum object, so the haplotype pruner can
+ // Update the matched haplotypes in the Datum object, so the haplotype pruner can
// operate on newly generated, spurious haplotypes.
//
generate_matched_haplotypes(cloc, loc, d);
@@ -816,38 +816,38 @@ convert_catalog_haplotype_to_sample(string cat_haplotype, CSLocus *cloc, Locus *
string hap;
do {
- j++;
- loc_idx++;
- //
- // Advance to a het in the sample locus.
- //
- while (j < (int) loc->snps.size() && loc->snps[j]->type != snp_type_het) j++;
- if (j >= (int) loc->snps.size()) break;
- loc_snp = loc->snps[j]->col;
-
- do {
- k++;
- cat_idx++;
- //
- // Advance to the het in the catalog locus that corresponds to the sample locus.
- //
- while (k < (int) cloc->snps.size() && cloc->snps[k]->type != snp_type_het) k++;
- if (k >= (int) cloc->snps.size()) break;
- cat_snp = cloc->snps[k]->col;
-
- } while (cat_snp < loc_snp);
-
- //
- // Extract out the nucleotide from the catalog haplotype that matches the sample
- // haplotype. For example, catalog haplotype may be 'ACGTG' while sample haplotype
- // is 'CT'.
- //
- if (j < (int) loc->snps.size() && k < (int) cloc->snps.size() && cat_snp == loc_snp) {
- hap += cat_haplotype.at(cat_idx);
- } else {
- cerr << "Error processing catalog locus " << cloc->id << "\n";
- return "";
- }
+ j++;
+ loc_idx++;
+ //
+ // Advance to a het in the sample locus.
+ //
+ while (j < (int) loc->snps.size() && loc->snps[j]->type != snp_type_het) j++;
+ if (j >= (int) loc->snps.size()) break;
+ loc_snp = loc->snps[j]->col;
+
+ do {
+ k++;
+ cat_idx++;
+ //
+ // Advance to the het in the catalog locus that corresponds to the sample locus.
+ //
+ while (k < (int) cloc->snps.size() && cloc->snps[k]->type != snp_type_het) k++;
+ if (k >= (int) cloc->snps.size()) break;
+ cat_snp = cloc->snps[k]->col;
+
+ } while (cat_snp < loc_snp);
+
+ //
+ // Extract out the nucleotide from the catalog haplotype that matches the sample
+ // haplotype. For example, catalog haplotype may be 'ACGTG' while sample haplotype
+ // is 'CT'.
+ //
+ if (j < (int) loc->snps.size() && k < (int) cloc->snps.size() && cat_snp == loc_snp) {
+ hap += cat_haplotype.at(cat_idx);
+ } else {
+ cerr << "Error processing catalog locus " << cloc->id << "\n";
+ return "";
+ }
} while (j < (int) loc->snps.size());
@@ -855,8 +855,8 @@ convert_catalog_haplotype_to_sample(string cat_haplotype, CSLocus *cloc, Locus *
}
int
-remove_haplotype(CSLocus *cloc, Locus *loc, string haplotype,
- unsigned long &pruned_hap_cnt, ofstream &log_fh, string alg_type)
+remove_haplotype(CSLocus *cloc, Locus *loc, string haplotype,
+ unsigned long &pruned_hap_cnt, ofstream &log_fh, string alg_type)
{
map<string, int>::iterator it;
string hap = "";
@@ -865,14 +865,14 @@ remove_haplotype(CSLocus *cloc, Locus *loc, string haplotype,
if (verbose) {
#pragma omp critical
- log_fh << cloc->id << "\t"
- << loc->sample_id << "\t"
- << loc->id << "\t"
- << hap << "\t"
- << haplotype << "\t"
- << "\t"
- << "\t"
- << alg_type << "\n";
+ log_fh << cloc->id << "\t"
+ << loc->sample_id << "\t"
+ << loc->id << "\t"
+ << hap << "\t"
+ << haplotype << "\t"
+ << "\t"
+ << "\t"
+ << alg_type << "\n";
}
//
@@ -881,24 +881,24 @@ remove_haplotype(CSLocus *cloc, Locus *loc, string haplotype,
it = loc->alleles.find(hap);
if (it != loc->alleles.end()) {
- loc->alleles.erase(it);
- pruned_hap_cnt++;
- }
+ loc->alleles.erase(it);
+ pruned_hap_cnt++;
+ }
//
// Decrement the count for this haplotype in the catalog locus.
//
if (cloc->hap_cnts.count(haplotype) > 0)
- cloc->hap_cnts[haplotype]--;
+ cloc->hap_cnts[haplotype]--;
return 0;
}
int
-prune_nucleotides(CSLocus *cloc, Locus *loc, Datum *d, ofstream &log_fh, unsigned long int &nuc_cnt,
- unsigned long int &unk_hom_cnt, unsigned long int &unk_het_cnt,
- unsigned long int &hom_unk_cnt, unsigned long int &het_unk_cnt,
- unsigned long int &hom_het_cnt, unsigned long int &het_hom_cnt)
+prune_nucleotides(CSLocus *cloc, Locus *loc, Datum *d, ofstream &log_fh, unsigned long int &nuc_cnt,
+ unsigned long int &unk_hom_cnt, unsigned long int &unk_het_cnt,
+ unsigned long int &hom_unk_cnt, unsigned long int &het_unk_cnt,
+ unsigned long int &hom_het_cnt, unsigned long int &het_hom_cnt)
{
map<char, int> nucs;
set<char> cnucs;
@@ -918,73 +918,73 @@ prune_nucleotides(CSLocus *cloc, Locus *loc, Datum *d, ofstream &log_fh, unsigne
if (loc->snps[i]->type == snp_type_het && cloc->snps[i]->type == snp_type_hom)
cerr << "Warning: sample locus is variable while catalog locus is fixed; catalog locus " << cloc->id
<< " and sample " << loc->sample_id << ", locus " << loc->id << "; col: " << loc->snps[i]->col << "\n";
-
- //
- // Either there is an unknown call in locus, or, there is a snp in the catalog and any state in the locus.
- //
- if ((loc->snps[i]->type == snp_type_unk) ||
- (cloc->snps[i]->type == snp_type_het && loc->snps[i]->type == snp_type_hom)) {
-
- // cerr << " Looking at SNP call in tag " << loc->id << " at position " << i << "; col: " << loc->snps[i]->col << "\n"
- // << " Catalog column: " << cloc->snps[i]->col << " (" << i << "); Sample column: " << loc->snps[i]->col << " (" << i << ")\n"
- // << " Sample has model call type: " << (loc->snps[i]->type == snp_type_unk ? "Unknown" : "Homozygous") << "; nucleotides: '"
- // << loc->snps[i]->rank_1 << "' and '" << loc->snps[i]->rank_2 << "'; model call: " << loc->model[loc->snps[i]->col] << "\n"
- // << " Catalog has model call type: " << (cloc->snps[i]->type == snp_type_het ? "Heterozygous" : "Homozygous") << "; nucleotides: '"
- // << cloc->snps[i]->rank_1 << "' and '" << cloc->snps[i]->rank_2 << "'\n";
-
- if (loc->snps[i]->rank_1 == 'N' || cloc->snps[i]->rank_1 == 'N' ||
+
+ //
+ // Either there is an unknown call in locus, or, there is a snp in the catalog and any state in the locus.
+ //
+ if ((loc->snps[i]->type == snp_type_unk) ||
+ (cloc->snps[i]->type == snp_type_het && loc->snps[i]->type == snp_type_hom)) {
+
+ // cerr << " Looking at SNP call in tag " << loc->id << " at position " << i << "; col: " << loc->snps[i]->col << "\n"
+ // << " Catalog column: " << cloc->snps[i]->col << " (" << i << "); Sample column: " << loc->snps[i]->col << " (" << i << ")\n"
+ // << " Sample has model call type: " << (loc->snps[i]->type == snp_type_unk ? "Unknown" : "Homozygous") << "; nucleotides: '"
+ // << loc->snps[i]->rank_1 << "' and '" << loc->snps[i]->rank_2 << "'; model call: " << loc->model[loc->snps[i]->col] << "\n"
+ // << " Catalog has model call type: " << (cloc->snps[i]->type == snp_type_het ? "Heterozygous" : "Homozygous") << "; nucleotides: '"
+ // << cloc->snps[i]->rank_1 << "' and '" << cloc->snps[i]->rank_2 << "'\n";
+
+ if (loc->snps[i]->rank_1 == 'N' || cloc->snps[i]->rank_1 == 'N' ||
loc->snps[i]->rank_1 == '-' || cloc->snps[i]->rank_1 == '-')
continue;
- cnucs.insert(cloc->snps[i]->rank_1);
- if (cloc->snps[i]->rank_2 != 0) cnucs.insert(cloc->snps[i]->rank_2);
- if (cloc->snps[i]->rank_3 != 0) cnucs.insert(cloc->snps[i]->rank_3);
- if (cloc->snps[i]->rank_4 != 0) cnucs.insert(cloc->snps[i]->rank_4);
-
- // cerr << " Catalog has nucleotides: ";
- // for (it = cnucs.begin(); it != cnucs.end(); it++)
- // cerr << *it << ", ";
- // cerr << "\n";
-
- //
- // Tally the number of occurances of each nucleotide also present in the
- // catalog in order to fuel the snp calling model.
- //
- // Note reads that contain nucleotides not present in the catalog so they
- // can be excluded when calling haplotypes from the read.
- //
- nucs['A'] = 0;
- nucs['C'] = 0;
- nucs['G'] = 0;
- nucs['T'] = 0;
- nucs['N'] = 0;
-
- for (uint k = 0; k < loc->reads.size(); k++) {
- if (cnucs.count(loc->reads[k][i]) > 0)
- nucs[loc->reads[k][i]]++;
- else if (loc->reads[k][i] != 'N')
- rows.insert(k);
- }
-
- //
- // Test pruned data for homozygosity or heterozygosity.
- //
- invoke_model(loc, i, nucs);
+ cnucs.insert(cloc->snps[i]->rank_1);
+ if (cloc->snps[i]->rank_2 != 0) cnucs.insert(cloc->snps[i]->rank_2);
+ if (cloc->snps[i]->rank_3 != 0) cnucs.insert(cloc->snps[i]->rank_3);
+ if (cloc->snps[i]->rank_4 != 0) cnucs.insert(cloc->snps[i]->rank_4);
+
+ // cerr << " Catalog has nucleotides: ";
+ // for (it = cnucs.begin(); it != cnucs.end(); it++)
+ // cerr << *it << ", ";
+ // cerr << "\n";
+
+ //
+ // Tally the number of occurances of each nucleotide also present in the
+ // catalog in order to fuel the snp calling model.
+ //
+ // Note reads that contain nucleotides not present in the catalog so they
+ // can be excluded when calling haplotypes from the read.
+ //
+ nucs['A'] = 0;
+ nucs['C'] = 0;
+ nucs['G'] = 0;
+ nucs['T'] = 0;
+ nucs['N'] = 0;
+
+ for (uint k = 0; k < loc->reads.size(); k++) {
+ if (cnucs.count(loc->reads[k][i]) > 0)
+ nucs[loc->reads[k][i]]++;
+ else if (loc->reads[k][i] != 'N')
+ rows.insert(k);
+ }
+
+ //
+ // Test pruned data for homozygosity or heterozygosity.
+ //
+ invoke_model(loc, i, nucs);
cnucs.clear();
- }
+ }
}
if (verbose) {
#pragma omp critical
log_model_calls(loc, log_fh,
- unk_hom_cnt, unk_het_cnt,
- hom_unk_cnt, het_unk_cnt,
+ unk_hom_cnt, unk_het_cnt,
+ hom_unk_cnt, het_unk_cnt,
hom_het_cnt, het_hom_cnt);
} else {
log_model_calls(loc, log_fh,
- unk_hom_cnt, unk_het_cnt,
- hom_unk_cnt, het_unk_cnt,
+ unk_hom_cnt, unk_het_cnt,
+ hom_unk_cnt, het_unk_cnt,
hom_het_cnt, het_hom_cnt);
}
@@ -1001,14 +1001,14 @@ prune_nucleotides(CSLocus *cloc, Locus *loc, Datum *d, ofstream &log_fh, unsigne
// the catalog and all the reads are removed for the purpose of reading haplotypes.
//
if (loc->alleles.size() <= 1)
- for (uint j = 0; j < loc->snps.size(); j++)
- if (loc->snps[j]->type == snp_type_het) {
- loc->blacklisted = 1;
- break;
- }
+ for (uint j = 0; j < loc->snps.size(); j++)
+ if (loc->snps[j]->type == snp_type_het) {
+ loc->blacklisted = 1;
+ break;
+ }
//
- // Update the matched haplotypes in the Datum object, so the haplotype pruner can
+ // Update the matched haplotypes in the Datum object, so the haplotype pruner can
// operate on newly generated, spurious haplotypes.
//
generate_matched_haplotypes(cloc, loc, d);
@@ -1016,28 +1016,28 @@ prune_nucleotides(CSLocus *cloc, Locus *loc, Datum *d, ofstream &log_fh, unsigne
return 0;
}
-int
-invoke_model(Locus *loc, int col, map<char, int> &nucs)
+int
+invoke_model(Locus *loc, int col, map<char, int> &nucs)
{
//
// Search this column for the presence of a SNP
//
switch(model_type) {
case snp:
- call_multinomial_snp(loc, col, nucs);
- break;
+ call_multinomial_snp(loc, col, nucs);
+ break;
case bounded:
- call_bounded_multinomial_snp(loc, col, nucs);
- break;
- default:
- break;
+ call_bounded_multinomial_snp(loc, col, nucs);
+ break;
+ default:
+ break;
}
return 0;
}
-int
-call_alleles(Locus *loc, set<int> &rows)
+int
+call_alleles(Locus *loc, set<int> &rows)
{
int row;
int height = loc->reads.size();
@@ -1046,55 +1046,55 @@ call_alleles(Locus *loc, set<int> &rows)
vector<SNP *>::iterator snp;
for (row = 0; row < height; row++) {
- //
- // If a read had a nucleotide not present in the catalog, do not call
- // a haplotype from it.
- //
- if (rows.count(row) > 0)
- continue;
+ //
+ // If a read had a nucleotide not present in the catalog, do not call
+ // a haplotype from it.
+ //
+ if (rows.count(row) > 0)
+ continue;
- allele.clear();
+ allele.clear();
- uint snp_cnt = 0;
+ uint snp_cnt = 0;
- for (snp = loc->snps.begin(); snp != loc->snps.end(); snp++) {
- if ((*snp)->type != snp_type_het) continue;
+ for (snp = loc->snps.begin(); snp != loc->snps.end(); snp++) {
+ if ((*snp)->type != snp_type_het) continue;
- snp_cnt++;
+ snp_cnt++;
- base = loc->reads[row][(*snp)->col];
+ base = loc->reads[row][(*snp)->col];
- //
- // Check to make sure the nucleotide at the location of this SNP is
- // of one of the two possible states the multinomial model called.
- //
- if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
- allele += base;
- else
- break;
- }
+ //
+ // Check to make sure the nucleotide at the location of this SNP is
+ // of one of the two possible states the multinomial model called.
+ //
+ if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
+ allele += base;
+ else
+ break;
+ }
- if (snp_cnt > 0 && allele.length() == snp_cnt)
- loc->alleles[allele]++;
+ if (snp_cnt > 0 && allele.length() == snp_cnt)
+ loc->alleles[allele]++;
}
return 0;
}
-int
-generate_matched_haplotypes(CSLocus *cloc, Locus *loc, Datum *d)
+int
+generate_matched_haplotypes(CSLocus *cloc, Locus *loc, Datum *d)
{
//
// Free the existing matched haplotypes.
//
for (uint i = 0; i < d->obshap.size(); i++)
- delete [] d->obshap[i];
+ delete [] d->obshap[i];
d->obshap.clear();
d->depth.clear();
//
// Construct a set of haplotypes from the locus relative to the catalog locus.
- // (The locus already has a set of haplotypes, however, they don't necessarily
+ // (The locus already has a set of haplotypes, however, they don't necessarily
// account for all the SNPs in the catalog, so we will augment them with sequence
// from the consensus.)
//
@@ -1104,26 +1104,26 @@ generate_matched_haplotypes(CSLocus *cloc, Locus *loc, Datum *d)
vector<pair<string, SNP *> >::iterator k;
for (uint i = 0; i < cloc->snps.size(); i++) {
- if (cloc->snps[i]->type != snp_type_het)
- continue;
- columns[cloc->snps[i]->col] = make_pair("catalog", cloc->snps[i]);
+ if (cloc->snps[i]->type != snp_type_het)
+ continue;
+ columns[cloc->snps[i]->col] = make_pair("catalog", cloc->snps[i]);
}
for (uint i = 0; i < loc->snps.size(); i++) {
- if (loc->snps[i]->type != snp_type_het)
- continue;
-
- //
- // Is this column already represented in the catalog?
- //
- if (columns.count(loc->snps[i]->col))
- columns[loc->snps[i]->col] = make_pair("both", loc->snps[i]);
- else
- columns[loc->snps[i]->col] = make_pair("query", loc->snps[i]);
+ if (loc->snps[i]->type != snp_type_het)
+ continue;
+
+ //
+ // Is this column already represented in the catalog?
+ //
+ if (columns.count(loc->snps[i]->col))
+ columns[loc->snps[i]->col] = make_pair("both", loc->snps[i]);
+ else
+ columns[loc->snps[i]->col] = make_pair("query", loc->snps[i]);
}
- for (c = columns.begin(); c != columns.end(); c++)
- merged_snps.push_back((*c).second);
+ for (c = columns.begin(); c != columns.end(); c++)
+ merged_snps.push_back((*c).second);
//
// Sort the SNPs by column
@@ -1135,29 +1135,29 @@ generate_matched_haplotypes(CSLocus *cloc, Locus *loc, Datum *d)
int pos;
for (b = loc->alleles.begin(); b != loc->alleles.end(); b++) {
- old_allele = b->first;
- new_allele = "";
- pos = 0;
-
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- //
- // If the SNPs from the catalog haplotype beyond the length of the query, add Ns
- //
- if (k->first == "catalog") {
- new_allele += (k->second->col > loc->len - 1) ? 'N' : loc->con[k->second->col];
- } else {
- new_allele += old_allele[pos];
- pos++;
- }
- }
-
- char *h = new char[new_allele.length() + 1];
- strcpy(h, new_allele.c_str());
- d->obshap.push_back(h);
- d->depth.push_back(b->second);
-
- // loc->alleles[new_allele] = b->second;
- // cerr << "Adding haplotype: " << new_allele << " [" << b->first << "]\n";
+ old_allele = b->first;
+ new_allele = "";
+ pos = 0;
+
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ //
+ // If the SNPs from the catalog haplotype beyond the length of the query, add Ns
+ //
+ if (k->first == "catalog") {
+ new_allele += (k->second->col > loc->len - 1) ? 'N' : loc->con[k->second->col];
+ } else {
+ new_allele += old_allele[pos];
+ pos++;
+ }
+ }
+
+ char *h = new char[new_allele.length() + 1];
+ strcpy(h, new_allele.c_str());
+ d->obshap.push_back(h);
+ d->depth.push_back(b->second);
+
+ // loc->alleles[new_allele] = b->second;
+ // cerr << "Adding haplotype: " << new_allele << " [" << b->first << "]\n";
}
return 0;
@@ -1165,82 +1165,82 @@ generate_matched_haplotypes(CSLocus *cloc, Locus *loc, Datum *d)
int
log_model_calls(Locus *loc, ofstream &log_fh,
- unsigned long int &unk_hom_cnt, unsigned long int &unk_het_cnt,
- unsigned long int &hom_unk_cnt, unsigned long int &het_unk_cnt,
- unsigned long int &hom_het_cnt, unsigned long int &het_hom_cnt)
+ unsigned long int &unk_hom_cnt, unsigned long int &unk_het_cnt,
+ unsigned long int &hom_unk_cnt, unsigned long int &het_unk_cnt,
+ unsigned long int &hom_het_cnt, unsigned long int &het_hom_cnt)
{
//
// Log model call changes
//
for (uint j = 0; j < loc->snps.size(); j++) {
- switch(loc->model[j]) {
- case 'U':
- switch(loc->snps[j]->type) {
- case snp_type_het:
- if (verbose)
- log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'U' << "\t" << 'E' << "\n";
- unk_het_cnt++;
- break;
- case snp_type_hom:
- if (verbose)
- log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'U' << "\t" << 'O' << "\n";
- unk_hom_cnt++;
- break;
- case snp_type_unk:
- default:
- break;
- }
- break;
- case 'E':
- switch(loc->snps[j]->type) {
- case snp_type_het:
- break;
- case snp_type_hom:
- if (verbose)
- log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'E' << "\t" << 'O' << "\n";
- het_hom_cnt++;
- break;
- case snp_type_unk:
- default:
- if (verbose)
- log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'E' << "\t" << 'U' << "\n";
- het_unk_cnt++;
- break;
- }
- break;
- case 'O':
- default:
- switch(loc->snps[j]->type) {
- case snp_type_het:
- if (verbose)
- log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'O' << "\t" << 'E' << "\n";
- hom_het_cnt++;
- break;
- case snp_type_hom:
- break;
- case snp_type_unk:
- default:
- if (verbose)
- log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'O' << "\t" << 'U' << "\n";
- hom_unk_cnt++;
- break;
- }
- break;
- }
+ switch(loc->model[j]) {
+ case 'U':
+ switch(loc->snps[j]->type) {
+ case snp_type_het:
+ if (verbose)
+ log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'U' << "\t" << 'E' << "\n";
+ unk_het_cnt++;
+ break;
+ case snp_type_hom:
+ if (verbose)
+ log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'U' << "\t" << 'O' << "\n";
+ unk_hom_cnt++;
+ break;
+ case snp_type_unk:
+ default:
+ break;
+ }
+ break;
+ case 'E':
+ switch(loc->snps[j]->type) {
+ case snp_type_het:
+ break;
+ case snp_type_hom:
+ if (verbose)
+ log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'E' << "\t" << 'O' << "\n";
+ het_hom_cnt++;
+ break;
+ case snp_type_unk:
+ default:
+ if (verbose)
+ log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'E' << "\t" << 'U' << "\n";
+ het_unk_cnt++;
+ break;
+ }
+ break;
+ case 'O':
+ default:
+ switch(loc->snps[j]->type) {
+ case snp_type_het:
+ if (verbose)
+ log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'O' << "\t" << 'E' << "\n";
+ hom_het_cnt++;
+ break;
+ case snp_type_hom:
+ break;
+ case snp_type_unk:
+ default:
+ if (verbose)
+ log_fh << loc->sample_id << "\t" << loc->id << "\t" << loc->snps[j]->col << "\t" << 'O' << "\t" << 'U' << "\n";
+ hom_unk_cnt++;
+ break;
+ }
+ break;
+ }
}
return 0;
}
-int
-write_results(string file, map<int, Locus *> &m)
+int
+write_results(string file, map<int, Locus *> &m)
{
map<int, Locus *>::iterator i;
vector<char *>::iterator j;
vector<int>::iterator k;
map<string, int>::iterator t;
Locus *tag_1;
- stringstream sstr;
+ stringstream sstr;
bool gzip = (in_file_type == FileT::gzsql) ? true : false;
@@ -1250,197 +1250,240 @@ write_results(string file, map<int, Locus *> &m)
string tag_file = out_path + file + ".tags.tsv";
string snp_file = out_path + file + ".snps.tsv";
string all_file = out_path + file + ".alleles.tsv";
-
+ string mod_file = out_path + file + ".models.tsv";
+
if (gzip) {
- tag_file += ".gz";
- snp_file += ".gz";
- all_file += ".gz";
+ tag_file += ".gz";
+ snp_file += ".gz";
+ all_file += ".gz";
+ mod_file += ".gz";
}
//
// Open the output files for writing.
//
- gzFile gz_tags, gz_snps, gz_alle;
- ofstream tags, snps, alle;
+ gzFile gz_tags, gz_snps, gz_alle, gz_mods;
+ ofstream tags, snps, alle, mods;
if (gzip) {
- gz_tags = gzopen(tag_file.c_str(), "wb");
- if (!gz_tags) {
- cerr << "Error: Unable to open gzipped catalog tag file '" << tag_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gz_tags = gzopen(tag_file.c_str(), "wb");
+ if (!gz_tags) {
+ cerr << "Error: Unable to open gzipped tag tag file '" << tag_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_tags, libz_buffer_size);
- #endif
- gz_snps = gzopen(snp_file.c_str(), "wb");
- if (!gz_snps) {
- cerr << "Error: Unable to open gzipped catalog snps file '" << snp_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gzbuffer(gz_tags, libz_buffer_size);
+ #endif
+ gz_mods = gzopen(mod_file.c_str(), "wb");
+ if (!gz_mods) {
+ cerr << "Error: Unable to open gzipped model file '" << mod_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_mods, libz_buffer_size);
+ #endif
+ gz_snps = gzopen(snp_file.c_str(), "wb");
+ if (!gz_snps) {
+ cerr << "Error: Unable to open gzipped catalog snps file '" << snp_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_snps, libz_buffer_size);
- #endif
- gz_alle = gzopen(all_file.c_str(), "wb");
- if (!gz_alle) {
- cerr << "Error: Unable to open gzipped catalog alleles file '" << all_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gzbuffer(gz_snps, libz_buffer_size);
+ #endif
+ gz_alle = gzopen(all_file.c_str(), "wb");
+ if (!gz_alle) {
+ cerr << "Error: Unable to open gzipped catalog alleles file '" << all_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_alle, libz_buffer_size);
- #endif
+ gzbuffer(gz_alle, libz_buffer_size);
+ #endif
} else {
- tags.open(tag_file.c_str());
- if (tags.fail()) {
- cerr << "Error: Unable to open catalog tag file for writing.\n";
- exit(1);
- }
- snps.open(snp_file.c_str());
- if (snps.fail()) {
- cerr << "Error: Unable to open catalog SNPs file for writing.\n";
- exit(1);
- }
- alle.open(all_file.c_str());
- if (alle.fail()) {
- cerr << "Error: Unable to open catalog alleles file for writing.\n";
- exit(1);
- }
+ tags.open(tag_file.c_str());
+ if (tags.fail()) {
+ cerr << "Error: Unable to open tag file for writing.\n";
+ exit(1);
+ }
+ mods.open(mod_file.c_str());
+ if (mods.fail()) {
+ cerr << "Error: Unable to open model file for writing.\n";
+ exit(1);
+ }
+ snps.open(snp_file.c_str());
+ if (snps.fail()) {
+ cerr << "Error: Unable to open catalog SNPs file for writing.\n";
+ exit(1);
+ }
+ alle.open(all_file.c_str());
+ if (alle.fail()) {
+ cerr << "Error: Unable to open catalog alleles file for writing.\n";
+ exit(1);
+ }
+ }
+
+ //
+ // Record the version of Stacks used and the date generated as a comment in the catalog.
+ //
+ // Obtain the current date.
+ //
+ stringstream log;
+ time_t rawtime;
+ struct tm *timeinfo;
+ char date[32];
+ time(&rawtime);
+ timeinfo = localtime(&rawtime);
+ strftime(date, 32, "%F %T", timeinfo);
+ log << "# rxstacks version " << VERSION << "; generated on " << date << "\n";
+ if (gzip) {
+ gzputs(gz_tags, log.str().c_str());
+ gzputs(gz_mods, log.str().c_str());
+ gzputs(gz_snps, log.str().c_str());
+ gzputs(gz_alle, log.str().c_str());
+ } else {
+ tags << log.str();
+ mods << log.str();
+ snps << log.str();
+ alle << log.str();
}
int wrote = 0;
for (i = m.begin(); i != m.end(); i++) {
- tag_1 = i->second;
+ tag_1 = i->second;
- wrote++;
+ wrote++;
- // First write the consensus sequence
- sstr << "0" << "\t"
- << tag_1->sample_id << "\t"
- << tag_1->id << "\t"
+ // First write the consensus sequence
+ sstr << "0" << "\t"
+ << tag_1->sample_id << "\t"
+ << tag_1->id << "\t"
<< tag_1->loc.chr << "\t"
<< tag_1->loc.bp << "\t"
<< (tag_1->loc.strand == strand_plus ? "+" : "-") << "\t"
- << "consensus" << "\t"
- << "\t"
- << "\t"
- << tag_1->con << "\t"
- << (tag_1->deleveraged == true ? "1" : "0") << "\t"
- << (tag_1->blacklisted == true ? "1" : "0") << "\t"
- << (tag_1->lumberjackstack == true ? "1" : "0") << "\t"
- << tag_1->lnl << "\n";
-
- //
- // Write a sequence recording the output of the SNP model for each nucleotide.
- //
- sstr << "0" << "\t"
- << tag_1->sample_id << "\t"
- << tag_1->id << "\t"
+ << "consensus" << "\t"
+ << "\t"
+ << "\t"
+ << tag_1->con << "\t"
+ << (tag_1->deleveraged == true ? "1" : "0") << "\t"
+ << (tag_1->blacklisted == true ? "1" : "0") << "\t"
+ << (tag_1->lumberjackstack == true ? "1" : "0") << "\t"
+ << tag_1->lnl << "\n";
+
+ //
+ // Write a sequence recording the output of the SNP model for each nucleotide.
+ //
+ sstr << "0" << "\t"
+ << tag_1->sample_id << "\t"
+ << tag_1->id << "\t"
+ << "\t"
<< "\t"
<< "\t"
+ << "model" << "\t"
<< "\t"
- << "model" << "\t"
- << "\t"
- << "\t";
- for (uint j = 0; j < tag_1->snps.size(); j++) {
- switch(tag_1->snps[j]->type) {
- case snp_type_het:
- sstr << "E";
- break;
- case snp_type_hom:
- sstr << "O";
- break;
- default:
- sstr << "U";
- break;
- }
- }
- sstr << "\t"
- << "\t"
- << "\t"
- << "\t"
- << "\n";
-
- if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
- sstr.str("");
-
- //
- // Now write out each read from this locus.
- //
- for (uint j = 0; j < tag_1->reads.size(); j++) {
- sstr << "0" << "\t"
- << tag_1->sample_id << "\t"
- << tag_1->id << "\t\t\t\t";
-
- if (tag_1->comp_type[j] == primary) {
- sstr << "primary" << "\t"
+ << "\t";
+ for (uint j = 0; j < tag_1->snps.size(); j++) {
+ switch(tag_1->snps[j]->type) {
+ case snp_type_het:
+ sstr << "E";
+ break;
+ case snp_type_hom:
+ sstr << "O";
+ break;
+ default:
+ sstr << "U";
+ break;
+ }
+ }
+ sstr << "\t"
+ << "\t"
+ << "\t"
+ << "\t"
+ << "\n";
+
+ if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ if (gzip) gzputs(gz_mods, sstr.str().c_str()); else mods << sstr.str();
+ sstr.str("");
+
+ //
+ // Now write out each read from this locus.
+ //
+ for (uint j = 0; j < tag_1->reads.size(); j++) {
+ sstr << "0" << "\t"
+ << tag_1->sample_id << "\t"
+ << tag_1->id << "\t\t\t\t";
+
+ if (tag_1->comp_type[j] == primary) {
+ sstr << "primary" << "\t"
<< tag_1->comp_cnt[j] << "\t";
} else {
- sstr << "secondary" << "\t\t";
+ sstr << "secondary" << "\t\t";
+ }
+
+ sstr << tag_1->comp[j] << "\t"
+ << tag_1->reads[j] << "\t\t\t\t\n";
+ }
+
+ if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ sstr.str("");
+
+ //
+ // Write out the model calls for each nucleotide in this locus.
+ //
+ for (uint j = 0; j < tag_1->snps.size(); j++) {
+ sstr << "0" << "\t"
+ << tag_1->sample_id << "\t"
+ << tag_1->id << "\t"
+ << tag_1->snps[j]->col << "\t";
+
+ switch(tag_1->snps[j]->type) {
+ case snp_type_het:
+ sstr << "E\t";
+ break;
+ case snp_type_hom:
+ sstr << "O\t";
+ break;
+ default:
+ sstr << "U\t";
+ break;
}
- sstr << tag_1->comp[j] << "\t"
- << tag_1->reads[j] << "\t\t\t\t\n";
- }
-
- if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
- sstr.str("");
-
- //
- // Write out the model calls for each nucleotide in this locus.
- //
- for (uint j = 0; j < tag_1->snps.size(); j++) {
- sstr << "0" << "\t"
- << tag_1->sample_id << "\t"
- << tag_1->id << "\t"
- << tag_1->snps[j]->col << "\t";
-
- switch(tag_1->snps[j]->type) {
- case snp_type_het:
- sstr << "E\t";
- break;
- case snp_type_hom:
- sstr << "O\t";
- break;
- default:
- sstr << "U\t";
- break;
- }
-
- sstr << std::fixed << std::setprecision(3)
- << tag_1->snps[j]->lratio << "\t"
- << tag_1->snps[j]->rank_1 << "\t"
- << (tag_1->snps[j]->rank_2 == 0 ? '-' : tag_1->snps[j]->rank_2) << "\t\t\n";
- }
-
- if (gzip) gzputs(gz_snps, sstr.str().c_str()); else snps << sstr.str();
- sstr.str("");
-
- //
- // Write the expressed alleles seen for the recorded SNPs and
- // the percentage of tags a particular allele occupies.
- //
+ sstr << std::fixed << std::setprecision(3)
+ << tag_1->snps[j]->lratio << "\t"
+ << tag_1->snps[j]->rank_1 << "\t"
+ << (tag_1->snps[j]->rank_2 == 0 ? '-' : tag_1->snps[j]->rank_2) << "\t\t\n";
+ }
+
+ if (gzip) gzputs(gz_snps, sstr.str().c_str()); else snps << sstr.str();
+ sstr.str("");
+
+ //
+ // Write the expressed alleles seen for the recorded SNPs and
+ // the percentage of tags a particular allele occupies.
+ //
char pct[id_len];
- for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
+ for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
sprintf(pct, "%.2f", ((t->second/double(tag_1->reads.size())) * 100));
- sstr << "0" << "\t"
- << tag_1->sample_id << "\t"
- << tag_1->id << "\t"
- << t->first << "\t"
- << pct << "\t"
- << t->second << "\n";
- }
-
- if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
- sstr.str("");
+ sstr << "0" << "\t"
+ << tag_1->sample_id << "\t"
+ << tag_1->id << "\t"
+ << t->first << "\t"
+ << pct << "\t"
+ << t->second << "\n";
+ }
+
+ if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
+ sstr.str("");
}
if (gzip) {
- gzclose(gz_tags);
- gzclose(gz_snps);
- gzclose(gz_alle);
+ gzclose(gz_tags);
+ gzclose(gz_mods);
+ gzclose(gz_snps);
+ gzclose(gz_alle);
} else {
- tags.close();
- snps.close();
- alle.close();
+ tags.close();
+ mods.close();
+ snps.close();
+ alle.close();
}
cerr << "wrote " << wrote << " loci.\n";
@@ -1455,28 +1498,28 @@ fill_catalog_snps(map<int, CSLocus *> &catalog)
CSLocus *cloc;
for (it = catalog.begin(); it != catalog.end(); it++) {
- cloc = it->second;
-
- queue<SNP *> snps;
- for (uint j = 0; j < cloc->snps.size(); j++)
- snps.push(cloc->snps[j]);
-
- cloc->snps.clear();
-
- for (uint j = 0; j < cloc->len; j++) {
- if (snps.size() > 0 && snps.front()->col == j) {
- cloc->snps.push_back(snps.front());
- snps.pop();
- } else {
- SNP *snp = new SNP;
- snp->type = snp_type_hom;
- snp->col = j;
- snp->lratio = 0;
- snp->rank_1 = cloc->con[j];
- snp->rank_2 = 0;
- cloc->snps.push_back(snp);
- }
- }
+ cloc = it->second;
+
+ queue<SNP *> snps;
+ for (uint j = 0; j < cloc->snps.size(); j++)
+ snps.push(cloc->snps[j]);
+
+ cloc->snps.clear();
+
+ for (uint j = 0; j < cloc->len; j++) {
+ if (snps.size() > 0 && snps.front()->col == j) {
+ cloc->snps.push_back(snps.front());
+ snps.pop();
+ } else {
+ SNP *snp = new SNP;
+ snp->type = snp_type_hom;
+ snp->col = j;
+ snp->lratio = 0;
+ snp->rank_1 = cloc->con[j];
+ snp->rank_2 = 0;
+ cloc->snps.push_back(snp);
+ }
+ }
}
return 0;
@@ -1496,27 +1539,27 @@ init_log(int argc, char **argv, ofstream &log_fh, ofstream &log_snp_fh, ofstream
if (log_fh.fail()) {
cerr << "Error opening log file '" << log.str() << "'\n";
- exit(1);
+ exit(1);
}
if (verbose) {
- log.str("");
- log << out_path << "batch_" << batch_id << ".rxstacks.snps.log";
- log_snp_fh.open(log.str().c_str(), ofstream::out);
-
- if (log_snp_fh.fail()) {
- cerr << "Error opening log file '" << log.str() << "'\n";
- exit(1);
- }
-
- log.str("");
- log << out_path << "batch_" << batch_id << ".rxstacks.haplotypes.log";
- log_hap_fh.open(log.str().c_str(), ofstream::out);
-
- if (log_hap_fh.fail()) {
- cerr << "Error opening log file '" << log.str() << "'\n";
- exit(1);
- }
+ log.str("");
+ log << out_path << "batch_" << batch_id << ".rxstacks.snps.log";
+ log_snp_fh.open(log.str().c_str(), ofstream::out);
+
+ if (log_snp_fh.fail()) {
+ cerr << "Error opening log file '" << log.str() << "'\n";
+ exit(1);
+ }
+
+ log.str("");
+ log << out_path << "batch_" << batch_id << ".rxstacks.haplotypes.log";
+ log_hap_fh.open(log.str().c_str(), ofstream::out);
+
+ if (log_hap_fh.fail()) {
+ cerr << "Error opening log file '" << log.str() << "'\n";
+ exit(1);
+ }
}
//
@@ -1531,103 +1574,103 @@ init_log(int argc, char **argv, ofstream &log_fh, ofstream &log_snp_fh, ofstream
sstr << "#";
for (int i = 0; i < argc; i++)
- sstr << " " << argv[i];
+ sstr << " " << argv[i];
sstr << "\n" << "# rxstacks executed " << date;
- log_fh << sstr.str() << "\n"
- << "# Sample\t"
- << "Total nucs\t"
- << "Total nucs converted\t"
- << "Unk to Hom\t"
- << "Unk to Het\t"
- << "Hom to Unk\t"
- << "Het to Unk\t"
- << "Hom to Het\t"
- << "Het to Hom\t"
- << "Confounded loci\t"
- << "Lnl Filtered loci\t"
- << "Pruned Haplotypes\t"
- << "MST-Pruned Haplotypes\n";
+ log_fh << sstr.str() << "\n"
+ << "# Sample\t"
+ << "Total nucs\t"
+ << "Total nucs converted\t"
+ << "Unk to Hom\t"
+ << "Unk to Het\t"
+ << "Hom to Unk\t"
+ << "Het to Unk\t"
+ << "Hom to Het\t"
+ << "Het to Hom\t"
+ << "Confounded loci\t"
+ << "Lnl Filtered loci\t"
+ << "Pruned Haplotypes\t"
+ << "MST-Pruned Haplotypes\n";
if (verbose) {
- log_snp_fh << sstr.str() << "\n"
- << "# Sample Id\t"
- << "Locus ID\t"
- << "SNP Col\t"
- << "Orig Value\t"
- << "Corr Value\n";
- log_hap_fh << sstr.str() << "\n"
- << "# Catalog Locus\t"
- << "Sample\t"
- << "Sample Locus\t"
- << "Sample Haplotype\t"
- << "Catalog Haplotype\t"
- << "Corrected Sample Haplotype\t"
- << "Corrected Catalog Haplotype\t"
- << "Algorithm\n";
+ log_snp_fh << sstr.str() << "\n"
+ << "# Sample Id\t"
+ << "Locus ID\t"
+ << "SNP Col\t"
+ << "Orig Value\t"
+ << "Corr Value\n";
+ log_hap_fh << sstr.str() << "\n"
+ << "# Catalog Locus\t"
+ << "Sample\t"
+ << "Sample Locus\t"
+ << "Sample Haplotype\t"
+ << "Catalog Haplotype\t"
+ << "Corrected Sample Haplotype\t"
+ << "Corrected Catalog Haplotype\t"
+ << "Algorithm\n";
}
return 0;
}
-int
-parse_command_line(int argc, char* argv[])
+int
+parse_command_line(int argc, char* argv[])
{
int c;
-
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"conf_filter", no_argument, NULL, 'F'},
- {"prune_haplo", no_argument, NULL, 'H'},
- {"lnl_filter", no_argument, NULL, 'G'},
- {"lnl_dist", no_argument, NULL, 'D'},
- {"verbose", no_argument, NULL, 'V'},
- {"num_threads", required_argument, NULL, 't'},
- {"batch_id", required_argument, NULL, 'b'},
- {"in_path", required_argument, NULL, 'P'},
- {"outpath", required_argument, NULL, 'o'},
- {"model_type", required_argument, NULL, 'T'},
- {"bound_low", required_argument, NULL, 'L'},
- {"bound_high", required_argument, NULL, 'U'},
- {"alpha", required_argument, NULL, 'A'},
- {"conf_lim", required_argument, NULL, 'C'},
- {"max_haplo", required_argument, NULL, 'M'},
- {"lnl_lim", required_argument, NULL, 'I'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvVFGDHo:t:b:P:T:L:U:A:C:I:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 't':
- num_threads = atoi(optarg);
- break;
- case 'P':
- in_path = optarg;
- break;
- case 'b':
- batch_id = is_integer(optarg);
- if (batch_id < 0) {
- cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'T':
+ {"conf_filter", no_argument, NULL, 'F'},
+ {"prune_haplo", no_argument, NULL, 'H'},
+ {"lnl_filter", no_argument, NULL, 'G'},
+ {"lnl_dist", no_argument, NULL, 'D'},
+ {"verbose", no_argument, NULL, 'V'},
+ {"num_threads", required_argument, NULL, 't'},
+ {"batch_id", required_argument, NULL, 'b'},
+ {"in_path", required_argument, NULL, 'P'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"model_type", required_argument, NULL, 'T'},
+ {"bound_low", required_argument, NULL, 'L'},
+ {"bound_high", required_argument, NULL, 'U'},
+ {"alpha", required_argument, NULL, 'A'},
+ {"conf_lim", required_argument, NULL, 'C'},
+ {"max_haplo", required_argument, NULL, 'M'},
+ {"lnl_lim", required_argument, NULL, 'I'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvVFGDHo:t:b:P:T:L:U:A:C:I:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 't':
+ num_threads = atoi(optarg);
+ break;
+ case 'P':
+ in_path = optarg;
+ break;
+ case 'b':
+ batch_id = is_integer(optarg);
+ if (batch_id < 0) {
+ cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'T':
if (strcmp(optarg, "snp") == 0) {
model_type = snp;
} else if (strcmp(optarg, "fixed") == 0) {
@@ -1638,97 +1681,97 @@ parse_command_line(int argc, char* argv[])
cerr << "Unknown model type specified '" << optarg << "'\n";
help();
}
- case 'L':
- bound_low = atof(optarg);
- break;
- case 'U':
- bound_high = atof(optarg);
- break;
- case 'A':
- alpha = atof(optarg);
- break;
- case 'F':
- filter_confounded = true;
- break;
- case 'C':
- confounded_limit = is_double(optarg);
- filter_confounded = true;
- break;
- case 'H':
- prune_haplotypes = true;
- break;
- case 'M':
- max_haplotype_cnt = is_integer(optarg);
- break;
- case 'G':
- filter_lnl = true;
- break;
- case 'I':
- lnl_limit = is_double(optarg);
- filter_lnl = true;
- break;
- case 'D':
- lnl_dist = true;
- break;
- case 'V':
- verbose = true;
- break;
+ case 'L':
+ bound_low = atof(optarg);
+ break;
+ case 'U':
+ bound_high = atof(optarg);
+ break;
+ case 'A':
+ alpha = atof(optarg);
+ break;
+ case 'F':
+ filter_confounded = true;
+ break;
+ case 'C':
+ confounded_limit = is_double(optarg);
+ filter_confounded = true;
+ break;
+ case 'H':
+ prune_haplotypes = true;
+ break;
+ case 'M':
+ max_haplotype_cnt = is_integer(optarg);
+ break;
+ case 'G':
+ filter_lnl = true;
+ break;
+ case 'I':
+ lnl_limit = is_double(optarg);
+ filter_lnl = true;
+ break;
+ case 'D':
+ lnl_dist = true;
+ break;
+ case 'V':
+ verbose = true;
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
- default:
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+ default:
+ help();
+ abort();
+ }
}
if (in_path.length() == 0) {
- cerr << "You must specify a path to the directory containing Stacks output files.\n";
- help();
+ cerr << "You must specify a path to the directory containing Stacks output files.\n";
+ help();
}
if (out_path.length() == 0) {
- cerr << "No output path specified, files in '" << in_path << "' will be overwritten.\n";
- out_path = in_path;
+ cerr << "No output path specified, files in '" << in_path << "' will be overwritten.\n";
+ out_path = in_path;
}
- if (in_path.at(in_path.length() - 1) != '/')
- in_path += "/";
+ if (in_path.at(in_path.length() - 1) != '/')
+ in_path += "/";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
if (batch_id == 0) {
- cerr << "You must specify a batch ID.\n";
- help();
+ cerr << "You must specify a batch ID.\n";
+ help();
}
if (alpha != 0.1 && alpha != 0.05 && alpha != 0.01 && alpha != 0.001) {
- cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
- help();
+ cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
+ help();
}
if (bound_low != 0 && (bound_low < 0 || bound_low >= 1.0)) {
- cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
- help();
+ cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
+ help();
}
if (bound_high != 1 && (bound_high <= 0 || bound_high > 1.0)) {
- cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
- help();
+ cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
+ help();
}
if (bound_low > 0 || bound_high < 1.0) {
- model_type = bounded;
+ model_type = bounded;
}
if (filter_confounded == true && (confounded_limit < 0 || confounded_limit > 1.0)) {
- cerr << "Confounded locus limit is a percentage and must be between 0.0 and 1.0.\n";
- help();
+ cerr << "Confounded locus limit is a percentage and must be between 0.0 and 1.0.\n";
+ help();
}
return 0;
@@ -1743,28 +1786,28 @@ void version() {
void help() {
std::cerr << "rxstacks " << VERSION << "\n"
<< "rxstacks -b batch_id -P path [-o path] [-t threads] [-v] [-h]" << "\n"
- << " b: Batch ID to examine when exporting from the catalog.\n"
- << " P: path to the Stacks output files.\n"
- << " o: output path to write results.\n"
- << " t: number of threads to run in parallel sections of code.\n"
- << " v: print program version." << "\n"
- << " h: display this help messsage." << "\n\n"
- << " Filtering options:\n"
- << " --lnl_filter: filter catalog loci based on the mean log likelihood of the catalog locus in the population.\n"
- << " --lnl_lim <limit>: minimum log likelihood required to keep a catalog locus.\n"
- << " --lnl_dist: print distribution of mean log likelihoods for catalog loci.\n"
- << " --conf_filter: filter confounded loci.\n"
- << " --conf_lim <limit>: between 0.0 and 1.0 (default 0.75), proportion of loci in population that must be confounded relative to the catalog locus.\n"
- << " --prune_haplo: prune out non-biological haplotypes unlikely to occur in the population.\n"
- << " --max_haplo <limit>: only consider haplotypes for pruning if they occur in fewer than max_haplo_cnt samples.\n"
- << " Model options:\n"
- << " --model_type <type>: either 'snp' (default), 'bounded', or 'fixed'\n"
- << " For the SNP or Bounded SNP model:\n"
- << " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1 (default), 0.05, 0.01, or 0.001.\n"
- << " For the Bounded SNP model:\n"
- << " --bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
- << " --bound_high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
- << " Logging Options:\n"
- << " --verbose: extended logging, including coordinates of all changed nucleotides (forces single-threaded execution).\n";
+ << " b: Batch ID to examine when exporting from the catalog.\n"
+ << " P: path to the Stacks output files.\n"
+ << " o: output path to write results.\n"
+ << " t: number of threads to run in parallel sections of code.\n"
+ << " v: print program version." << "\n"
+ << " h: display this help messsage." << "\n\n"
+ << " Filtering options:\n"
+ << " --lnl_filter: filter catalog loci based on the mean log likelihood of the catalog locus in the population.\n"
+ << " --lnl_lim <limit>: minimum log likelihood required to keep a catalog locus.\n"
+ << " --lnl_dist: print distribution of mean log likelihoods for catalog loci.\n"
+ << " --conf_filter: filter confounded loci.\n"
+ << " --conf_lim <limit>: between 0.0 and 1.0 (default 0.75), proportion of loci in population that must be confounded relative to the catalog locus.\n"
+ << " --prune_haplo: prune out non-biological haplotypes unlikely to occur in the population.\n"
+ << " --max_haplo <limit>: only consider haplotypes for pruning if they occur in fewer than max_haplo_cnt samples.\n"
+ << " Model options:\n"
+ << " --model_type <type>: either 'snp' (default), 'bounded', or 'fixed'\n"
+ << " For the SNP or Bounded SNP model:\n"
+ << " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1 (default), 0.05, 0.01, or 0.001.\n"
+ << " For the Bounded SNP model:\n"
+ << " --bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
+ << " --bound_high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
+ << " Logging Options:\n"
+ << " --verbose: extended logging, including coordinates of all changed nucleotides (forces single-threaded execution).\n";
exit(0);
}
diff --git a/src/rxstacks.h b/src/rxstacks.h
index 5361251..5a80328 100644
--- a/src/rxstacks.h
+++ b/src/rxstacks.h
@@ -80,15 +80,15 @@ int dist(string, string);
int measure_error(CSLocus *, Locus *, Datum *, ofstream &);
int calc_lnl_means(map<int, CSLocus *> &, PopMap<CSLocus> *);
int prune_nucleotides(CSLocus *, Locus *, Datum *, ofstream &, unsigned long int &,
- unsigned long int &, unsigned long int &, unsigned long int &,
- unsigned long int &, unsigned long int &, unsigned long int &);
+ unsigned long int &, unsigned long int &, unsigned long int &,
+ unsigned long int &, unsigned long int &, unsigned long int &);
int invoke_model(Locus *, int, map<char, int> &);
-int call_alleles(Locus *, set<int> &);
+int call_alleles(Locus *, set<int> &);
int generate_matched_haplotypes(CSLocus *, Locus *, Datum *);
int fill_catalog_snps(map<int, CSLocus *> &);
int log_model_calls(Locus *, ofstream &,
- unsigned long int &, unsigned long int &, unsigned long int &,
- unsigned long int &, unsigned long int &, unsigned long int &);
+ unsigned long int &, unsigned long int &, unsigned long int &,
+ unsigned long int &, unsigned long int &, unsigned long int &);
int write_results(string, map<int, Locus *> &);
#endif // __RXSTACKS_H__
diff --git a/src/smoothing.h b/src/smoothing.h
index b0ce73f..547751b 100644
--- a/src/smoothing.h
+++ b/src/smoothing.h
@@ -30,13 +30,13 @@ class KSmooth {
double *weights; // Weight matrix to apply while smoothing.
public:
- KSmooth(int size) {
- this->size = size;
- this->weights = calc_weights();
+ KSmooth(int size) {
+ this->size = size;
+ this->weights = calc_weights();
}
- ~KSmooth() {
- delete [] this->weights;
+ ~KSmooth() {
+ delete [] this->weights;
}
int smooth(vector<StatT *> &popstats);
@@ -47,13 +47,13 @@ int
KSmooth<StatT>::smooth(vector<StatT *> &popstats)
{
//
- // To generate smooth genome-wide distributions of Fst, we calculate a kernel-smoothing
+ // To generate smooth genome-wide distributions of Fst, we calculate a kernel-smoothing
// moving average of Fst values along each ordered chromosome.
//
- // For each genomic region centered on a nucleotide position c, the contribution of the population
+ // For each genomic region centered on a nucleotide position c, the contribution of the population
// genetic statistic at position p to the region average was weighted by the Gaussian function:
// exp( (-1 * (p - c)^2) / (2 * sigma^2))
- //
+ //
// In addition, we weight each position according to (n_k - 1), where n_k is the number of alleles
// sampled at that location.
//
@@ -62,85 +62,85 @@ KSmooth<StatT>::smooth(vector<StatT *> &popstats)
#pragma omp parallel
{
int limit = 3 * sigma;
- int dist;
- uint pos_l, pos_u;
- double sum, final_weight;
- PopStat *c, *p;
+ int dist;
+ uint pos_l, pos_u;
+ double sum, final_weight;
+ PopStat *c, *p;
- pos_l = 0;
- pos_u = 0;
+ pos_l = 0;
+ pos_u = 0;
#pragma omp for schedule(dynamic, 1)
- for (uint pos_c = 0; pos_c < popstats.size(); pos_c++) {
- c = popstats[pos_c];
-
- if (c == NULL)
- continue;
-
- for (uint i = 0; i < this->size; i++)
- c->smoothed[i] = 0.0;
- sum = 0.0;
-
- determine_window_limits(popstats, c->bp, pos_l, pos_u);
-
- for (uint pos_p = pos_l; pos_p < pos_u; pos_p++) {
- p = popstats[pos_p];
-
- if (p == NULL)
- continue;
-
- dist = p->bp > c->bp ? p->bp - c->bp : c->bp - p->bp;
-
- if (dist > limit || dist < 0) {
- #pragma omp critical
- {
- cerr << "ERROR: current basepair is out of the sliding window.\n"
- << " Calculating sliding window; start position: " << pos_l << ", " << (popstats[pos_l] == NULL ? -1 : popstats[pos_l]->bp +1) << "bp; end position: "
- << pos_u << ", " << (popstats[pos_u] == NULL ? -1 : popstats[pos_u]->bp +1) << "bp; center: "
- << pos_c << ", " << popstats[pos_c]->bp +1 << "bp\n"
- << " Current position: " << pos_p << ", " << popstats[pos_p]->bp +1 << "; Dist: " << dist << "\n"
- << " Window positions:\n";
-
- for (uint j = pos_l; j < pos_u; j++) {
- p = popstats[j];
- if (p == NULL) continue;
- cerr << " Position: " << j << "; " << p->bp +1 << "bp\n";
- }
- //exit(0);
- }
- continue;
- }
-
- // sites_cnt++;
-
- final_weight = (p->alleles - 1) * this->weights[dist];
- for (uint i = 0; i < this->size; i++)
- c->smoothed[i] += p->stat[i] * final_weight;
- sum += final_weight;
-
- // if (c->loc_id == 9314) {
- // cerr << " id: " << p->loc_id
- // << "; dist: " << dist
- // << "; weight: " << weights[dist]
- // << "; final_weight: " << final_weight
- // << "; fst': " << p->stat[3]
- // << "; sum: " << sum
- // << "; smoothed: " << c->smoothed[3] << "\n";
- // }
- }
-
- // sites_per_snp += (sites_cnt / snp_cnt);
- // tot_windows++;
- //
- // if (snp_cnt < max_snp_dist) {
- // #pragma omp atomic
- // snp_dist[snp_cnt]++;
- // }
- // c->snp_cnt = snp_cnt;
-
- for (uint i = 0; i < this->size; i++)
- c->smoothed[i] /= sum;
- }
+ for (uint pos_c = 0; pos_c < popstats.size(); pos_c++) {
+ c = popstats[pos_c];
+
+ if (c == NULL)
+ continue;
+
+ for (uint i = 0; i < this->size; i++)
+ c->smoothed[i] = 0.0;
+ sum = 0.0;
+
+ determine_window_limits(popstats, c->bp, pos_l, pos_u);
+
+ for (uint pos_p = pos_l; pos_p < pos_u; pos_p++) {
+ p = popstats[pos_p];
+
+ if (p == NULL)
+ continue;
+
+ dist = p->bp > c->bp ? p->bp - c->bp : c->bp - p->bp;
+
+ if (dist > limit || dist < 0) {
+ #pragma omp critical
+ {
+ cerr << "ERROR: current basepair is out of the sliding window.\n"
+ << " Calculating sliding window; start position: " << pos_l << ", " << (popstats[pos_l] == NULL ? -1 : popstats[pos_l]->bp +1) << "bp; end position: "
+ << pos_u << ", " << (popstats[pos_u] == NULL ? -1 : popstats[pos_u]->bp +1) << "bp; center: "
+ << pos_c << ", " << popstats[pos_c]->bp +1 << "bp\n"
+ << " Current position: " << pos_p << ", " << popstats[pos_p]->bp +1 << "; Dist: " << dist << "\n"
+ << " Window positions:\n";
+
+ for (uint j = pos_l; j < pos_u; j++) {
+ p = popstats[j];
+ if (p == NULL) continue;
+ cerr << " Position: " << j << "; " << p->bp +1 << "bp\n";
+ }
+ //exit(0);
+ }
+ continue;
+ }
+
+ // sites_cnt++;
+
+ final_weight = (p->alleles - 1) * this->weights[dist];
+ for (uint i = 0; i < this->size; i++)
+ c->smoothed[i] += p->stat[i] * final_weight;
+ sum += final_weight;
+
+ // if (c->loc_id == 9314) {
+ // cerr << " id: " << p->loc_id
+ // << "; dist: " << dist
+ // << "; weight: " << weights[dist]
+ // << "; final_weight: " << final_weight
+ // << "; fst': " << p->stat[3]
+ // << "; sum: " << sum
+ // << "; smoothed: " << c->smoothed[3] << "\n";
+ // }
+ }
+
+ // sites_per_snp += (sites_cnt / snp_cnt);
+ // tot_windows++;
+ //
+ // if (snp_cnt < max_snp_dist) {
+ // #pragma omp atomic
+ // snp_dist[snp_cnt]++;
+ // }
+ // c->snp_cnt = snp_cnt;
+
+ for (uint i = 0; i < this->size; i++)
+ c->smoothed[i] /= sum;
+ }
}
return 0;
diff --git a/src/smoothing_utils.h b/src/smoothing_utils.h
index 685e20d..3541da8 100644
--- a/src/smoothing_utils.h
+++ b/src/smoothing_utils.h
@@ -29,12 +29,12 @@ extern double sigma;
inline
double *
-calc_weights()
+calc_weights()
{
//
// Calculate weights for window smoothing operations.
//
- // For each genomic region centered on a nucleotide position c, the contribution of the population
+ // For each genomic region centered on a nucleotide position c, the contribution of the population
// genetic statistic at position p to the region average was weighted by the Gaussian function:
// exp( (-1 * (p - c)^2) / (2 * sigma^2))
//
@@ -42,7 +42,7 @@ calc_weights()
double *weights = new double[limit + 1];
for (int i = 0; i <= limit; i++)
- weights[i] = exp((-1 * pow(i, 2)) / (2 * pow(sigma, 2)));
+ weights[i] = exp((-1 * pow(i, 2)) / (2 * pow(sigma, 2)));
return weights;
}
@@ -56,24 +56,24 @@ determine_window_limits(vector<StatT *> &sites, uint center_bp, uint &pos_l, uin
int limit_u = center_bp + limit;
while (pos_l < sites.size()) {
- if (sites[pos_l] == NULL) {
- pos_l++;
- } else {
- if (sites[pos_l]->bp < limit_l)
- pos_l++;
- else
- break;
- }
+ if (sites[pos_l] == NULL) {
+ pos_l++;
+ } else {
+ if (sites[pos_l]->bp < limit_l)
+ pos_l++;
+ else
+ break;
+ }
}
while (pos_u < sites.size()) {
- if (sites[pos_u] == NULL) {
- pos_u++;
- } else {
- if (sites[pos_u]->bp < limit_u)
- pos_u++;
- else
- break;
- }
+ if (sites[pos_u] == NULL) {
+ pos_u++;
+ } else {
+ if (sites[pos_u]->bp < limit_u)
+ pos_u++;
+ else
+ break;
+ }
}
return 0;
}
diff --git a/src/sql_utilities.h b/src/sql_utilities.h
index 9b92b8f..41175b7 100644
--- a/src/sql_utilities.h
+++ b/src/sql_utilities.h
@@ -40,8 +40,8 @@ int load_model_results(string sample, map<int, ModRes *> &modres);
int load_snp_calls(string sample, map<int, SNPRes *> &snpres);
template <class LocusT>
-int
-load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_all_model_calls, bool &compressed)
+int
+load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_all_model_calls, bool &compressed)
{
LocusT *c;
SNP *snp;
@@ -61,7 +61,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
bool open_fail = true;
int fh_status = 1;
- // //
+ // //
// // First, try to parse the models file to pull in the consensus sequence and model string
// // for each locus. If the models file is not available or we are requested to store the
// // reads from each stack, fall back to the tags file.
@@ -149,7 +149,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
if (blacklisted.count(id)) continue;
//
- // Make sure this locus has already been defined (consensus sequence SHOULD always
+ // Make sure this locus has already been defined (consensus sequence SHOULD always
// be specified first in the file for a particular locus).
//
if (loci.count(id) > 0) {
@@ -198,7 +198,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
}
//
- // Do not include blacklisted tags in the catalog. They are tags that are composed
+ // Do not include blacklisted tags in the catalog. They are tags that are composed
// of noise and/or repetitive sequence.
//
if (parts[11] == "1") {
@@ -251,7 +251,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
else
fh.close();
- //
+ //
// Next, parse the SNP file and load model calls.
//
gzip = false;
@@ -301,7 +301,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
//
// Only load heterozygous model calls.
//
- if (load_all_model_calls == false && parts[4] != "E")
+ if (load_all_model_calls == false && parts[4] != "E")
continue;
snp = new SNP;
@@ -310,7 +310,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
snp->rank_1 = parts[6].at(0);
snp->rank_2 = parts[7].at(0) == '-' ? 0 : parts[7].at(0);
- if (parts[4] == "E")
+ if (parts[4] == "E")
snp->type = snp_type_het;
else if (parts[4] == "O")
snp->type = snp_type_hom;
@@ -320,7 +320,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
if (parts.size() == 10) {
if (parts[8].length() == 0 || parts[8].at(0) == '-')
snp->rank_3 = 0;
- else
+ else
snp->rank_3 = parts[8].at(0);
if (parts[9].length() == 0 || parts[9].at(0) == '-')
@@ -344,7 +344,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
else
fh.close();
- //
+ //
// Finally, parse the Alleles file
//
gzip = false;
@@ -413,7 +413,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
for (i = loci.begin(); i != loci.end(); i++)
i->second->populate_alleles();
- delete [] line;
+ free(line);
return 1;
}
diff --git a/src/sstacks.cc b/src/sstacks.cc
index d51331f..c23f995 100644
--- a/src/sstacks.cc
+++ b/src/sstacks.cc
@@ -31,7 +31,7 @@ string out_path;
FileT in_file_type = FileT::sql;
int num_threads = 1;
int batch_id = 0;
-int samp_id = 0;
+int samp_id = 0;
int catalog = 0;
bool verify_haplotypes = true;
bool impute_haplotypes = true;
@@ -60,17 +60,17 @@ int main (int argc, char* argv[]) {
bool compressed = false;
int res;
- if (search_type == sequence)
- cerr << "Searching for matches by sequence identity...\n";
+ if (search_type == sequence)
+ cerr << "Searching for matches by sequence identity...\n";
else if (search_type == genomic_loc)
- cerr << "Searching for matches by genomic location...\n";
+ cerr << "Searching for matches by genomic location...\n";
catalog_path += ".catalog";
res = load_loci(catalog_path, catalog, false, false, compressed);
if (res == 0) {
- cerr << "Unable to parse catalog, '" << catalog_path << "'\n";
- return 0;
+ cerr << "Unable to parse catalog, '" << catalog_path << "'\n";
+ return 0;
}
KmerHashMap kmer_map;
@@ -78,9 +78,9 @@ int main (int argc, char* argv[]) {
vector<char *> kmer_map_keys;
if (gapped_alignments) {
- cerr << "Populating kmer dictionary for gapped alignments...";
- populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, gapped_kmer_len);
- cerr << "done.\n";
+ cerr << "Populating kmer dictionary for gapped alignments...";
+ populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, gapped_kmer_len);
+ cerr << "done.\n";
}
string sample_path;
@@ -89,47 +89,47 @@ int main (int argc, char* argv[]) {
while (!samples.empty()) {
map<int, QLocus *> sample;
- sample_path = samples.front();
- samples.pop();
+ sample_path = samples.front();
+ samples.pop();
- cerr << "Processing sample '" << sample_path << "' [" << i << " of " << sample_cnt << "]\n";
+ cerr << "Processing sample '" << sample_path << "' [" << i << " of " << sample_cnt << "]\n";
- res = load_loci(sample_path, sample, false, false, compressed);
+ res = load_loci(sample_path, sample, false, false, compressed);
- if (res == 0) {
- cerr << "Unable to parse '" << sample_path << "'\n";
- return 0;
- }
+ if (res == 0) {
+ cerr << "Unable to parse '" << sample_path << "'\n";
+ return 0;
+ }
- in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
+ in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
- //
- // Assign the ID for this sample data.
- //
- samp_id = sample.begin()->second->sample_id;
+ //
+ // Assign the ID for this sample data.
+ //
+ samp_id = sample.begin()->second->sample_id;
- //dump_loci(catalog);
- //dump_loci(sample);
+ //dump_loci(catalog);
+ //dump_loci(sample);
- if (search_type == sequence) {
- cerr << "Searching for sequence matches...\n";
- find_matches_by_sequence(catalog, sample);
+ if (search_type == sequence) {
+ cerr << "Searching for sequence matches...\n";
+ find_matches_by_sequence(catalog, sample);
if (gapped_alignments) {
cerr << "Searching for gapped alignments...\n";
- search_for_gaps(catalog, sample, kmer_map, allele_map, min_match_len);
+ search_for_gaps(catalog, sample, kmer_map, allele_map, min_match_len);
}
- } else if (search_type == genomic_loc) {
- cerr << "Searching for matches by genomic location...\n";
- find_matches_by_genomic_loc(catalog, sample);
- }
+ } else if (search_type == genomic_loc) {
+ cerr << "Searching for matches by genomic location...\n";
+ find_matches_by_genomic_loc(catalog, sample);
+ }
- write_matches(sample_path, sample);
- i++;
+ write_matches(sample_path, sample);
+ i++;
//
- // Free memory associated with sample
+ // Free memory associated with the sample.
//
for (map<int, QLocus *>::iterator j = sample.begin(); j != sample.end(); j++)
delete j->second;
@@ -137,7 +137,14 @@ int main (int argc, char* argv[]) {
}
if (gapped_alignments)
- free_kmer_hash(kmer_map, kmer_map_keys);
+ free_kmer_hash(kmer_map, kmer_map_keys);
+
+ //
+ // Free memory associated with the catalog.
+ //
+ for (map<int, Locus *>::iterator j = catalog.begin(); j != catalog.end(); j++)
+ delete j->second;
+ catalog.clear();
return 0;
}
@@ -164,10 +171,10 @@ find_matches_by_genomic_loc(map<int, Locus *> &sample_1, map<int, QLocus *> &sam
map<string, set<int> > locations;
for (j = sample_1.begin(); j != sample_1.end(); j++) {
- snprintf(id, id_len - 1, "%s|%d|%c",
- j->second->loc.chr,
- j->second->loc.bp,
- j->second->loc.strand == strand_plus ? '+' : '-');
+ snprintf(id, id_len - 1, "%s|%d|%c",
+ j->second->loc.chr,
+ j->second->loc.bp,
+ j->second->loc.strand == strand_plus ? '+' : '-');
locations[id].insert(j->second->id);
}
@@ -176,8 +183,8 @@ find_matches_by_genomic_loc(map<int, Locus *> &sample_1, map<int, QLocus *> &sam
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
vector<int> keys;
- for (i = sample_2.begin(); i != sample_2.end(); i++)
- keys.push_back(i->first);
+ for (i = sample_2.begin(); i != sample_2.end(); i++)
+ keys.push_back(i->first);
//
// Initialize some counters
@@ -190,49 +197,49 @@ find_matches_by_genomic_loc(map<int, Locus *> &sample_1, map<int, QLocus *> &sam
#pragma omp parallel private(i, j, k, id)
{
- unsigned long verified;
+ unsigned long verified;
#pragma omp for reduction(+:matches) reduction(+:tot_hap) reduction(+:ver_hap) reduction(+:nomatch) reduction(+:nosnps)
- for (k = 0; k < (int) keys.size(); k++) {
-
- i = sample_2.find(keys[k]);
- snprintf(id, id_len - 1, "%s|%d|%c",
- i->second->loc.chr,
- i->second->loc.bp,
- i->second->loc.strand == strand_plus ? '+' : '-');
-
- if (locations.count(id) > 0) {
- Locus *tag;
- set<int>::iterator loc_it;
- vector<pair<allele_type, string> >::iterator q;
-
- matches++;
-
- for (loc_it = locations[id].begin(); loc_it != locations[id].end(); loc_it++) {
- tag = sample_1[*loc_it];
-
- //
- // Generate haplotypes for query tag relative to the catalog tag.
- //
- set<string> query_haplotypes;
- generate_query_haplotypes(tag, i->second, query_haplotypes);
- tot_hap += query_haplotypes.size() > 0 ? query_haplotypes.size() : 1;
-
- if (verify_haplotypes) {
- verified = verify_genomic_loc_match(tag, i->second, query_haplotypes, nosnps);
- ver_hap += verified;
- if (verified == 0) nomatch++;
- } else {
- i->second->add_match(tag->id, tag->strings.begin()->first);
- }
- }
- }
- }
+ for (k = 0; k < (int) keys.size(); k++) {
+
+ i = sample_2.find(keys[k]);
+ snprintf(id, id_len - 1, "%s|%d|%c",
+ i->second->loc.chr,
+ i->second->loc.bp,
+ i->second->loc.strand == strand_plus ? '+' : '-');
+
+ if (locations.count(id) > 0) {
+ Locus *tag;
+ set<int>::iterator loc_it;
+ vector<pair<allele_type, string> >::iterator q;
+
+ matches++;
+
+ for (loc_it = locations[id].begin(); loc_it != locations[id].end(); loc_it++) {
+ tag = sample_1[*loc_it];
+
+ //
+ // Generate haplotypes for query tag relative to the catalog tag.
+ //
+ set<string> query_haplotypes;
+ generate_query_haplotypes(tag, i->second, query_haplotypes);
+ tot_hap += query_haplotypes.size() > 0 ? query_haplotypes.size() : 1;
+
+ if (verify_haplotypes) {
+ verified = verify_genomic_loc_match(tag, i->second, query_haplotypes, nosnps);
+ ver_hap += verified;
+ if (verified == 0) nomatch++;
+ } else {
+ i->second->add_match(tag->id, tag->strings.begin()->first);
+ }
+ }
+ }
+ }
}
- cerr << keys.size() << " stacks matched against the catalog containing " << sample_1.size() << " loci.\n"
- << " " << matches << " matching loci, " << nomatch << " contained no verified haplotypes.\n"
- << " " << nosnps << " loci contained SNPs unaccounted for in the catalog and were excluded.\n"
- << " " << tot_hap << " total haplotypes examined from matching loci, " << ver_hap << " verified.\n";
+ cerr << keys.size() << " stacks matched against the catalog containing " << sample_1.size() << " loci.\n"
+ << " " << matches << " matching loci, " << nomatch << " contained no verified haplotypes.\n"
+ << " " << nosnps << " loci contained SNPs unaccounted for in the catalog and were excluded.\n"
+ << " " << tot_hap << " total haplotypes examined from matching loci, " << ver_hap << " verified.\n";
return 0;
}
@@ -242,7 +249,7 @@ int verify_genomic_loc_match(Locus *s1_tag, QLocus *s2_tag, set<string> &query_h
//
// We have found a match between the genomic location of s1 and s2. We now want
- // to verify that the haplotypes are consistent between the tags, i.e. they
+ // to verify that the haplotypes are consistent between the tags, i.e. they
// have the same number and types of SNPs.
//
@@ -252,37 +259,37 @@ int verify_genomic_loc_match(Locus *s1_tag, QLocus *s2_tag, set<string> &query_h
//
uint min_len = s1_tag->len > s2_tag->len ? s2_tag->len : s1_tag->len;
- if (s1_tag->snps.size() == 0 &&
- s2_tag->snps.size() == 0 &&
- strncmp(s1_tag->con, s2_tag->con, min_len) == 0) {
- s2_tag->add_match(s1_tag->id, "consensus");
- return 1;
+ if (s1_tag->snps.size() == 0 &&
+ s2_tag->snps.size() == 0 &&
+ strncmp(s1_tag->con, s2_tag->con, min_len) == 0) {
+ s2_tag->add_match(s1_tag->id, "consensus");
+ return 1;
}
//
- // 2. Second, we will check that the query locus (s2_tag) does not have any SNPs
+ // 2. Second, we will check that the query locus (s2_tag) does not have any SNPs
// lacking in the catalog tag (s1_tag).
//
bool found;
for (j = s2_tag->snps.begin(); j != s2_tag->snps.end(); j++) {
- found = false;
- //
- // SNP occurs in a column that is beyond the length of the catalog
- //
- if ((*j)->col > min_len - 1)
- continue;
-
- for (i = s1_tag->snps.begin(); i != s1_tag->snps.end(); i++) {
- if ((*i)->col == (*j)->col)
- found = true;
- }
- //
- // Query locus posses a SNP not present in the catalog.
- //
- if (found == false) {
- nosnps++;
- return 0;
- }
+ found = false;
+ //
+ // SNP occurs in a column that is beyond the length of the catalog
+ //
+ if ((*j)->col > min_len - 1)
+ continue;
+
+ for (i = s1_tag->snps.begin(); i != s1_tag->snps.end(); i++) {
+ if ((*i)->col == (*j)->col)
+ found = true;
+ }
+ //
+ // Query locus posses a SNP not present in the catalog.
+ //
+ if (found == false) {
+ nosnps++;
+ return 0;
+ }
}
//
@@ -296,83 +303,83 @@ int verify_genomic_loc_match(Locus *s1_tag, QLocus *s2_tag, set<string> &query_h
uint matches = 0;
for (a = query_haplotypes.begin(); a != query_haplotypes.end(); a++) {
- if (impute_haplotypes) {
- int res = impute_haplotype(*a, s1_tag->strings, cat_haplotype);
-
- if (res > 0) {
- //
- // If the matching haplotype was imputed, record the depths of the query alleles
- // under the new, imputed alleles.
- //
- if (s2_tag->alleles.count(cat_haplotype) == 0) {
- if (s2_tag->alleles.count(*a) > 0)
- s2_tag->alleles[cat_haplotype] = s2_tag->alleles[*a];
- else
- s2_tag->alleles[cat_haplotype] = s2_tag->depth;
- }
- //cerr << s2_tag->id << "; Adding cat haplotype: " << cat_haplotype << " based on depth of " << *a << ", " << s2_tag->alleles[cat_haplotype] << "\n";
- s2_tag->add_match(s1_tag->id, cat_haplotype);
- matches++;
- } else if (res < 0) {
- cerr << " Failure imputing haplotype for catalog locus: " << s1_tag->id << " and query tag: " << s2_tag->id << "\n";
- }
- } else {
- for (c = s1_tag->strings.begin(); c != s1_tag->strings.end(); c++)
- if (*a == c->first) {
- //cerr << " Adding match between " << s1_tag->id << " and " << c->first << "\n";
- s2_tag->add_match(s1_tag->id, c->first);
- matches++;
- }
- }
+ if (impute_haplotypes) {
+ int res = impute_haplotype(*a, s1_tag->strings, cat_haplotype);
+
+ if (res > 0) {
+ //
+ // If the matching haplotype was imputed, record the depths of the query alleles
+ // under the new, imputed alleles.
+ //
+ if (s2_tag->alleles.count(cat_haplotype) == 0) {
+ if (s2_tag->alleles.count(*a) > 0)
+ s2_tag->alleles[cat_haplotype] = s2_tag->alleles[*a];
+ else
+ s2_tag->alleles[cat_haplotype] = s2_tag->depth;
+ }
+ //cerr << s2_tag->id << "; Adding cat haplotype: " << cat_haplotype << " based on depth of " << *a << ", " << s2_tag->alleles[cat_haplotype] << "\n";
+ s2_tag->add_match(s1_tag->id, cat_haplotype);
+ matches++;
+ } else if (res < 0) {
+ cerr << " Failure imputing haplotype for catalog locus: " << s1_tag->id << " and query tag: " << s2_tag->id << "\n";
+ }
+ } else {
+ for (c = s1_tag->strings.begin(); c != s1_tag->strings.end(); c++)
+ if (*a == c->first) {
+ //cerr << " Adding match between " << s1_tag->id << " and " << c->first << "\n";
+ s2_tag->add_match(s1_tag->id, c->first);
+ matches++;
+ }
+ }
}
return matches;
}
-// int impute_haplotype(string query_haplotype,
-// vector<pair<allele_type, string> > &cat_haplotypes,
-// string &match) {
+// int impute_haplotype(string query_haplotype,
+// vector<pair<allele_type, string> > &cat_haplotypes,
+// string &match) {
// uint max_len = query_haplotype.length() > cat_haplotypes[0].first.length() ?
-// query_haplotype.length() :
-// cat_haplotypes[0].first.length();
+// query_haplotype.length() :
+// cat_haplotypes[0].first.length();
// //cerr << "Query len: " << query_haplotype.length() << "; Max length: " << max_len << "\n";
// vector<string> cur, next;
// for (uint i = 0; i < cat_haplotypes.size(); i++)
-// cur.push_back(cat_haplotypes[i].first);
+// cur.push_back(cat_haplotypes[i].first);
// match = "";
// //
-// // Examine the haplotypes one SNP at a time. If we are able to uniquely
-// // determine the catalog haplotype that the query haplotype corresponds
+// // Examine the haplotypes one SNP at a time. If we are able to uniquely
+// // determine the catalog haplotype that the query haplotype corresponds
// // to, return it.
// //
// uint j = 0;
// while (cur.size() > 1 && j < max_len) {
-// for (uint i = 0; i < cur.size(); i++) {
-// //cerr << "Comparing query[" << j << "]: '" << query_haplotype[j] << "' to catalog '" << cur[i][j] << "'\n";
-// if (query_haplotype[j] == cur[i][j]) {
-// //cerr << " Keeping this haplotype.\n";
-// next.push_back(cur[i]);
-// }
-// }
-// cur = next;
-// next.clear();
-// j++;
+// for (uint i = 0; i < cur.size(); i++) {
+// //cerr << "Comparing query[" << j << "]: '" << query_haplotype[j] << "' to catalog '" << cur[i][j] << "'\n";
+// if (query_haplotype[j] == cur[i][j]) {
+// //cerr << " Keeping this haplotype.\n";
+// next.push_back(cur[i]);
+// }
+// }
+// cur = next;
+// next.clear();
+// j++;
// }
// //
// // If there is only one left, make sure what we have of the haplotype does match
// // and its not simply an erroneously called haplotype.
// //
-// if (cur.size() == 1 &&
-// strncmp(cur[0].c_str(), query_haplotype.c_str(), max_len) == 0) {
-// match = cur[0];
-// return 1;
+// if (cur.size() == 1 &&
+// strncmp(cur[0].c_str(), query_haplotype.c_str(), max_len) == 0) {
+// match = cur[0];
+// return 1;
// }
// //
@@ -382,19 +389,19 @@ int verify_genomic_loc_match(Locus *s1_tag, QLocus *s2_tag, set<string> &query_h
// return 0;
// }
-int impute_haplotype(string query_haplotype,
- vector<pair<allele_type, string> > &cat_haplotypes,
- string &match) {
+int impute_haplotype(string query_haplotype,
+ vector<pair<allele_type, string> > &cat_haplotypes,
+ string &match) {
if (cat_haplotypes.size() == 0) {
- cerr << "Warning: malformed catalog tag: missing haplotype information.\n";
- return -1;
+ cerr << "Warning: malformed catalog tag: missing haplotype information.\n";
+ return -1;
}
//cerr << "Examining " << query_haplotype << "\n";
uint max_len = query_haplotype.length() > cat_haplotypes[0].first.length() ?
- query_haplotype.length() :
- cat_haplotypes[0].first.length();
+ query_haplotype.length() :
+ cat_haplotypes[0].first.length();
//cerr << "Query len: " << query_haplotype.length() << "; Max length: " << max_len << "\n";
@@ -402,32 +409,32 @@ int impute_haplotype(string query_haplotype,
uint match_cnt, no_n_cnt;
for (uint i = 0; i < cat_haplotypes.size(); i++)
- cur.push_back(cat_haplotypes[i].first);
+ cur.push_back(cat_haplotypes[i].first);
match = "";
//
- // Examine the haplotypes one SNP at a time. If we are able to uniquely
- // determine the catalog haplotype that the query haplotype corresponds
+ // Examine the haplotypes one SNP at a time. If we are able to uniquely
+ // determine the catalog haplotype that the query haplotype corresponds
// to, return it.
//
uint j = 0;
while (cur.size() > 1 && j < max_len) {
- for (uint i = 0; i < cur.size(); i++) {
- //cerr << "Comparing query[" << j << "]: '" << query_haplotype << "' to catalog '" << cur[i] << "'\n";
- if (require_uniq_haplotypes && (query_haplotype[j] == cur[i][j] || query_haplotype[j] == 'N')) {
- //cerr << " Keeping this haplotype.\n";
- next.push_back(cur[i]);
- } else if (query_haplotype[j] == cur[i][j]) {
- //cerr << " Keeping this haplotype.\n";
- next.push_back(cur[i]);
- } //else {
- //cerr << " Discarding this haplotype.\n";
- //}
- }
- cur = next;
- next.clear();
- j++;
+ for (uint i = 0; i < cur.size(); i++) {
+ //cerr << "Comparing query[" << j << "]: '" << query_haplotype << "' to catalog '" << cur[i] << "'\n";
+ if (require_uniq_haplotypes && (query_haplotype[j] == cur[i][j] || query_haplotype[j] == 'N')) {
+ //cerr << " Keeping this haplotype.\n";
+ next.push_back(cur[i]);
+ } else if (query_haplotype[j] == cur[i][j]) {
+ //cerr << " Keeping this haplotype.\n";
+ next.push_back(cur[i]);
+ } //else {
+ //cerr << " Discarding this haplotype.\n";
+ //}
+ }
+ cur = next;
+ next.clear();
+ j++;
}
//
@@ -437,23 +444,23 @@ int impute_haplotype(string query_haplotype,
no_n_cnt = 0;
match_cnt = 0;
if (cur.size() == 1) {
- if (require_uniq_haplotypes) {
- for (uint k = 0; k < max_len; k++)
- if (query_haplotype[k] != 'N') no_n_cnt++;
- for (uint k = 0; k < max_len; k++)
- if (cur[0][k] == query_haplotype[k]) match_cnt++;
-
- if (match_cnt == no_n_cnt) {
- //cerr << "Keeping " << query_haplotype << "\n";
- match = cur[0];
- return 1;
- }
- } else {
- if (strncmp(cur[0].c_str(), query_haplotype.c_str(), max_len) == 0) {
- match = cur[0];
- return 1;
- }
- }
+ if (require_uniq_haplotypes) {
+ for (uint k = 0; k < max_len; k++)
+ if (query_haplotype[k] != 'N') no_n_cnt++;
+ for (uint k = 0; k < max_len; k++)
+ if (cur[0][k] == query_haplotype[k]) match_cnt++;
+
+ if (match_cnt == no_n_cnt) {
+ //cerr << "Keeping " << query_haplotype << "\n";
+ match = cur[0];
+ return 1;
+ }
+ } else {
+ if (strncmp(cur[0].c_str(), query_haplotype.c_str(), max_len) == 0) {
+ match = cur[0];
+ return 1;
+ }
+ }
}
//
@@ -463,17 +470,17 @@ int impute_haplotype(string query_haplotype,
return 0;
}
-int
+int
generate_query_haplotypes(Locus *s1_tag, QLocus *s2_tag, set<string> &query_haplotypes)
{
//
// Construct a set of haplotypes from the query locus relative to the catalog locus.
- // (The query locus already has a set of haplotypes, however, they don't necessarily
+ // (The query locus already has a set of haplotypes, however, they don't necessarily
// account for all the SNPs in the catalog, so we will augment them with sequence
// from the consensus.)
//
if (s1_tag->snps.size() == 0 && s2_tag->snps.size() == 0)
- return 0;
+ return 0;
vector<pair<string, SNP *> > merged_snps;
map<int, pair<string, SNP *> > columns;
@@ -482,20 +489,20 @@ generate_query_haplotypes(Locus *s1_tag, QLocus *s2_tag, set<string> &query_hapl
vector<SNP *>::iterator i;
for (i = s1_tag->snps.begin(); i != s1_tag->snps.end(); i++)
- columns[(*i)->col] = make_pair("catalog", *i);
+ columns[(*i)->col] = make_pair("catalog", *i);
for (i = s2_tag->snps.begin(); i != s2_tag->snps.end(); i++) {
- //
- // Is this column already represented in the catalog?
- //
- if (columns.count((*i)->col))
- columns[(*i)->col] = make_pair("both", *i);
- else
- columns[(*i)->col] = make_pair("query", *i);
+ //
+ // Is this column already represented in the catalog?
+ //
+ if (columns.count((*i)->col))
+ columns[(*i)->col] = make_pair("both", *i);
+ else
+ columns[(*i)->col] = make_pair("query", *i);
}
- for (c = columns.begin(); c != columns.end(); c++)
- merged_snps.push_back((*c).second);
+ for (c = columns.begin(); c != columns.end(); c++)
+ merged_snps.push_back((*c).second);
//
// Sort the SNPs by column
@@ -508,44 +515,44 @@ generate_query_haplotypes(Locus *s1_tag, QLocus *s2_tag, set<string> &query_hapl
int pos;
for (b = s2_tag->alleles.begin(); b != s2_tag->alleles.end(); b++) {
- old_allele = b->first;
- new_allele = "";
- pos = 0;
-
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- //
- // If the SNPs from the catalog haplotype beyond the length of the query, add Ns
- //
- if (k->first == "catalog") {
- new_allele += (k->second->col > s2_tag->len - 1) ? 'N' : s2_tag->con[k->second->col];
- } else {
- new_allele += old_allele[pos];
- pos++;
- }
- }
- query_haplotypes.insert(new_allele);
- converted_alleles[new_allele] = b->second;
-
- // cerr << "Adding haplotype: " << new_allele << " [" << b->first << "]\n";
+ old_allele = b->first;
+ new_allele = "";
+ pos = 0;
+
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ //
+ // If the SNPs from the catalog haplotype beyond the length of the query, add Ns
+ //
+ if (k->first == "catalog") {
+ new_allele += (k->second->col > s2_tag->len - 1) ? 'N' : s2_tag->con[k->second->col];
+ } else {
+ new_allele += old_allele[pos];
+ pos++;
+ }
+ }
+ query_haplotypes.insert(new_allele);
+ converted_alleles[new_allele] = b->second;
+
+ // cerr << "Adding haplotype: " << new_allele << " [" << b->first << "]\n";
}
if (s2_tag->alleles.size() == 0) {
- new_allele = "";
- for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
- new_allele += (k->second->col > s2_tag->len - 1) ? 'N' : s2_tag->con[k->second->col];
- }
- query_haplotypes.insert(new_allele);
- // cerr << "Adding haplotype 2: " << new_allele << "\n";
+ new_allele = "";
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ new_allele += (k->second->col > s2_tag->len - 1) ? 'N' : s2_tag->con[k->second->col];
+ }
+ query_haplotypes.insert(new_allele);
+ // cerr << "Adding haplotype 2: " << new_allele << "\n";
} else {
- s2_tag->alleles.clear();
- for (b = converted_alleles.begin(); b != converted_alleles.end(); b++)
- s2_tag->alleles[b->first] = b->second;
+ s2_tag->alleles.clear();
+ for (b = converted_alleles.begin(); b != converted_alleles.end(); b++)
+ s2_tag->alleles[b->first] = b->second;
}
return 0;
}
-int
+int
find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample_2)
{
map<int, QLocus *>::iterator i;
@@ -554,9 +561,9 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
//
// We don't assume all radtags will be the same length.
//
- min_tag_len =
- sample_1.begin()->second->len > sample_2.begin()->second->len ?
- sample_2.begin()->second->len : sample_1.begin()->second->len;
+ min_tag_len =
+ sample_1.begin()->second->len > sample_2.begin()->second->len ?
+ sample_2.begin()->second->len : sample_1.begin()->second->len;
//
// Build a hash map out of the catalog, using only the minimum length
@@ -571,8 +578,8 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
// our map to a vector of integer keys.
//
vector<int> keys;
- for (i = sample_2.begin(); i != sample_2.end(); i++)
- keys.push_back(i->first);
+ for (i = sample_2.begin(); i != sample_2.end(); i++)
+ keys.push_back(i->first);
//
// Initialize some counters
@@ -588,8 +595,8 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
#pragma omp parallel
{
#pragma omp for reduction(+:matches) reduction(+:tot_hap) reduction(+:ver_hap) reduction(+:nomatch) reduction(+:mmatch)
- for (uint k = 0; k < keys.size(); k++) {
- QLocus *query = sample_2[keys[k]];
+ for (uint k = 0; k < keys.size(); k++) {
+ QLocus *query = sample_2[keys[k]];
//
// Iterate through the haplotypes for this tag in sample_2
@@ -597,8 +604,8 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
HashMap::iterator hit;
vector<pair<allele_type, string> >::iterator q; // Query records allele_type/search string pairs
vector<pair<int, allele_type> >::iterator c; // Hash map records id/allele_type pairs
- map<string, vector<string> > haplo_hits;
- set<int> loci_hit;
+ map<string, vector<string> > haplo_hits;
+ set<int> loci_hit;
for (q = query->strings.begin(); q != query->strings.end(); q++) {
// cerr << " Looking for haplotype: " << q->first << " with sequence " << q->second.substr(0, min_tag_len) << "\n";
@@ -606,37 +613,37 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
hit = sample_1_map.find(q->second.substr(0, min_tag_len).c_str());
if (hit != sample_1_map.end()) {
- tot_hap++;
+ tot_hap++;
// cerr << " Found a match for " << hit->first << "\n";
for (c = hit->second.begin(); c != hit->second.end(); c++) {
- //
- // Record the catalog loci hit by the haplotypes of this query locus.
- //
- loci_hit.insert(c->first);
-
- //
- // Record the haplotypes hit between the query and catalog loci.
- //
- haplo_hits[q->first].push_back(c->second);
-
- if (verify_haplotypes == false)
- query->add_match(c->first, c->second);
- }
+ //
+ // Record the catalog loci hit by the haplotypes of this query locus.
+ //
+ loci_hit.insert(c->first);
+
+ //
+ // Record the haplotypes hit between the query and catalog loci.
+ //
+ haplo_hits[q->first].push_back(c->second);
+
+ if (verify_haplotypes == false)
+ query->add_match(c->first, c->second);
+ }
}
}
if (loci_hit.size() == 0)
nomatch++;
- else if (loci_hit.size() > 0)
+ else if (loci_hit.size() > 0)
matches++;
- if (verify_haplotypes && loci_hit.size() > 0) {
- uint verified = verify_sequence_match(sample_1, query, loci_hit, haplo_hits,
- min_tag_len, mmatch, nosnps);
- ver_hap += verified;
- if (verified == 0) no_haps++;
- }
+ if (verify_haplotypes && loci_hit.size() > 0) {
+ uint verified = verify_sequence_match(sample_1, query, loci_hit, haplo_hits,
+ min_tag_len, mmatch, nosnps);
+ ver_hap += verified;
+ if (verified == 0) no_haps++;
+ }
}
}
@@ -650,18 +657,18 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
delete [] sample_1_map_keys[i];
sample_1_map_keys.clear();
- cerr << keys.size() << " stacks compared against the catalog containing " << sample_1.size() << " loci.\n"
- << " " << matches << " matching loci, " << no_haps << " contained no verified haplotypes.\n"
- << " " << mmatch << " loci matched more than one catalog locus and were excluded.\n"
- << " " << nosnps << " loci contained SNPs unaccounted for in the catalog and were excluded.\n"
- << " " << tot_hap << " total haplotypes examined from matching loci, " << ver_hap << " verified.\n";
+ cerr << keys.size() << " stacks compared against the catalog containing " << sample_1.size() << " loci.\n"
+ << " " << matches << " matching loci, " << no_haps << " contained no verified haplotypes.\n"
+ << " " << mmatch << " loci matched more than one catalog locus and were excluded.\n"
+ << " " << nosnps << " loci contained SNPs unaccounted for in the catalog and were excluded.\n"
+ << " " << tot_hap << " total haplotypes examined from matching loci, " << ver_hap << " verified.\n";
return 0;
}
-int verify_sequence_match(map<int, Locus *> &sample_1, QLocus *query,
- set<int> &loci_hit, map<string, vector<string> > &haplo_hits,
- uint min_tag_len, unsigned long &mmatch, unsigned long &nosnps) {
+int verify_sequence_match(map<int, Locus *> &sample_1, QLocus *query,
+ set<int> &loci_hit, map<string, vector<string> > &haplo_hits,
+ uint min_tag_len, unsigned long &mmatch, unsigned long &nosnps) {
//
// 1. Check that this query locus matches just a single catalog locus.
//
@@ -679,28 +686,28 @@ int verify_sequence_match(map<int, Locus *> &sample_1, QLocus *query,
bool found;
for (i = query->snps.begin(); i != query->snps.end(); i++) {
- found = false;
- //
- // SNP occurs in a column that is beyond the length of the catalog
- //
- if ((*i)->col > min_tag_len - 1)
- continue;
-
- for (j = cat->snps.begin(); j != cat->snps.end(); j++) {
- if ((*i)->col == (*j)->col)
- found = true;
- }
- //
- // Query locus posses a SNP not present in the catalog.
- //
- if (found == false) {
- nosnps++;
- return 0;
- }
+ found = false;
+ //
+ // SNP occurs in a column that is beyond the length of the catalog
+ //
+ if ((*i)->col > min_tag_len - 1)
+ continue;
+
+ for (j = cat->snps.begin(); j != cat->snps.end(); j++) {
+ if ((*i)->col == (*j)->col)
+ found = true;
+ }
+ //
+ // Query locus posses a SNP not present in the catalog.
+ //
+ if (found == false) {
+ nosnps++;
+ return 0;
+ }
}
//
- // 3. We want a one-to-one correspondance between a query haplotype and a
+ // 3. We want a one-to-one correspondance between a query haplotype and a
// catalog haplotype. This relationship fails when the catalog and query seqeunces
// are different lengths and the full length haplotype can not be determined.
//
@@ -708,38 +715,38 @@ int verify_sequence_match(map<int, Locus *> &sample_1, QLocus *query,
map<string, int> cat_hap, query_hap;
for (it = haplo_hits.begin(); it != haplo_hits.end(); it++) {
- query_hap[it->first] = it->second.size();
- for (uint j = 0; j < it->second.size(); j++)
- cat_hap[it->second[j]]++;
+ query_hap[it->first] = it->second.size();
+ for (uint j = 0; j < it->second.size(); j++)
+ cat_hap[it->second[j]]++;
}
uint verified = 0;
for (it = haplo_hits.begin(); it != haplo_hits.end(); it++)
- for (uint j = 0; j < it->second.size(); j++) {
- if (cat_hap[it->second[j]] == 1 &&
- query_hap[it->first] == 1) {
- verified++;
- query->add_match(cat->id, it->second[j]);
- //
- // If the matching haplotype was imputed, record the depths of the query alleles
- // under the new, imputed alleles.
- //
- if (query->alleles.count(it->second[j]) == 0) {
- if (query->alleles.count(it->first) > 0)
- query->alleles[it->second[j]] = query->alleles[it->first];
- else
- query->alleles[it->second[j]] = query->depth;
- }
- }
- }
+ for (uint j = 0; j < it->second.size(); j++) {
+ if (cat_hap[it->second[j]] == 1 &&
+ query_hap[it->first] == 1) {
+ verified++;
+ query->add_match(cat->id, it->second[j]);
+ //
+ // If the matching haplotype was imputed, record the depths of the query alleles
+ // under the new, imputed alleles.
+ //
+ if (query->alleles.count(it->second[j]) == 0) {
+ if (query->alleles.count(it->first) > 0)
+ query->alleles[it->second[j]] = query->alleles[it->first];
+ else
+ query->alleles[it->second[j]] = query->depth;
+ }
+ }
+ }
return verified;
}
int
search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
- KmerHashMap &kmer_map, map<int, pair<allele_type, int> > &allele_map,
- double min_match_len)
+ KmerHashMap &kmer_map, map<int, pair<allele_type, int> > &allele_map,
+ double min_match_len)
{
//
// Search for loci that can be merged with a gapped alignment.
@@ -751,7 +758,7 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
//
map<int, QLocus *>::iterator it;
vector<int> keys;
- for (it = sample.begin(); it != sample.end(); it++)
+ for (it = sample.begin(); it != sample.end(); it++)
keys.push_back(it->first);
//
@@ -782,22 +789,22 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
{
QLocus *query;
Locus *tag_2;
- KmerHashMap::iterator h;
+ KmerHashMap::iterator h;
AlignRes aln_res;
- vector<char *> kmers;
- set<string> uniq_kmers;
- vector<int> hits;
- vector<pair<int, int> > ordered_hits;
- uint hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
- pair<allele_type, int> cat_hit;
- string query_allele, query_seq, cat_allele, cat_seq;
- map<string, vector<string> > haplo_hits;
- set<int> loci_hit;
+ vector<char *> kmers;
+ set<string> uniq_kmers;
+ vector<int> hits;
+ vector<pair<int, int> > ordered_hits;
+ uint hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
+ pair<allele_type, int> cat_hit;
+ string query_allele, query_seq, cat_allele, cat_seq;
+ map<string, vector<string> > haplo_hits;
+ set<int> loci_hit;
vector<pair<char, uint> > cigar;
- GappedAln *aln = new GappedAln();
+ GappedAln *aln = new GappedAln();
- initialize_kmers(gapped_kmer_len, num_kmers, kmers);
+ initialize_kmers(gapped_kmer_len, num_kmers, kmers);
#pragma omp for schedule(dynamic) reduction(+:matches) reduction(+:nomatches) reduction(+:mmatches) reduction(+:gapped_aln) reduction(+:ver_hap) \
reduction(+:tot_hap) reduction(+:bad_aln) reduction(+:no_haps)
@@ -810,7 +817,7 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
if (query->matches.size() > 0)
continue;
- gapped_aln++;
+ gapped_aln++;
map<allele_type, map<allele_type, AlignRes> > query_hits;
@@ -823,108 +830,108 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
query_seq = allele->second;
tot_hap++;
- generate_kmers_lazily(allele->second.c_str(), gapped_kmer_len, num_kmers, kmers);
+ generate_kmers_lazily(allele->second.c_str(), gapped_kmer_len, num_kmers, kmers);
- //
- // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
- // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
- //
- uniq_kmers.clear();
+ //
+ // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
+ // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
+ //
+ uniq_kmers.clear();
for (int j = 0; j < num_kmers; j++)
- uniq_kmers.insert(kmers[j]);
+ uniq_kmers.insert(kmers[j]);
- hits.clear();
- ordered_hits.clear();
+ hits.clear();
+ ordered_hits.clear();
- //
- // Lookup the occurances of each k-mer in the kmer_map
- //
- for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
+ //
+ // Lookup the occurances of each k-mer in the kmer_map
+ //
+ for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
- h = kmer_map.find(j->c_str());
+ h = kmer_map.find(j->c_str());
- if (h != kmer_map.end())
- for (uint k = 0; k < h->second.size(); k++)
- hits.push_back(h->second[k]);
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits.push_back(h->second[k]);
}
- //
- // Sort the vector of indexes; provides the number of hits to each allele/locus
- // and orders them largest to smallest.
- //
- sort(hits.begin(), hits.end());
+ //
+ // Sort the vector of indexes; provides the number of hits to each allele/locus
+ // and orders them largest to smallest.
+ //
+ sort(hits.begin(), hits.end());
- //
- // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
- //
- hits_size = hits.size();
+ //
+ // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
+ //
+ hits_size = hits.size();
if (hits_size == 0)
continue;
- prev_id = hits[0];
- index = 0;
+ prev_id = hits[0];
+ index = 0;
- do {
- hit_cnt = 0;
- allele_id = prev_id;
+ do {
+ hit_cnt = 0;
+ allele_id = prev_id;
- while ((uint)hits[index] == prev_id) {
- hit_cnt++;
- index++;
- }
+ while (index < hits_size && (uint) hits[index] == prev_id) {
+ hit_cnt++;
+ index++;
+ }
- if (index < hits_size)
- prev_id = hits[index];
+ if (index < hits_size)
+ prev_id = hits[index];
- if (hit_cnt >= (uint)min_hits)
- ordered_hits.push_back(make_pair(allele_id, hit_cnt));
+ if (hit_cnt >= (uint) min_hits)
+ ordered_hits.push_back(make_pair(allele_id, hit_cnt));
- } while (index < hits_size);
+ } while (index < hits_size);
if (ordered_hits.size() == 0)
continue;
- //
- // Process the hits from most kmer hits to least kmer hits.
- //
- sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);
+ //
+ // Process the hits from most kmer hits to least kmer hits.
+ //
+ sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);
- //
- // Only try to align the sequences with the most kmers in common.
- //
- top_hit = ordered_hits[0].second;
+ //
+ // Only try to align the sequences with the most kmers in common.
+ //
+ top_hit = ordered_hits[0].second;
stop = 1;
- for (uint j = 1; j < ordered_hits.size(); j++)
- if ((uint)ordered_hits[j].second < top_hit) {
- stop = j;
- break;
- }
+ for (uint j = 1; j < ordered_hits.size(); j++)
+ if ((uint)ordered_hits[j].second < top_hit) {
+ stop = j;
+ break;
+ }
- for (uint j = 0; j < stop; j++) {
- cat_hit = allele_map.at(ordered_hits[j].first);
- hit_cnt = ordered_hits[j].second;
+ for (uint j = 0; j < stop; j++) {
+ cat_hit = allele_map.at(ordered_hits[j].first);
+ hit_cnt = ordered_hits[j].second;
- tag_2 = catalog[cat_hit.second];
+ tag_2 = catalog[cat_hit.second];
cat_allele = cat_hit.first;
- cat_seq = "";
- for (uint k = 0; k < tag_2->strings.size(); k++)
- if (tag_2->strings[k].first == cat_hit.first) {
- cat_seq = tag_2->strings[k].second;
- break;
- }
+ cat_seq = "";
+ for (uint k = 0; k < tag_2->strings.size(); k++)
+ if (tag_2->strings[k].first == cat_hit.first) {
+ cat_seq = tag_2->strings[k].second;
+ break;
+ }
- aln->init(tag_2->len, query->len);
+ aln->init(tag_2->len, query->len);
- // cerr << "Attempting to align: cat id " << tag_2->id << " with locus id " << query->id << "\n"
+ // cerr << "Attempting to align: cat id " << tag_2->id << " with locus id " << query->id << "\n"
// << "Cat allele: " << cat_allele << "; seq: " << cat_seq << "\n"
// << "Allele: " << query_allele << "; seq: " << allele->second << "\n";
- if (aln->align(cat_seq, query_seq)) {
- aln->parse_cigar(cigar);
+ if (aln->align(cat_seq, query_seq)) {
+ aln->parse_cigar(cigar);
- aln_res = aln->result();
+ aln_res = aln->result();
//
// At this point in the analysis, all possible alleles we want to detect must already
@@ -935,20 +942,20 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
if (aln_res.cigar.find('D') != string::npos)
continue;
- //
- // If the alignment has too many gaps, skip it.
- // If the alignment doesn't span enough of the two sequences, skip it.
- //
- if (aln_res.gap_cnt <= (max_gaps + 1) &&
- aln_res.pct_id >= min_match_len &&
- dist(cat_seq.c_str(), query_seq.c_str(), cigar) == 0) {
+ //
+ // If the alignment has too many gaps, skip it.
+ // If the alignment doesn't span enough of the two sequences, skip it.
+ //
+ if (aln_res.gap_cnt <= (max_gaps + 1) &&
+ aln_res.pct_id >= min_match_len &&
+ dist(cat_seq.c_str(), query_seq.c_str(), cigar) == 0) {
// cerr << "Adding match: " << aln_res.cigar << "\n";
loci_hit.insert(tag_2->id);
query_hits[query_allele][cat_allele] = aln_res;
- }
- }
- }
- }
+ }
+ }
+ }
+ }
if (verify_gapped_match(catalog, query, loci_hit, query_hits, mmatches, nosnps, no_haps, bad_aln, ver_hap))
matches++;
@@ -956,20 +963,20 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
nomatches++;
}
- //
- // Free the k-mers we generated for this query and the alignment class.
- //
- for (uint j = 0; j < kmers.size(); j++)
- delete [] kmers[j];
- kmers.clear();
+ //
+ // Free the k-mers we generated for this query and the alignment class.
+ //
+ for (uint j = 0; j < kmers.size(); j++)
+ delete [] kmers[j];
+ kmers.clear();
- delete aln;
+ delete aln;
}
cerr << "Out of " << keys.size() << " query loci, " << gapped_aln << " gapped alignments attempted.\n"
- << " " << matches << " loci matched one catalog locus; " << tot_hap << " total haplotypes examined, " << ver_hap << " verified.\n"
- << " " << nomatches << " loci matched no catalog locus;\n"
- << " " << mmatches << " loci matched more than one catalog locus and were excluded.\n"
+ << " " << matches << " loci matched one catalog locus; " << tot_hap << " total haplotypes examined, " << ver_hap << " verified.\n"
+ << " " << nomatches << " loci matched no catalog locus;\n"
+ << " " << mmatches << " loci matched more than one catalog locus and were excluded.\n"
<< " " << nosnps << " loci contained SNPs unaccounted for in the catalog and were excluded.\n"
<< " " << no_haps << " loci had no verified haplotypes.\n"
<< " " << bad_aln << " loci had inconsistent alignments to a catalog locus and were excluded.\n";
@@ -979,8 +986,8 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
}
bool
-verify_gapped_match(map<int, Locus *> &catalog, QLocus *query,
- set<int> &loci_hit, map<allele_type, map<allele_type, AlignRes> > &query_hits,
+verify_gapped_match(map<int, Locus *> &catalog, QLocus *query,
+ set<int> &loci_hit, map<allele_type, map<allele_type, AlignRes> > &query_hits,
uint &mmatch, uint &nosnps, uint &no_haps, uint &bad_aln, uint &ver_hits) {
//
// 1. Check that this query locus matches just a single catalog locus.
@@ -1031,24 +1038,24 @@ verify_gapped_match(map<int, Locus *> &catalog, QLocus *query,
bool found;
for (i = query->snps.begin(); i != query->snps.end(); i++) {
- found = false;
- //
- // SNP occurs in a column that is beyond the length of the catalog
- //
- if ((int)(*i)->col > min_tag_len - 1)
- continue;
-
- for (j = cat->snps.begin(); j != cat->snps.end(); j++) {
- if ((*i)->col == (*j)->col)
- found = true;
- }
- //
- // Query locus posses a SNP not present in the catalog.
- //
- if (found == false) {
- nosnps++;
- return false;
- }
+ found = false;
+ //
+ // SNP occurs in a column that is beyond the length of the catalog
+ //
+ if ((int)(*i)->col > min_tag_len - 1)
+ continue;
+
+ for (j = cat->snps.begin(); j != cat->snps.end(); j++) {
+ if ((*i)->col == (*j)->col)
+ found = true;
+ }
+ //
+ // Query locus posses a SNP not present in the catalog.
+ //
+ if (found == false) {
+ nosnps++;
+ return false;
+ }
}
//
@@ -1101,7 +1108,7 @@ verify_gapped_match(map<int, Locus *> &catalog, QLocus *query,
// }
// }
}
-
+
if (verified > 0) {
ver_hits += verified;
@@ -1137,7 +1144,7 @@ generate_query_allele(Locus *ctag, Locus *qtag, allele_type allele)
if (qtag->snps.size() == 0) {
for (uint i = 0; i < ctag->snps.size(); i++)
new_allele += ctag->snps[i]->col > qtag->len - 1 ? 'N' : qtag->con[ctag->snps[i]->col];
-
+
} else {
uint pos = 0;
uint index = 0;
@@ -1171,13 +1178,13 @@ populate_hash(map<int, Locus *> &sample, HashMap &hash_map, vector<char *> &hash
tag = it->second;
for (all_it = tag->strings.begin(); all_it != tag->strings.end(); all_it++) {
- key = new char[min_tag_len + 1];
- strncpy(key, all_it->second.c_str(), min_tag_len);
- key[min_tag_len] = '\0';
+ key = new char[min_tag_len + 1];
+ strncpy(key, all_it->second.c_str(), min_tag_len);
+ key[min_tag_len] = '\0';
- hash_map[key].push_back(make_pair(tag->id, all_it->first));
+ hash_map[key].push_back(make_pair(tag->id, all_it->first));
hash_map_keys.push_back(key);
- }
+ }
}
//dump_kmer_map(kmer_map);
@@ -1185,8 +1192,8 @@ populate_hash(map<int, Locus *> &sample, HashMap &hash_map, vector<char *> &hash
return 0;
}
-int
-write_matches(string sample_path, map<int, QLocus *> &sample)
+int
+write_matches(string sample_path, map<int, QLocus *> &sample)
{
map<int, QLocus *>::iterator i;
@@ -1197,7 +1204,7 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
string out_file = out_path + sample_path.substr(pos_1 + 1) + ".matches.tsv";
if (in_file_type == FileT::gzsql)
- out_file += ".gz";
+ out_file += ".gz";
//
// Open the output files for writing.
@@ -1205,20 +1212,20 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
gzFile gz_matches;
ofstream matches;
if (in_file_type == FileT::gzsql) {
- gz_matches = gzopen(out_file.c_str(), "wb");
- if (!gz_matches) {
- cerr << "Error: Unable to open gzipped matches file '" << out_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gz_matches = gzopen(out_file.c_str(), "wb");
+ if (!gz_matches) {
+ cerr << "Error: Unable to open gzipped matches file '" << out_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_matches, libz_buffer_size);
- #endif
+ gzbuffer(gz_matches, libz_buffer_size);
+ #endif
} else {
- matches.open(out_file.c_str());
- if (matches.fail()) {
- cerr << "Error: Unable to open matches file for writing.\n";
- exit(1);
- }
+ matches.open(out_file.c_str());
+ if (matches.fail()) {
+ cerr << "Error: Unable to open matches file for writing.\n";
+ exit(1);
+ }
}
//
@@ -1233,8 +1240,8 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
time(&rawtime);
timeinfo = localtime(&rawtime);
strftime(date, 32, "%F %T", timeinfo);
- log << "# sstacks version " << VERSION << "; generated on " << date << "\n";
- if (in_file_type == FileT::gzsql)
+ log << "# sstacks version " << VERSION << "; generated on " << date << "\n";
+ if (in_file_type == FileT::gzsql)
gzputs(gz_matches, log.str().c_str());
else
matches << log.str();
@@ -1247,29 +1254,29 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
cerr << "Outputing to file " << out_file << "\n";
for (i = sample.begin(); i != sample.end(); i++) {
- qloc = i->second;
-
- for (uint j = 0; j < qloc->matches.size(); j++) {
- if (verify_haplotypes == false && search_type == genomic_loc)
- match_depth = qloc->depth;
- else
- match_depth =
- qloc->alleles.count(qloc->matches[j]->cat_type) > 0 ?
- qloc->alleles[qloc->matches[j]->cat_type] : qloc->depth;
-
- sstr << "0" << "\t"
- << batch_id << "\t"
- << qloc->matches[j]->cat_id << "\t"
- << samp_id << "\t"
- << qloc->id << "\t"
- << qloc->matches[j]->cat_type << "\t"
- << match_depth << "\t"
- << qloc->lnl << "\t"
+ qloc = i->second;
+
+ for (uint j = 0; j < qloc->matches.size(); j++) {
+ if (verify_haplotypes == false && search_type == genomic_loc)
+ match_depth = qloc->depth;
+ else
+ match_depth =
+ qloc->alleles.count(qloc->matches[j]->cat_type) > 0 ?
+ qloc->alleles[qloc->matches[j]->cat_type] : qloc->depth;
+
+ sstr << "0" << "\t"
+ << batch_id << "\t"
+ << qloc->matches[j]->cat_id << "\t"
+ << samp_id << "\t"
+ << qloc->id << "\t"
+ << qloc->matches[j]->cat_type << "\t"
+ << match_depth << "\t"
+ << qloc->lnl << "\t"
<< qloc->matches[j]->cigar << "\n";
- }
+ }
- if (in_file_type == FileT::gzsql) gzputs(gz_matches, sstr.str().c_str()); else matches << sstr.str();
- sstr.str("");
+ if (in_file_type == FileT::gzsql) gzputs(gz_matches, sstr.str().c_str()); else matches << sstr.str();
+ sstr.str("");
}
if (in_file_type == FileT::gzsql)
@@ -1281,98 +1288,98 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
}
int parse_command_line(int argc, char* argv[]) {
- string sample_file;
- int c;
-
+ string sample_file;
+ int c;
+
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"genomic_loc", no_argument, NULL, 'g'},
- {"verify_hap", no_argument, NULL, 'x'},
- {"uniq_haplotypes", no_argument, NULL, 'u'},
+ {"genomic_loc", no_argument, NULL, 'g'},
+ {"verify_hap", no_argument, NULL, 'x'},
+ {"uniq_haplotypes", no_argument, NULL, 'u'},
{"gapped", no_argument, NULL, 'G'},
- {"num_threads", required_argument, NULL, 'p'},
- {"batch_id", required_argument, NULL, 'b'},
- {"catalog", required_argument, NULL, 'c'},
- {"sample_2", required_argument, NULL, 's'},
- {"outpath", required_argument, NULL, 'o'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hgGxuvs:c:o:b:p:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'p':
- num_threads = atoi(optarg);
- break;
- case 'b':
- batch_id = is_integer(optarg);
- if (batch_id < 0) {
- cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 's':
- sample_file = optarg;
- samples.push(sample_file);
- break;
- case 'g':
- search_type = genomic_loc;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'c':
- catalog_path = optarg;
- break;
- case 'x':
- verify_haplotypes = false;
- break;
- case 'u':
- require_uniq_haplotypes = true;
- break;
+ {"num_threads", required_argument, NULL, 'p'},
+ {"batch_id", required_argument, NULL, 'b'},
+ {"catalog", required_argument, NULL, 'c'},
+ {"sample_2", required_argument, NULL, 's'},
+ {"outpath", required_argument, NULL, 'o'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hgGxuvs:c:o:b:p:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'p':
+ num_threads = atoi(optarg);
+ break;
+ case 'b':
+ batch_id = is_integer(optarg);
+ if (batch_id < 0) {
+ cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 's':
+ sample_file = optarg;
+ samples.push(sample_file);
+ break;
+ case 'g':
+ search_type = genomic_loc;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'c':
+ catalog_path = optarg;
+ break;
+ case 'x':
+ verify_haplotypes = false;
+ break;
+ case 'u':
+ require_uniq_haplotypes = true;
+ break;
case 'G':
gapped_alignments = true;
break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
- default:
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+ default:
+ help();
+ abort();
+ }
}
if (catalog_path.length() == 0) {
- cerr << "You must specify the prefix path to the catalog.\n";
- help();
+ cerr << "You must specify the prefix path to the catalog.\n";
+ help();
}
if (samples.size() == 0) {
- cerr << "You must specify at least one sample file.\n";
- help();
+ cerr << "You must specify at least one sample file.\n";
+ help();
}
- if (out_path.length() == 0)
- out_path = ".";
+ if (out_path.length() == 0)
+ out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ if (out_path.at(out_path.length() - 1) != '/')
+ out_path += "/";
return 0;
}
@@ -1387,14 +1394,14 @@ void help() {
std::cerr << "sstacks " << VERSION << "\n"
<< "sstacks -b batch_id -c catalog_file -s sample_file [-s sample_file_2 ...] [-o path] [-p num_threads] [-g] [-x] [-v] [-h]" << "\n"
<< " p: enable parallel execution with num_threads threads.\n"
- << " b: MySQL ID of this batch." << "\n"
- << " c: TSV file from which to load the catalog loci." << "\n"
- << " s: filename prefix from which to load sample loci." << "\n"
- << " o: output path to write results." << "\n"
+ << " b: MySQL ID of this batch." << "\n"
+ << " c: TSV file from which to load the catalog loci." << "\n"
+ << " s: filename prefix from which to load sample loci." << "\n"
+ << " o: output path to write results." << "\n"
<< " g: base matching on genomic location, not sequence identity." << "\n"
- << " x: don't verify haplotype of matching locus." << "\n"
- << " v: print program version." << "\n"
- << " h: display this help messsage." << "\n\n"
+ << " x: don't verify haplotype of matching locus." << "\n"
+ << " v: print program version." << "\n"
+ << " h: display this help messsage." << "\n\n"
<< " Gapped assembly options:\n"
<< " --gapped: preform gapped alignments between stacks.\n";
diff --git a/src/stacks.cc b/src/stacks.cc
index 4294586..4dd507e 100644
--- a/src/stacks.cc
+++ b/src/stacks.cc
@@ -29,13 +29,13 @@
//
#include "stacks.h"
-Rem::Rem() {
+Rem::Rem() {
this->id = 0;
- this->seq = NULL;
+ this->seq = NULL;
this->utilized = false;
}
-Rem::Rem(int id, uint seq_id, DNANSeq *seq) {
+Rem::Rem(int id, uint seq_id, DNANSeq *seq) {
this->id = id;
this->utilized = false;
@@ -86,7 +86,7 @@ int PStack::add_seq(const char *seq) {
return 0;
}
-int PStack::add_seq(DNANSeq *seq) {
+int PStack::add_seq(const DNANSeq *seq) {
if (this->seq != NULL)
delete this->seq;
diff --git a/src/stacks.h b/src/stacks.h
index d9b0ba0..6a70d8e 100644
--- a/src/stacks.h
+++ b/src/stacks.h
@@ -59,7 +59,7 @@ public:
strand_type strand;
void set(const char *chr, uint bp, strand_type strand) {
- if (this->chr != NULL)
+ if (this->chr != NULL)
delete [] this->chr;
this->chr = new char[strlen(chr) + 1];
this->bp = bp;
@@ -71,6 +71,11 @@ public:
bp = 0;
strand = strand_plus;
}
+ PhyLoc(const PhyLoc& other)
+ : bp(other.bp), strand(other.strand) {
+ chr = new char[strlen(other.chr)+1];
+ strcpy(chr, other.chr);
+ }
PhyLoc(const char *chr, uint bp) {
this->chr = new char[strlen(chr) + 1];
this->bp = bp;
@@ -84,7 +89,45 @@ public:
strcpy(this->chr, chr);
}
~PhyLoc() {
- delete [] chr;
+ if (chr != NULL)
+ delete [] chr;
+ }
+
+ friend void swap(PhyLoc& p, PhyLoc& q) {
+ char* chr = p.chr;
+ p.chr = q.chr;
+ q.chr = chr;
+
+ const uint bp = p.bp;
+ p.bp = q.bp;
+ q.bp = bp;
+
+ const strand_type strand = p.strand;
+ p.strand = q.strand;
+ q.strand = strand;
+ }
+ PhyLoc& operator=(PhyLoc&& other) {swap(*this, other); return *this;}
+ PhyLoc& operator=(const PhyLoc& other) =delete;
+
+ bool operator==(const PhyLoc& other) const {
+ if (bp == other.bp
+ && strand == other.strand
+ && strcmp(chr, other.chr) == 0)
+ return true;
+ else
+ return false;
+ }
+
+ bool operator<(const PhyLoc& other) const {
+ const int chrcmp = strcmp(chr, other.chr);
+ if (chrcmp != 0)
+ // Alphanumeric.
+ return chrcmp < 0;
+ else if (bp != other.bp)
+ return bp < other.bp;
+ else
+ // Minus strand first.
+ return strand == strand_minus && other.strand == strand_plus;
}
};
@@ -127,13 +170,13 @@ public:
string cigar;
Aln() {
this->id = 0;
- this->pct_id = 0.0;
+ this->pct_id = 0.0;
this->gap_cnt = 0;
}
Aln(uint id, string cigar, double pct_id, uint gap_cnt) {
- this->id = id;
- this->cigar = cigar;
- this->pct_id = pct_id;
+ this->id = id;
+ this->cigar = cigar;
+ this->pct_id = pct_id;
this->gap_cnt = gap_cnt;
}
};
@@ -147,20 +190,20 @@ class PStack {
vector<char *> map; // List of sequence read IDs merged into this stack
PhyLoc loc; // Physical genome location of this stack.
- PStack() {
- id = 0;
- count = 0;
- seq = NULL;
- len = 0;
+ PStack() {
+ id = 0;
+ count = 0;
+ seq = NULL;
+ len = 0;
}
- ~PStack() {
- delete this->seq;
- for (unsigned int i = 0; i < this->map.size(); i++)
- delete [] this->map[i];
+ ~PStack() {
+ delete this->seq;
+ for (unsigned int i = 0; i < this->map.size(); i++)
+ delete [] this->map[i];
}
int add_id(const char *);
int add_seq(const char *);
- int add_seq(DNANSeq *);
+ int add_seq(const DNANSeq *);
};
class Stack {
@@ -173,7 +216,7 @@ class Stack {
id = 0;
seq = NULL;
}
- ~Stack() {
+ ~Stack() {
delete this->seq;
}
uint count() { return this->map.size(); }
@@ -191,7 +234,7 @@ class Rem {
Rem();
Rem(int, uint, DNANSeq *);
- ~Rem() {
+ ~Rem() {
delete this->seq;
}
uint count() { return this->map.size(); }
@@ -211,12 +254,12 @@ public:
char *haplotype;
char *cigar;
- CatMatch() {
- batch_id = 0;
- cat_id = 0;
- sample_id = 0;
- tag_id = 0;
- depth = 0;
+ CatMatch() {
+ batch_id = 0;
+ cat_id = 0;
+ sample_id = 0;
+ tag_id = 0;
+ depth = 0;
lnl = 0.0;
haplotype = NULL;
cigar = NULL;
@@ -233,14 +276,14 @@ public:
int tag_id;
char *model;
- ModRes(int samp_id, int tag_id, const char *model) {
- this->sample_id = samp_id;
+ ModRes(int samp_id, int tag_id, const char *model) {
+ this->sample_id = samp_id;
this->tag_id = tag_id;
this->model = new char[strlen(model) + 1];
strcpy(this->model, model);
}
- ~ModRes() {
- delete [] this->model;
+ ~ModRes() {
+ delete [] this->model;
}
};
@@ -250,11 +293,11 @@ public:
int tag_id;
vector<SNP *> snps;
- SNPRes(int samp_id, int tag_id) {
- this->sample_id = samp_id;
+ SNPRes(int samp_id, int tag_id) {
+ this->sample_id = samp_id;
this->tag_id = tag_id;
}
- ~SNPRes() {
+ ~SNPRes() {
for (uint i = 0; i < this->snps.size(); i++)
delete this->snps[i];
this->snps.clear();
diff --git a/src/ustacks.cc b/src/ustacks.cc
index f46d584..fb93412 100644
--- a/src/ustacks.cc
+++ b/src/ustacks.cc
@@ -78,12 +78,12 @@ int main (int argc, char* argv[]) {
<< " Max number of stacks allowed per de novo locus: " << max_subgraph << "\n"
<< " Deleveraging algorithm: " << (deleverage_stacks ? "enabled" : "disabled") << "\n"
<< " Removal algorithm: " << (remove_rep_stacks ? "enabled" : "disabled") << "\n"
- << " Model type: ";
+ << " Model type: ";
switch (model_type) {
case snp:
cerr << "SNP\n";
break;
- case fixed:
+ case fixed:
cerr << "Fixed\n";
break;
case bounded:
@@ -94,7 +94,7 @@ int main (int argc, char* argv[]) {
<< " Gapped alignments: " << (gapped_alignments ? "enabled" : "disabled") << "\n";
//
- // Set limits to call het or homozygote according to chi-square distribution with one
+ // Set limits to call het or homozygote according to chi-square distribution with one
// degree of freedom:
// http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value
//
@@ -119,22 +119,21 @@ int main (int argc, char* argv[]) {
omp_set_num_threads(num_threads);
#endif
- DNASeqHashMap radtags;
- vector<DNANSeq *> radtags_keys;
+ DNASeqHashMap* radtags = new DNASeqHashMap();
map<int, Rem *> remainders;
set<int> merge_map;
map<int, Stack *> unique;
- load_radtags(in_file, radtags, radtags_keys);
+ load_radtags(in_file, *radtags);
- reduce_radtags(radtags, unique, remainders);
+ reduce_radtags(*radtags, unique, remainders);
- free_radtags_hash(radtags, radtags_keys);
+ delete radtags;
// dump_unique_tags(unique);
double cov_mean, cov_stdev, cov_max;
-
+
calc_coverage_distribution(unique, cov_mean, cov_stdev, cov_max);
cerr << "Initial coverage mean: " << cov_mean << "; Std Dev: " << cov_stdev << "; Max: " << cov_max << "\n";
@@ -188,7 +187,7 @@ int main (int argc, char* argv[]) {
calc_coverage_distribution(unique, remainders, merged, cov_mean, cov_stdev, cov_max);
cerr << "After gapped alignments, coverage depth Mean: " << cov_mean << "; Std Dev: " << cov_stdev << "; Max: " << cov_max << "\n";
}
-
+
//
// Call the final consensus sequence and invoke the SNP model.
//
@@ -223,15 +222,15 @@ merge_gapped_alns(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Merg
continue;
tag_1 = it->second;
- sort(tag_1->alns.begin(), tag_1->alns.end(), rank_alignments);
+ sort(tag_1->alns.begin(), tag_1->alns.end(), rank_alignments);
//
// No gapped alignments, or no optimal alignment for this stack, or
- // this stack has already been set aside.
+ // this stack has already been set aside.
//
if (tag_1->masked || tag_1->alns.size() == 0)
continue;
- if (tag_1->alns.size() > 1 && tag_1->alns[0].pct_id == tag_1->alns[1].pct_id)
+ if (tag_1->alns.size() > 1 && tag_1->alns[0].pct_id == tag_1->alns[1].pct_id)
continue;
//
@@ -239,24 +238,24 @@ merge_gapped_alns(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Merg
// of tags are, reciprocal to each other..
//
tag_2 = merged[tag_1->alns[0].id];
- sort(tag_2->alns.begin(), tag_2->alns.end(), rank_alignments);
+ sort(tag_2->alns.begin(), tag_2->alns.end(), rank_alignments);
- if (tag_2->masked || tag_2->alns.size() == 0)
+ if (tag_2->masked || tag_2->alns.size() == 0)
continue;
- if (tag_2->alns.size() > 1 && tag_2->alns[0].pct_id == tag_2->alns[1].pct_id)
+ if (tag_2->alns.size() > 1 && tag_2->alns[0].pct_id == tag_2->alns[1].pct_id)
continue;
if (tag_1->id != (int)tag_2->alns[0].id)
continue;
- cigar_1 = invert_cigar(tag_1->alns[0].cigar);
+ cigar_1 = invert_cigar(tag_1->alns[0].cigar);
cigar_2 = tag_2->alns[0].cigar;
if (cigar_1 == cigar_2) {
parse_cigar(tag_1->alns[0].cigar.c_str(), cigar);
//
- // Check that the alignment still contains fewer than
+ // Check that the alignment still contains fewer than
// max_utag_dist mismatches.
//
if (dist(tag_1->con, tag_2->con, cigar) > max_utag_dist)
@@ -329,8 +328,8 @@ merge_gapped_alns(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Merg
merged = new_merged;
- cerr << " " << old_cnt << " stacks merged into " << new_cnt
- << " stacks; merged " << merge_cnt
+ cerr << " " << old_cnt << " stacks merged into " << new_cnt
+ << " stacks; merged " << merge_cnt
<< " gapped alignments.\n";
return 0;
@@ -377,7 +376,7 @@ edit_gapped_seqs(map<int, Stack *> &unique, map<int, Rem *> &rem, MergedStack *t
return 0;
}
-int
+int
edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
{
char *buf;
@@ -411,7 +410,7 @@ edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
// sequence down. Trim the final length to keep the read length consistent.
//
k = bp >= len ? len : bp;
-
+
strncpy(buf, seq + k, buf_size - 1);
buf[buf_size - 1] = '\0';
buf_len = strlen(buf);
@@ -461,7 +460,7 @@ search_for_gaps(map<int, MergedStack *> &merged, double min_match_len)
// our map to a vector of integer keys.
//
vector<int> keys;
- for (it = merged.begin(); it != merged.end(); it++)
+ for (it = merged.begin(); it != merged.end(); it++)
keys.push_back(it->first);
//
@@ -480,18 +479,18 @@ search_for_gaps(map<int, MergedStack *> &merged, double min_match_len)
cerr << " Searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); " << min_hits << " k-mer hits required.\n";
populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
-
+
#pragma omp parallel private(tag_1, tag_2)
{
KmerHashMap::iterator h;
vector<char *> query_kmers;
set<string> uniq_kmers;
- GappedAln *aln = new GappedAln(con_len);
- AlignRes a;
+ GappedAln *aln = new GappedAln(con_len);
+ AlignRes a;
initialize_kmers(kmer_len, num_kmers, query_kmers);
-
- #pragma omp for schedule(dynamic)
+
+ #pragma omp for schedule(dynamic)
for (uint i = 0; i < keys.size(); i++) {
tag_1 = merged[keys[i]];
@@ -511,7 +510,7 @@ search_for_gaps(map<int, MergedStack *> &merged, double min_match_len)
uniq_kmers.clear();
for (int j = 0; j < num_kmers; j++)
uniq_kmers.insert(query_kmers[j]);
-
+
map<int, int> hits;
//
// Lookup the occurances of each k-mer in the kmer_map
@@ -548,9 +547,9 @@ search_for_gaps(map<int, MergedStack *> &merged, double min_match_len)
continue;
if (aln->align(tag_1->con, tag_2->con)) {
- a = aln->result();
- tag_1->alns.push_back(Aln(tag_2->id, a.cigar, a.pct_id, a.gap_cnt));
- }
+ a = aln->result();
+ tag_1->alns.push_back(Aln(tag_2->id, a.cigar, a.pct_id, a.gap_cnt));
+ }
}
}
@@ -561,7 +560,7 @@ search_for_gaps(map<int, MergedStack *> &merged, double min_match_len)
delete [] query_kmers[j];
query_kmers.clear();
- delete aln;
+ delete aln;
}
free_kmer_hash(kmer_map, kmer_map_keys);
@@ -604,9 +603,9 @@ merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem)
int min_hits = calc_min_kmer_matches(kmer_len, max_rem_dist, con_len, set_kmer_len ? true : false);
cerr << " Distance allowed between stacks: " << max_rem_dist
- << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
- << min_hits << " k-mer hits required.\n";
-
+ << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
+ << min_hits << " k-mer hits required.\n";
+
KmerHashMap kmer_map;
vector<char *> kmer_map_keys;
populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
@@ -616,9 +615,9 @@ merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem)
{
KmerHashMap::iterator h;
vector<char *> rem_kmers;
- char *buf = new char[con_len + 1];
+ char *buf = new char[con_len + 1];
- #pragma omp for schedule(dynamic)
+ #pragma omp for schedule(dynamic)
for (uint j = 0; j < keys.size(); j++) {
it = rem.find(keys[j]);
Rem *r = it->second;
@@ -695,7 +694,7 @@ merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem)
for (uint k = 0; k < rem_kmers.size(); k++)
delete [] rem_kmers[k];
- delete [] buf;
+ delete [] buf;
}
free_kmer_hash(kmer_map, kmer_map_keys);
@@ -738,7 +737,7 @@ call_alleles(MergedStack *mtag, vector<DNANSeq *> &reads, vector<read_type> &rea
// Check to make sure the nucleotide at the location of this SNP is
// of one of the two possible states the multinomial model called.
//
- if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
+ if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
allele += base;
else
break;
@@ -760,24 +759,24 @@ call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<i
//
map<int, MergedStack *>::iterator it;
vector<int> keys;
- for (it = merged.begin(); it != merged.end(); it++)
+ for (it = merged.begin(); it != merged.end(); it++)
keys.push_back(it->first);
int i;
#pragma omp parallel private(i)
- {
+ {
MergedStack *mtag;
Stack *utag;
Rem *r;
- #pragma omp for schedule(dynamic)
+ #pragma omp for schedule(dynamic)
for (i = 0; i < (int) keys.size(); i++) {
mtag = merged[keys[i]];
-
+
//
// Create a two-dimensional array, each row containing one read. For
// each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
+ // that tag into our array as many times as it originally occurred.
//
vector<int>::iterator j;
vector<DNANSeq *> reads;
@@ -792,7 +791,7 @@ call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<i
}
}
- // For each remainder tag that has been merged into this Stack, add the sequence.
+ // For each remainder tag that has been merged into this Stack, add the sequence.
for (j = mtag->remtags.begin(); j != mtag->remtags.end(); j++) {
r = rem[*j];
@@ -825,8 +824,8 @@ call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<i
SNP *snp = new SNP;
snp->type = snp_type_unk;
snp->col = col;
- snp->rank_1 = '-';
- snp->rank_2 = '-';
+ snp->rank_1 = '-';
+ snp->rank_2 = '-';
mtag->snps.push_back(snp);
col++;
} while (col < mtag->gaps[cur_gap].end && col < length);
@@ -835,7 +834,7 @@ call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<i
continue;
}
- nuc['A'] = 0;
+ nuc['A'] = 0;
nuc['G'] = 0;
nuc['C'] = 0;
nuc['T'] = 0;
@@ -861,7 +860,7 @@ call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<i
//
// Search this column for the presence of a SNP
//
- if (invoke_model)
+ if (invoke_model)
switch(model_type) {
case snp:
call_multinomial_snp(mtag, col, nuc, true);
@@ -1106,8 +1105,8 @@ merge_stacks(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, MergedSta
merged = new_merged;
- cerr << " " << old_cnt << " stacks merged into " << new_cnt
- << " stacks; deleveraged " << delev_cnt
+ cerr << " " << old_cnt << " stacks merged into " << new_cnt
+ << " stacks; deleveraged " << delev_cnt
<< " stacks; removed " << blist_cnt << " stacks.\n";
return 0;
@@ -1206,9 +1205,9 @@ int
remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *> &merged)
{
//
- // If enabled, check the depth of coverage of each unique tag, and remove
+ // If enabled, check the depth of coverage of each unique tag, and remove
// from consideration any tags with depths greater than removal_trigger. These tags
- // are likely to be multiple repetitive sites that have been merged together.
+ // are likely to be multiple repetitive sites that have been merged together.
// Because large stacks of unique tags are likely to also generate many one-off
// sequencing error reads, remove all seqeunces that are a distance of one from
// the RAD-Tag with high depth of coverage.
@@ -1275,7 +1274,7 @@ remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *> &mer
}
}
}
-
+
//
// Merge these tags together into a new MergedStack object.
//
@@ -1325,11 +1324,11 @@ remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *> &mer
return 0;
}
-int deleverage(map<int, Stack *> &unique,
+int deleverage(map<int, Stack *> &unique,
map<int, Rem *> &rem,
- map<int, MergedStack *> &merged,
- set<int> &merge_list,
- int cohort_id,
+ map<int, MergedStack *> &merged,
+ set<int> &merge_list,
+ int cohort_id,
vector<MergedStack *> &deleveraged_tags) {
set<int>::iterator it;
vector<pair<int, int> >::iterator j;
@@ -1420,7 +1419,7 @@ int deleverage(map<int, Stack *> &unique,
}
//
- // This set is sorted by definition. Check if there is more than a single
+ // This set is sorted by definition. Check if there is more than a single
// distance separating stacks.
//
if (dists.size() == 1) {
@@ -1504,7 +1503,7 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
vector<int> keys;
- for (it = merged.begin(); it != merged.end(); it++)
+ for (it = merged.begin(); it != merged.end(); it++)
keys.push_back(it->first);
//
@@ -1522,19 +1521,19 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
int min_hits = calc_min_kmer_matches(kmer_len, utag_dist, con_len, set_kmer_len ? true : false);
cerr << " Distance allowed between stacks: " << utag_dist
- << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
- << min_hits << " k-mer hits required.\n";
+ << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
+ << min_hits << " k-mer hits required.\n";
populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
-
+
#pragma omp parallel private(tag_1, tag_2)
- {
- KmerHashMap::iterator h;
- vector<char *> query_kmers;
-
- initialize_kmers(kmer_len, num_kmers, query_kmers);
+ {
+ KmerHashMap::iterator h;
+ vector<char *> query_kmers;
- #pragma omp for schedule(dynamic)
+ initialize_kmers(kmer_len, num_kmers, query_kmers);
+
+ #pragma omp for schedule(dynamic)
for (uint i = 0; i < keys.size(); i++) {
tag_1 = merged[keys[i]];
@@ -1588,11 +1587,11 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
}
- //
- // Free the k-mers we generated for this query
- //
- for (uint j = 0; j < query_kmers.size(); j++)
- delete [] query_kmers[j];
+ //
+ // Free the k-mers we generated for this query
+ //
+ for (uint j = 0; j < query_kmers.size(); j++)
+ delete [] query_kmers[j];
}
free_kmer_hash(kmer_map, kmer_map_keys);
@@ -1612,12 +1611,12 @@ int calc_distance(map<int, MergedStack *> &merged, int utag_dist) {
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
vector<int> keys;
- for (it = merged.begin(); it != merged.end(); it++)
+ for (it = merged.begin(); it != merged.end(); it++)
keys.push_back(it->first);
#pragma omp parallel private(i, j, tag_1, tag_2)
- {
- #pragma omp for schedule(dynamic)
+ {
+ #pragma omp for schedule(dynamic)
for (i = 0; i < (int) keys.size(); i++) {
tag_1 = merged[keys[i]];
@@ -1665,7 +1664,7 @@ int reduce_radtags(DNASeqHashMap &radtags, map<int, Stack *> &unique, map<int, R
Rem *r;
Stack *u;
int global_id = 1;
-
+
for (it = radtags.begin(); it != radtags.end(); it++) {
if (it->second.count() < min_merge_cov) {
//
@@ -1675,7 +1674,7 @@ int reduce_radtags(DNASeqHashMap &radtags, map<int, Stack *> &unique, map<int, R
//
r = new Rem;
r->id = global_id;
- r->add_seq(it->first);
+ r->add_seq(&it->first);
for (uint i = 0; i < it->second.ids.size(); i++)
r->add_id(it->second.ids[i]);
@@ -1691,7 +1690,7 @@ int reduce_radtags(DNASeqHashMap &radtags, map<int, Stack *> &unique, map<int, R
//
u = new Stack;
u->id = global_id;
- u->add_seq(it->first);
+ u->add_seq(&it->first);
// Copy the original Fastq IDs from which this unique radtag was built.
for (uint i = 0; i < it->second.ids.size(); i++)
@@ -1711,17 +1710,6 @@ int reduce_radtags(DNASeqHashMap &radtags, map<int, Stack *> &unique, map<int, R
}
int
-free_radtags_hash(DNASeqHashMap &radtags, vector<DNANSeq *> &radtags_keys)
-{
- for (uint i = 0; i < radtags_keys.size(); i++)
- delete radtags_keys[i];
-
- radtags.clear();
-
- return 0;
-}
-
-int
calc_coverage_distribution(map<int, Stack *> &unique,
double &mean, double &stdev, double &max)
{
@@ -1820,7 +1808,7 @@ calc_coverage_distribution(map<int, Stack *> &unique,
int
calc_coverage_distribution(map<int, Stack *> &unique,
- map<int, Rem *> &rem,
+ map<int, Rem *> &rem,
map<int, MergedStack *> &merged,
double &mean, double &stdev, double &max)
{
@@ -1831,7 +1819,7 @@ calc_coverage_distribution(map<int, Stack *> &unique,
double s = 0.0;
double sum = 0.0;
double cnt = 0.0;
-
+
mean = 0.0;
max = 0.0;
stdev = 0.0;
@@ -1911,8 +1899,8 @@ int count_raw_reads(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Me
return 0;
}
-int
-write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *> &r)
+int
+write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *> &r)
{
map<int, MergedStack *>::iterator i;
vector<int>::iterator k;
@@ -1946,7 +1934,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
string snp_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".snps.tsv";
string all_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".alleles.tsv";
string mod_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".models.tsv";
-
+
if (gzip) {
tag_file += ".gz";
snp_file += ".gz";
@@ -1970,7 +1958,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
#endif
gz_mods = gzopen(mod_file.c_str(), "wb");
if (!gz_mods) {
- cerr << "Error: Unable to open gzipped tag file '" << tag_file << "': " << strerror(errno) << ".\n";
+ cerr << "Error: Unable to open gzipped model file '" << mod_file << "': " << strerror(errno) << ".\n";
exit(1);
}
#if ZLIB_VERNUM >= 0x1240
@@ -2000,7 +1988,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
}
mods.open(mod_file.c_str());
if (mods.fail()) {
- cerr << "Error: Unable to open tag file for writing.\n";
+ cerr << "Error: Unable to open model file for writing.\n";
exit(1);
}
snps.open(snp_file.c_str());
@@ -2027,7 +2015,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
time(&rawtime);
timeinfo = localtime(&rawtime);
strftime(date, 32, "%F %T", timeinfo);
- log << "# ustacks version " << VERSION << "; generated on " << date << "\n";
+ log << "# ustacks version " << VERSION << "; generated on " << date << "\n";
if (gzip) {
gzputs(gz_tags, log.str().c_str());
gzputs(gz_mods, log.str().c_str());
@@ -2055,17 +2043,17 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
tag_1->calc_likelihood();
// First write the consensus sequence
- sstr << "0" << "\t"
- << sql_id << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
<< tag_1->id << "\t"
//<< tag_1->cohort_id << "\t"
<< "" << "\t" // chr
<< 0 << "\t" // bp
<< "+" << "\t" // strand
- << "consensus\t" << "\t"
- << "\t"
- << tag_1->con << "\t"
- << tag_1->deleveraged << "\t"
+ << "consensus\t" << "\t"
+ << "\t"
+ << tag_1->con << "\t"
+ << tag_1->deleveraged << "\t"
<< tag_1->blacklisted << "\t"
<< tag_1->lumberjackstack << "\t"
<< tag_1->lnl << "\n";
@@ -2073,9 +2061,9 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
//
// Write a sequence recording the output of the SNP model for each nucleotide.
//
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
//<< "\t" // cohort_id
<< "\t" // chr
<< "\t" // bp
@@ -2095,7 +2083,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
break;
}
}
- sstr << "\t"
+ sstr << "\t"
<< "\t"
<< "\t"
<< "\t"
@@ -2121,10 +2109,10 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
<< "\t" // chr
<< "\t" // bp
<< "\t" // strand
- << "primary\t"
- << id << "\t"
- << seq_ids[tag_2->map[j]] << "\t"
- << tag_2->seq->seq(buf)
+ << "primary\t"
+ << id << "\t"
+ << seq_ids[tag_2->map[j]] << "\t"
+ << tag_2->seq->seq(buf)
<< "\t\t\t\t\n";
if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
@@ -2142,17 +2130,17 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
total += rem->map.size();
for (uint j = 0; j < rem->map.size(); j++)
- sstr << "0" << "\t"
- << sql_id << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
<< tag_1->id << "\t"
//<< "\t" // cohort_id
<< "\t" // chr
<< "\t" // bp
<< "\t" // strand
<< "secondary\t"
- << "\t"
- << seq_ids[rem->map[j]] << "\t"
- << rem->seq->seq(buf)
+ << "\t"
+ << seq_ids[rem->map[j]] << "\t"
+ << rem->seq->seq(buf)
<< "\t\t\t\t\n";
if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
@@ -2163,9 +2151,9 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
// Write out the model calls for each nucleotide in this locus.
//
for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
<< (*s)->col << "\t";
switch((*s)->type) {
@@ -2181,8 +2169,8 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
}
sstr << std::fixed << std::setprecision(2)
- << (*s)->lratio << "\t"
- << (*s)->rank_1 << "\t"
+ << (*s)->lratio << "\t"
+ << (*s)->rank_1 << "\t"
<< (*s)->rank_2 << "\t\t\n";
}
@@ -2194,11 +2182,11 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
// the percentage of tags a particular allele occupies.
//
for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- << (*t).first << "\t"
- << (((*t).second/total) * 100) << "\t"
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ << (*t).first << "\t"
+ << (((*t).second/total) * 100) << "\t"
<< (*t).second << "\n";
}
if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
@@ -2267,11 +2255,11 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
return 0;
}
-int dump_stack_graph(string data_file,
- map<int, Stack *> &unique,
- map<int, MergedStack *> &merged,
- vector<int> &keys,
- map<int, map<int, double> > &dist_map,
+int dump_stack_graph(string data_file,
+ map<int, Stack *> &unique,
+ map<int, MergedStack *> &merged,
+ vector<int> &keys,
+ map<int, map<int, double> > &dist_map,
map<int, set<int> > &cluster_map) {
uint s, t;
double d, scale, scaled_d;
@@ -2301,9 +2289,9 @@ int dump_stack_graph(string data_file,
<< "edge [fontsize=8.0 fontname=\"Arial\" color=\"#aaaaaa\"];\n";
colors.push_back("red");
- colors.push_back("blue");
- colors.push_back("green");
- colors.push_back("brown");
+ colors.push_back("blue");
+ colors.push_back("green");
+ colors.push_back("brown");
colors.push_back("purple");
map<int, set<int> >::iterator c;
@@ -2381,7 +2369,7 @@ int dump_unique_tags(map<int, Stack *> &u) {
cerr << "UniqueTag UID: " << (*it).second->id << "\n"
<< " Seq: " << c << "\n"
- << " IDs: ";
+ << " IDs: ";
for (uint j = 0; j < it->second->map.size(); j++)
cerr << it->second->map[j] << " ";
@@ -2405,9 +2393,9 @@ int dump_merged_tags(map<int, MergedStack *> &m) {
<< " Consensus: ";
if (it->second->con != NULL)
cerr << it->second->con << "\n";
- else
+ else
cerr << "\n";
- cerr << " IDs: ";
+ cerr << " IDs: ";
for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
cerr << (*fit) << " ";
@@ -2424,9 +2412,8 @@ int dump_merged_tags(map<int, MergedStack *> &m) {
return 0;
}
-int load_radtags(string in_file, DNASeqHashMap &radtags, vector<DNANSeq *> &radtags_keys) {
+int load_radtags(string in_file, DNASeqHashMap &radtags) {
Input *fh = NULL;
- DNANSeq *d;
if (in_file_type == FileT::fasta)
fh = new Fasta(in_file.c_str());
@@ -2449,8 +2436,10 @@ int load_radtags(string in_file, DNASeqHashMap &radtags, vector<DNANSeq *> &radt
c.seq = new char[max_len];
c.qual = new char[max_len];
+ cerr << "Loading RAD-Tags...";
while ((fh->next_seq(c)) != 0) {
- if (i % 10000 == 0) cerr << " Loading RAD-Tag " << i << " \r";
+ if (i % 1000000 == 0 && i>0)
+ cerr << i/1000000 << "M...";
prev_seql = seql;
seql = 0;
@@ -2464,32 +2453,26 @@ int load_radtags(string in_file, DNASeqHashMap &radtags, vector<DNANSeq *> &radt
corrected++;
}
- if (seql != prev_seql && prev_seql > 0) len_mismatch = true;
-
- d = new DNANSeq(seql, c.seq);
+ if (seql != prev_seql && prev_seql > 0)
+ len_mismatch = true;
- pair<DNASeqHashMap::iterator, bool> r;
-
- r = radtags.insert(make_pair(d, HVal()));
- (*r.first).second.add_id(i);
- radtags_keys.push_back(d);
+ DNASeqHashMap::iterator element = radtags.insert({DNANSeq(seql, c.seq), HVal()}).first;
+ element->second.add_id(i);
i++;
}
- cerr << "Loaded " << i << " RAD-Tags; inserted " << radtags.size() << " elements into the RAD-Tags hash map.\n";
+ cerr << "done\n";
if (i == 0) {
cerr << "Error: Unable to load data from '" << in_file.c_str() << "'.\n";
exit(1);
}
-
- cerr << " " << corrected << " reads contained uncalled nucleotides that were modified.\n";
-
if (len_mismatch)
- cerr << " Warning: different sequence lengths detected, this will interfere with Stacks algorithms.\n";
+ cerr << "Warning: different sequence lengths detected, this will interfere with Stacks algorithms.\n";
+
+ cerr << "Loaded " << i << " RAD-Tags.\n"
+ " Inserted " << radtags.size() << " elements into the RAD-Tags hash map.\n"
+ " " << corrected << " reads contained uncalled nucleotides that were modified.\n";
- //
- // Close the file and delete the Input object.
- //
delete fh;
return 0;
@@ -2509,7 +2492,7 @@ load_seq_ids(vector<char *> &seq_ids)
else if (in_file_type == FileT::gzfastq)
fh = new GzFastq(in_file.c_str());
- cerr << " Refetching sequencing IDs from " << in_file.c_str() << "... ";
+ cerr << " Refetching sequencing IDs from " << in_file.c_str() << "... ";
char *id;
Seq c;
@@ -2544,7 +2527,7 @@ calc_triggers(double cov_mean,
// //
// // Calculate the deleverage trigger. Assume RAD-Tags are selected from
// // the sample for sequencing randomly, forming a poisson distribution
-// // representing the depths of coverage of RAD-Tags in the sample. Calculate
+// // representing the depths of coverage of RAD-Tags in the sample. Calculate
// // the trigger value that is larger than the depth of coverage of 99.9999% of stacks.
// //
// long double lambda = cov_mean;
@@ -2586,7 +2569,7 @@ long double factorial(int i) {
int parse_command_line(int argc, char* argv[]) {
int c;
-
+
while (1) {
static struct option long_options[] = {
{"help", no_argument, NULL, 'h'},
@@ -2619,13 +2602,13 @@ int parse_command_line(int argc, char* argv[]) {
// getopt_long stores the option index here.
int option_index = 0;
-
+
c = getopt_long(argc, argv, "GhHvdrgRA:L:U:f:o:i:m:e:p:t:M:N:K:k:T:X:x:", long_options, &option_index);
-
+
// Detect the end of the options.
if (c == -1)
break;
-
+
switch (c) {
case 'h':
help();
@@ -2730,7 +2713,7 @@ int parse_command_line(int argc, char* argv[]) {
// getopt_long already printed an error message.
help();
break;
-
+
default:
cerr << "Unknown command line option '" << (char) c << "'\n";
help();
@@ -2767,10 +2750,10 @@ int parse_command_line(int argc, char* argv[]) {
help();
}
- if (out_path.length() == 0)
+ if (out_path.length() == 0)
out_path = ".";
- if (out_path.at(out_path.length() - 1) != '/')
+ if (out_path.at(out_path.length() - 1) != '/')
out_path += "/";
if (model_type == fixed && barcode_err_freq == 0) {
@@ -2810,7 +2793,7 @@ void help() {
<< " --gapped: preform gapped alignments between stacks.\n"
<< " --max_gaps: number of gaps allowed between stacks before merging (default: 2).\n"
<< " --min_aln_len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n\n"
- << " Model options:\n"
+ << " Model options:\n"
<< " --model_type: either 'snp' (default), 'bounded', or 'fixed'\n"
<< " For the SNP or Bounded SNP model:\n"
<< " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
diff --git a/src/ustacks.h b/src/ustacks.h
index f5ed095..8d8d003 100644
--- a/src/ustacks.h
+++ b/src/ustacks.h
@@ -21,7 +21,7 @@
#ifndef __USTACKS_H__
#define __USTACKS_H__
-#include "constants.h"
+#include "constants.h"
#ifdef _OPENMP
#include <omp.h> // OpenMP library
@@ -67,6 +67,7 @@ using std::set;
using google::sparse_hash_map;
#endif
+#include "config.h"
#include "kmers.h"
#include "utils.h"
#include "DNASeq.h" // Class for storing two-bit compressed DNA sequences
@@ -97,15 +98,15 @@ class HVal {
const int barcode_size = 5;
#ifdef HAVE_SPARSEHASH
-typedef sparse_hash_map<DNANSeq *, HVal, hash_dnanseq, dnanseq_eqstr> DNASeqHashMap;
+typedef sparse_hash_map<DNANSeq, HVal> DNASeqHashMap;
#else
-typedef unordered_map<DNANSeq *, HVal, hash_dnanseq, dnanseq_eqstr> DNASeqHashMap;
+typedef unordered_map<DNANSeq, HVal> DNASeqHashMap;
#endif
void help( void );
void version( void );
int parse_command_line(int, char**);
-int load_radtags(string, DNASeqHashMap &, vector<DNANSeq *> &);
+int load_radtags(string, DNASeqHashMap &);
int load_seq_ids(vector<char *> &);
int reduce_radtags(DNASeqHashMap &, map<int, Stack *> &, map<int, Rem *> &);
int free_radtags_hash(DNASeqHashMap &, vector<DNANSeq *> &);
diff --git a/src/utils.cc b/src/utils.cc
index 8a83e74..975ab41 100644
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -31,42 +31,42 @@ char reverse(char c) {
switch (c) {
case 'A':
case 'a':
- return 'T';
- break;
+ return 'T';
+ break;
case 'C':
case 'c':
- return 'G';
- break;
+ return 'G';
+ break;
case 'G':
case 'g':
- return 'C';
- break;
+ return 'C';
+ break;
case 'T':
case 't':
- return 'A';
- break;
+ return 'A';
+ break;
case 'N':
case 'n':
case '.':
- return 'N';
- break;
+ return 'N';
+ break;
case '-':
default:
- return '-';
- break;
+ return '-';
+ break;
}
return 'N';
}
char *
-rev_comp(const char *seq)
+rev_comp(const char *seq)
{
int len = strlen(seq);
int j = 0;
- char *com = new char[len + 1];
+ char *com = new char[len + 1];
const char *p;
-
+
for (p = seq + len - 1; p >= seq; p--) {
switch (*p) {
case 'A':
@@ -85,11 +85,11 @@ rev_comp(const char *seq)
case 't':
com[j] = 'A';
break;
- case 'N':
- case 'n':
- case '.':
- com[j] = 'N';
- break;
+ case 'N':
+ case 'n':
+ case '.':
+ com[j] = 'N';
+ break;
}
j++;
}
@@ -99,7 +99,7 @@ rev_comp(const char *seq)
}
void
-reverse_string(char *seq)
+reverse_string(char *seq)
{
int len = strlen(seq);
char *p = seq;
@@ -107,18 +107,18 @@ reverse_string(char *seq)
char tmp;
while (q > p) {
- tmp = *q;
- *q = *p;
- *p = tmp;
- q--;
- p++;
+ tmp = *q;
+ *q = *p;
+ *p = tmp;
+ q--;
+ p++;
}
return;
}
-int
-is_integer(const char *str)
+int
+is_integer(const char *str)
{
//
// Adapted from the strtol manpage.
@@ -133,18 +133,18 @@ is_integer(const char *str)
// Check for various possible errors
//
if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN))
- || (errno != 0 && val == 0)) {
- return -1;
+ || (errno != 0 && val == 0)) {
+ return -1;
}
if (endptr == str || *endptr != '\0')
- return -1;
+ return -1;
return (int) val;
}
-double
-is_double(const char *str)
+double
+is_double(const char *str)
{
//
// Adapted from the strtol manpage.
@@ -159,18 +159,18 @@ is_double(const char *str)
// Check for various possible errors
//
if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN))
- || (errno != 0 && val == 0)) {
- return -1;
+ || (errno != 0 && val == 0)) {
+ return -1;
}
if (endptr == str || *endptr != '\0')
- return -1;
+ return -1;
return val;
}
-double
-factorial(double n)
+double
+factorial(double n)
{
double fact = 1;
@@ -180,14 +180,14 @@ factorial(double n)
return fact;
}
-double
-reduced_factorial(double n, double d)
+double
+reduced_factorial(double n, double d)
{
double f = n - d;
- if (f < 0)
+ if (f < 0)
return 0;
- else if (f == 0)
+ else if (f == 0)
return 1;
else if (f == 1)
return n;
@@ -202,8 +202,8 @@ reduced_factorial(double n, double d)
return f;
}
-double
-log_factorial(double n)
+double
+log_factorial(double n)
{
double fact = 0;
@@ -213,14 +213,14 @@ log_factorial(double n)
return fact;
}
-double
-reduced_log_factorial(double n, double d)
+double
+reduced_log_factorial(double n, double d)
{
double f = n - d;
- if (f < 0)
+ if (f < 0)
return 0;
- else if (f == 0)
+ else if (f == 0)
return 0;
else if (f == 1)
return log(n);
diff --git a/src/utils.h b/src/utils.h
index a5c26e2..7bcda89 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -62,13 +62,13 @@ bool compare_pair_haplotype_rev(pair<string, double>, pair<string, double>);
//
struct int_increasing {
bool operator() (const int& lhs, const int& rhs) const {
- return lhs < rhs;
+ return lhs < rhs;
}
};
struct int_decreasing {
bool operator() (const int& lhs, const int& rhs) const {
- return lhs > rhs;
+ return lhs > rhs;
}
};
diff --git a/src/write.cc b/src/write.cc
index 2b9df77..703be81 100644
--- a/src/write.cc
+++ b/src/write.cc
@@ -28,7 +28,7 @@
#include "write.h"
-int
+int
write_fasta(ofstream *fh, Read *href, bool overhang) {
char tile[id_len];
sprintf(tile, "%04d", href->tile);
@@ -37,25 +37,25 @@ write_fasta(ofstream *fh, Read *href, bool overhang) {
offset += overhang ? 1 : 0;
if (href->fastq_type != generic_fastq)
- *fh <<
- ">" << href->lane <<
- "_" << tile <<
- "_" << href->x <<
- "_" << href->y <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n";
- else
- *fh <<
- ">" << href->machine <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n";
+ *fh <<
+ ">" << href->lane <<
+ "_" << tile <<
+ "_" << href->x <<
+ "_" << href->y <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n";
+ else
+ *fh <<
+ ">" << href->machine <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n";
if (fh->fail()) return -1;
return 1;
}
-int
+int
write_fasta(gzFile *fh, Read *href, bool overhang) {
stringstream sstr;
char tile[id_len];
@@ -65,54 +65,54 @@ write_fasta(gzFile *fh, Read *href, bool overhang) {
offset += overhang ? 1 : 0;
if (href->fastq_type != generic_fastq)
- sstr <<
- ">" << href->lane <<
- "_" << tile <<
- "_" << href->x <<
- "_" << href->y <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n";
- else
- sstr <<
- ">" << href->machine <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n";
+ sstr <<
+ ">" << href->lane <<
+ "_" << tile <<
+ "_" << href->x <<
+ "_" << href->y <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n";
+ else
+ sstr <<
+ ">" << href->machine <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fasta(ofstream *fh, Seq *href) {
*fh <<
- ">" <<
- href->id << "\n" <<
- href->seq << "\n";
+ ">" <<
+ href->id << "\n" <<
+ href->seq << "\n";
if (fh->fail()) return -1;
return 1;
}
-int
+int
write_fasta(gzFile *fh, Seq *href) {
stringstream sstr;
sstr <<
- ">" <<
- href->id << "\n" <<
- href->seq << "\n";
+ ">" <<
+ href->id << "\n" <<
+ href->seq << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fastq(ofstream *fh, Read *href, bool overhang) {
//
- // Write the sequence and quality scores in FASTQ format.
+ // Write the sequence and quality scores in FASTQ format.
//
char tile[id_len];
sprintf(tile, "%04d", href->tile);
@@ -121,32 +121,32 @@ write_fastq(ofstream *fh, Read *href, bool overhang) {
offset += overhang ? 1 : 0;
if (href->fastq_type != generic_fastq)
- *fh <<
- "@" << href->lane <<
- "_" << tile <<
- "_" << href->x <<
- "_" << href->y <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n" <<
- "+\n" <<
- href->phred + offset << "\n";
+ *fh <<
+ "@" << href->lane <<
+ "_" << tile <<
+ "_" << href->x <<
+ "_" << href->y <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n" <<
+ "+\n" <<
+ href->phred + offset << "\n";
else
- *fh <<
- "@" << href->machine <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n" <<
- "+\n" <<
- href->phred + offset << "\n";
+ *fh <<
+ "@" << href->machine <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n" <<
+ "+\n" <<
+ href->phred + offset << "\n";
if (fh->fail()) return -1;
-
+
return 1;
}
-int
+int
write_fastq(gzFile *fh, Read *href, bool overhang) {
//
- // Write the sequence and quality scores in FASTQ format.
+ // Write the sequence and quality scores in FASTQ format.
//
stringstream sstr;
char tile[id_len];
@@ -156,201 +156,201 @@ write_fastq(gzFile *fh, Read *href, bool overhang) {
offset += overhang ? 1 : 0;
if (href->fastq_type != generic_fastq)
- sstr <<
- "@" << href->lane <<
- "_" << tile <<
- "_" << href->x <<
- "_" << href->y <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n" <<
- "+\n" <<
- href->phred + offset << "\n";
+ sstr <<
+ "@" << href->lane <<
+ "_" << tile <<
+ "_" << href->x <<
+ "_" << href->y <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n" <<
+ "+\n" <<
+ href->phred + offset << "\n";
else
- sstr <<
- "@" << href->machine <<
- "/" << href->read << "\n" <<
- href->seq + offset << "\n" <<
- "+\n" <<
- href->phred + offset << "\n";
+ sstr <<
+ "@" << href->machine <<
+ "/" << href->read << "\n" <<
+ href->seq + offset << "\n" <<
+ "+\n" <<
+ href->phred + offset << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fastq(ofstream *fh, Seq *href, int offset) {
*fh <<
- "@" << href->id << "\n" <<
- href->seq + offset << "\n" <<
- "+\n" <<
- href->qual + offset << "\n";
+ "@" << href->id << "\n" <<
+ href->seq + offset << "\n" <<
+ "+\n" <<
+ href->qual + offset << "\n";
if (fh->fail()) return -1;
return 1;
}
-int
+int
write_fastq(gzFile *fh, Seq *href, int offset) {
stringstream sstr;
sstr <<
- "@" << href->id << "\n" <<
- href->seq + offset << "\n" <<
- "+\n" <<
- href->qual + offset << "\n";
+ "@" << href->id << "\n" <<
+ href->seq + offset << "\n" <<
+ "+\n" <<
+ href->qual + offset << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fasta(ofstream *fh, Seq *href, int offset) {
*fh <<
- ">" <<
- href->id << "\n" <<
- href->seq + offset << "\n";
+ ">" <<
+ href->id << "\n" <<
+ href->seq + offset << "\n";
if (fh->fail()) return -1;
-
+
return 1;
}
-int
+int
write_fasta(gzFile *fh, Seq *href, int offset) {
stringstream sstr;
sstr <<
- ">" <<
- href->id << "\n" <<
- href->seq + offset << "\n";
+ ">" <<
+ href->id << "\n" <<
+ href->seq + offset << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fastq(ofstream *fh, Seq *href) {
*fh <<
- "@" << href->id << "\n" <<
- href->seq << "\n" <<
- "+\n" <<
- href->qual << "\n";
+ "@" << href->id << "\n" <<
+ href->seq << "\n" <<
+ "+\n" <<
+ href->qual << "\n";
if (fh->fail()) return -1;
return 1;
}
-int
+int
write_fastq(gzFile *fh, Seq *href) {
stringstream sstr;
sstr <<
- "@" << href->id << "\n" <<
- href->seq << "\n" <<
- "+\n" <<
- href->qual << "\n";
+ "@" << href->id << "\n" <<
+ href->seq << "\n" <<
+ "+\n" <<
+ href->qual << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fastq(ofstream *fh, Seq *href, string msg) {
*fh <<
- "@" << href->id << "|" << msg << "\n" <<
- href->seq << "\n" <<
- "+\n" <<
- href->qual << "\n";
+ "@" << href->id << "|" << msg << "\n" <<
+ href->seq << "\n" <<
+ "+\n" <<
+ href->qual << "\n";
if (fh->fail()) return -1;
-
+
return 1;
}
-int
+int
write_fastq(gzFile *fh, Seq *href, string msg) {
stringstream sstr;
sstr <<
- "@" << href->id << "|" << msg << "\n" <<
- href->seq << "\n" <<
- "+\n" <<
- href->qual << "\n";
+ "@" << href->id << "|" << msg << "\n" <<
+ href->seq << "\n" <<
+ "+\n" <<
+ href->qual << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fasta(ofstream *fh, Seq *href, string msg) {
*fh <<
- ">" <<
- href->id << "|" << msg << "\n" <<
- href->seq << "\n";
+ ">" <<
+ href->id << "|" << msg << "\n" <<
+ href->seq << "\n";
if (fh->fail()) return -1;
-
+
return 1;
}
-int
+int
write_fasta(gzFile *fh, Seq *href, string msg) {
stringstream sstr;
sstr <<
- ">" <<
- href->id << "|" << msg << "\n" <<
- href->seq << "\n";
+ ">" <<
+ href->id << "|" << msg << "\n" <<
+ href->seq << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fasta(ofstream *fh, Seq *href, Read *r) {
- *fh << ">"
- << href->id << "\n"
- << r->seq + r->inline_bc_len << "\n";
+ *fh << ">"
+ << href->id << "\n"
+ << r->seq + r->inline_bc_len << "\n";
if (fh->fail()) return -1;
-
+
return 1;
}
-int
+int
write_fasta(gzFile *fh, Seq *href, Read *r) {
stringstream sstr;
sstr << ">"
- << href->id << "\n"
- << r->seq + r->inline_bc_len << "\n";
+ << href->id << "\n"
+ << r->seq + r->inline_bc_len << "\n";
int res = gzputs(*fh, sstr.str().c_str());
return res;
}
-int
+int
write_fastq(ofstream *fh, Seq *href, Read *r) {
*fh << "@" << href->id << "\n"
- << r->seq + r->inline_bc_len << "\n"
- << "+\n"
- << r->phred + r->inline_bc_len << "\n";
+ << r->seq + r->inline_bc_len << "\n"
+ << "+\n"
+ << r->phred + r->inline_bc_len << "\n";
if (fh->fail()) return -1;
return 1;
}
-int
+int
write_fastq(gzFile *fh, Seq *href, Read *r) {
stringstream sstr;
sstr << "@" << href->id << "\n"
- << r->seq + r->inline_bc_len << "\n"
- << "+\n"
- << r->phred + r->inline_bc_len << "\n";
+ << r->seq + r->inline_bc_len << "\n"
+ << "+\n"
+ << r->phred + r->inline_bc_len << "\n";
int res = gzputs(*fh, sstr.str().c_str());
diff --git a/src/write.h b/src/write.h
index 6f0ed89..adf6287 100644
--- a/src/write.h
+++ b/src/write.h
@@ -46,7 +46,7 @@ int write_fasta(ofstream *, Seq *, int);
int write_fasta(ofstream *, Seq *, string);
int write_fastq(ofstream *, Seq *, Read *);
int write_fasta(ofstream *, Seq *, Read *);
-
+
int write_fastq(gzFile *, Read *, bool);
int write_fastq(gzFile *, Seq *);
int write_fastq(gzFile *, Seq *, int);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/stacks.git
More information about the debian-med-commit
mailing list