[med-svn] [cufflinks] 01/07: Imported Upstream version 2.2.0
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Fri Apr 11 18:58:12 UTC 2014
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository cufflinks.
commit 9dea31cd641dac222bb807f1d6b01a719ad981f6
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Fri Apr 11 13:01:34 2014 +0200
Imported Upstream version 2.2.0
---
Makefile.in | 8 +-
ax_boost_serialization.m4 | 135 ++++
config.h.in | 3 +
configure | 473 +++++++++++-
configure.ac | 6 +-
make_bin.sh | 21 +-
src/Makefile.am | 29 +-
src/Makefile.in | 72 +-
src/abundances.cpp | 911 ++++++++++++++++++----
src/abundances.h | 264 +++++--
src/assemble.h | 2 +-
src/biascorrection.cpp | 18 +-
src/biascorrection.h | 2 +-
src/bundles.cpp | 179 ++++-
src/bundles.h | 91 ++-
src/clustering.cpp | 18 +-
src/common.cpp | 50 +-
src/common.h | 256 +++++-
src/compress_gtf.cpp | 27 +-
src/cuffcompare.cpp | 59 +-
src/cuffdiff.cpp | 703 ++++++++---------
src/cufflinks.cpp | 109 +--
src/cuffnorm.cpp | 1892 +++++++++++++++++++++++++++++++++++++++++++++
src/cuffquant.cpp | 1623 ++++++++++++++++++++++++++++++++++++++
src/differential.cpp | 609 +++++----------
src/differential.h | 127 +--
src/filters.cpp | 10 +-
src/filters.h | 4 +-
src/genes.cpp | 2 +-
src/gff.cpp | 438 +++++++----
src/gff.h | 146 ++--
src/gff_utils.cpp | 95 ++-
src/gff_utils.h | 6 +-
src/gffread.cpp | 266 ++++---
src/gtf_to_sam.cpp | 19 +-
src/gtf_tracking.cpp | 34 +-
src/gtf_tracking.h | 6 +-
src/hits.cpp | 173 +++++
src/hits.h | 146 +++-
src/replicates.cpp | 113 ++-
src/replicates.h | 102 ++-
src/rounding.h | 25 +-
src/scaffolds.cpp | 14 +-
src/scaffolds.h | 10 +-
src/tracking.cpp | 99 +++
src/tracking.h | 118 +++
46 files changed, 7712 insertions(+), 1801 deletions(-)
diff --git a/Makefile.in b/Makefile.in
index 6c9c999..a9e8232 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -44,9 +44,10 @@ subdir = .
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/ax_boost_base.m4 \
$(top_srcdir)/ax_boost_thread.m4 \
- $(top_srcdir)/ax_boost_system.m4 $(top_srcdir)/ax_bam.m4 \
- $(top_srcdir)/ax_check_zlib.m4 $(top_srcdir)/ax_check_eigen.m4 \
- $(top_srcdir)/configure.ac
+ $(top_srcdir)/ax_boost_system.m4 \
+ $(top_srcdir)/ax_boost_serialization.m4 \
+ $(top_srcdir)/ax_bam.m4 $(top_srcdir)/ax_check_zlib.m4 \
+ $(top_srcdir)/ax_check_eigen.m4 $(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
@@ -88,6 +89,7 @@ BAM_LDFLAGS = @BAM_LDFLAGS@
BAM_LIB = @BAM_LIB@
BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
BOOST_LDFLAGS = @BOOST_LDFLAGS@
+BOOST_SERIALIZATION_LIB = @BOOST_SERIALIZATION_LIB@
BOOST_SYSTEM_LIB = @BOOST_SYSTEM_LIB@
BOOST_THREAD_LIB = @BOOST_THREAD_LIB@
CC = @CC@
diff --git a/ax_boost_serialization.m4 b/ax_boost_serialization.m4
new file mode 100644
index 0000000..1318b03
--- /dev/null
+++ b/ax_boost_serialization.m4
@@ -0,0 +1,135 @@
+# ===========================================================================
+# http://autoconf-archive.cryp.to/ax_boost_serialization.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_BOOST_SERIALIZATION
+#
+# DESCRIPTION
+#
+# Test for Serialization library from the Boost C++ libraries. The macro requires
+# a preceding call to AX_BOOST_BASE. Further documentation is available at
+# <http://randspringer.de/boost/index.html>.
+#
+# This macro calls:
+#
+# AC_SUBST(BOOST_SERIALIZATION_LIB)
+# AC_SUBST(BOOST_SERIALIZATION_LIB)
+#
+# And sets:
+#
+# HAVE_BOOST_SERIALIZATION
+#
+# LICENSE
+#
+# Copyright (c) 2009 Thomas Porschberg <thomas at randspringer.de>
+# Copyright (c) 2009 Michael Tindal
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved.
+
+AC_DEFUN([AX_BOOST_SERIALIZATION],
+[
+ AC_ARG_WITH([boost-serialization],
+ AS_HELP_STRING([--with-boost-serialization@<:@=special-lib@:>@],
+ [use the Serialization library from boost - it is possible to specify a certain library for the linker
+ e.g. --with-boost-serialization=boost_serialization-gcc-mt ]),
+ [
+ if test "$withval" = "no"; then # --without-boost-serialization: every check below is skipped
+ want_boost="no"
+ elif test "$withval" = "yes"; then
+ want_boost="yes"
+ ax_boost_user_serialization_lib=""
+ ax_booth_user_serialization_lib="" # NOTE(review): booth looks like a typo of boost; this variable is not read anywhere in this macro
+ else
+ want_boost="yes"
+ echo "using $withval"
+ ax_boost_user_serialization_lib="$withval" # user-supplied value is used as the library verbatim, with no link test
+ fi
+ ],
+ [want_boost="yes"]
+ )
+
+ if test "x$want_boost" = "xyes"; then # checks run with the Boost flags appended; the saved flags are restored at the end
+ AC_REQUIRE([AC_PROG_CC])
+ AC_REQUIRE([AC_CANONICAL_BUILD])
+ CPPFLAGS_SAVED="$CPPFLAGS"
+ CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+ export CPPFLAGS
+
+ LDFLAGS_SAVED="$LDFLAGS"
+ LDFLAGS="$LDFLAGS $BOOST_LDFLAGS"
+ export LDFLAGS
+
+ AC_CACHE_CHECK(whether the Boost::Serialization library is available,
+ ax_cv_boost_serialization,
+ [AC_LANG_PUSH([C++])
+ CXXFLAGS_SAVE=$CXXFLAGS
+ AC_COMPILE_IFELSE(AC_LANG_PROGRAM([[@%:@include <boost/serialization/utility.hpp>]]),
+ ax_cv_boost_serialization=yes, ax_cv_boost_serialization=no)
+ CXXFLAGS=$CXXFLAGS_SAVE
+ AC_LANG_POP([C++])
+ ])
+ if test "x$ax_cv_boost_serialization" = "xyes"; then
+ AC_SUBST(BOOST_CPPFLAGS)
+
+ AC_DEFINE(HAVE_BOOST_SERIALIZATION,,[define if the Boost::Serialization library is available])
+ BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` # drops the text before the first slash - presumably the -L prefix
+
+ LDFLAGS_SAVE=$LDFLAGS
+
+ if test "x$ax_boost_user_serialization_lib" = "x"; then # no library named by the user: probe what is found in BOOSTLIBDIR
+ for libextension in `ls $BOOSTLIBDIR/libboost_serialization*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.a*$;\1;'`; do
+ ax_lib=${libextension}
+ AC_CHECK_LIB($ax_lib, exit,
+ [BOOST_SERIALIZATION_LIB="-l$ax_lib"; AC_SUBST(BOOST_SERIALIZATION_LIB) link_serialization="yes"; break],
+ [link_serialization="no"])
+ done
+ if test "x$link_serialization" != "xyes"; then # fallback scan over dll-style names when the first scan linked nothing
+ for libextension in `ls $BOOSTLIBDIR/boost_serialization*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.a*$;\1;'` ; do
+ ax_lib=${libextension}
+ AC_CHECK_LIB($ax_lib, exit,
+ [BOOST_SERIALIZATION_LIB="-l$ax_lib"; AC_SUBST(BOOST_SERIALIZATION_LIB) link_serialization="yes"; break],
+ [link_serialization="no"])
+ done
+ fi
+
+ else
+ BOOST_SERIALIZATION_LIB="$ax_boost_user_serialization_lib";
+ AC_SUBST(BOOST_SERIALIZATION_LIB)
+ link_serialization="yes";
+
+
+ fi
+
+ if test "x$ax_boost_user_serialization_lib" = "x"; then # NOTE(review): this if/else repeats the one above line for line - looks redundant
+ for libextension in `ls $BOOSTLIBDIR/libboost_serialization*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.a*$;\1;'`; do
+ ax_lib=${libextension}
+ AC_CHECK_LIB($ax_lib, exit,
+ [BOOST_SERIALIZATION_LIB="-l$ax_lib"; AC_SUBST(BOOST_SERIALIZATION_LIB) link_serialization="yes"; break],
+ [link_serialization="no"])
+ done
+ if test "x$link_serialization" != "xyes"; then
+ for libextension in `ls $BOOSTLIBDIR/boost_serialization*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.a*$;\1;'` ; do
+ ax_lib=${libextension}
+ AC_CHECK_LIB($ax_lib, exit,
+ [BOOST_SERIALIZATION_LIB="-l$ax_lib"; AC_SUBST(BOOST_SERIALIZATION_LIB) link_serialization="yes"; break],
+ [link_serialization="no"])
+ done
+ fi
+
+ else
+ BOOST_SERIALIZATION_LIB="$ax_boost_user_serialization_lib";
+ AC_SUBST(BOOST_SERIALIZATION_LIB)
+ link_serialization="yes";
+
+
+ fi
+ fi
+
+ CPPFLAGS="$CPPFLAGS_SAVED"
+ LDFLAGS="$LDFLAGS_SAVED"
+ fi
+])
diff --git a/config.h.in b/config.h.in
index cf47d1d..3fd2181 100644
--- a/config.h.in
+++ b/config.h.in
@@ -6,6 +6,9 @@
/* define if the Boost library is available */
#undef HAVE_BOOST
+/* define if the Boost::Serialization library is available */
+#undef HAVE_BOOST_SERIALIZATION
+
/* define if the Boost::System library is available */
#undef HAVE_BOOST_SYSTEM
diff --git a/configure b/configure
index df6c676..e8c40f0 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.59 for cufflinks 2.1.1.
+# Generated by GNU Autoconf 2.59 for cufflinks 2.2.0.
#
# Report bugs to <cole@cs.umd.edu>.
#
@@ -269,8 +269,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='cufflinks'
PACKAGE_TARNAME='cufflinks'
-PACKAGE_VERSION='2.1.1'
-PACKAGE_STRING='cufflinks 2.1.1'
+PACKAGE_VERSION='2.2.0'
+PACKAGE_STRING='cufflinks 2.2.0'
PACKAGE_BUGREPORT='cole@cs.umd.edu'
ac_unique_file="config.h.in"
@@ -311,7 +311,7 @@ ac_includes_default="\
# include <unistd.h>
#endif"
-ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA CYGPATH_W PACKAGE VERSION ACLOCAL AUTOCONF AUTOMAKE AUTOHEADER MAKEINFO install_sh STRIP ac_ct_STRIP INS [...]
+ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA CYGPATH_W PACKAGE VERSION ACLOCAL AUTOCONF AUTOMAKE AUTOHEADER MAKEINFO install_sh STRIP ac_ct_STRIP INS [...]
ac_subst_files=''
# Initialize some variables set by options.
@@ -792,7 +792,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures cufflinks 2.1.1 to adapt to many kinds of systems.
+\`configure' configures cufflinks 2.2.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -858,7 +858,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of cufflinks 2.1.1:";;
+ short | recursive ) echo "Configuration of cufflinks 2.2.0:";;
esac
cat <<\_ACEOF
@@ -898,6 +898,11 @@ Optional Packages:
use the System library from boost - it is possible
to specify a certain library for the linker e.g.
--with-boost-system=boost_system-gcc-mt
+ --with-boost-serialization[=special-lib]
+ use the Serialization library from boost - it is
+ possible to specify a certain library for the linker
+ e.g.
+ --with-boost-serialization=boost_serialization-gcc-mt
--with-boost-thread[=special-lib]
use the Thread library from boost - it is possible
to specify a certain library for the linker e.g.
@@ -1021,7 +1026,7 @@ fi
test -n "$ac_init_help" && exit 0
if $ac_init_version; then
cat <<\_ACEOF
-cufflinks configure 2.1.1
+cufflinks configure 2.2.0
generated by GNU Autoconf 2.59
Copyright (C) 2003 Free Software Foundation, Inc.
@@ -1035,7 +1040,7 @@ cat >&5 <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by cufflinks $as_me 2.1.1, which was
+It was created by cufflinks $as_me 2.2.0, which was
generated by GNU Autoconf 2.59. Invocation command line was
$ $0 $@
@@ -1373,7 +1378,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
cat >>confdefs.h <<\_ACEOF
-#define SVN_REVISION "4046M"
+#define SVN_REVISION "4222"
_ACEOF
@@ -1687,7 +1692,7 @@ fi
# Define the identity of the package.
PACKAGE='cufflinks'
- VERSION='2.1.1'
+ VERSION='2.2.0'
cat >>confdefs.h <<_ACEOF
@@ -4893,6 +4898,445 @@ fi
+# Check whether --with-boost-serialization or --without-boost-serialization was given.
+if test "${with_boost_serialization+set}" = set; then
+ withval="$with_boost_serialization"
+
+ if test "$withval" = "no"; then
+ want_boost="no"
+ elif test "$withval" = "yes"; then
+ want_boost="yes"
+ ax_boost_user_serialization_lib=""
+ ax_booth_user_serialization_lib=""
+ else
+ want_boost="yes"
+ echo "using $withval"
+ ax_boost_user_serialization_lib="$withval"
+ fi
+
+else
+ want_boost="yes"
+
+fi;
+
+ if test "x$want_boost" = "xyes"; then
+
+
+ CPPFLAGS_SAVED="$CPPFLAGS"
+ CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+ export CPPFLAGS
+
+ LDFLAGS_SAVED="$LDFLAGS"
+ LDFLAGS="$LDFLAGS $BOOST_LDFLAGS"
+ export LDFLAGS
+
+ echo "$as_me:$LINENO: checking whether the Boost::Serialization library is available" >&5
+echo $ECHO_N "checking whether the Boost::Serialization library is available... $ECHO_C" >&6
+if test "${ax_cv_boost_serialization+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_ext=cc
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+ CXXFLAGS_SAVE=$CXXFLAGS
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <boost/serialization/utility.hpp>
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
+ (eval $ac_compile) 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } &&
+ { ac_try='test -z "$ac_cxx_werror_flag"
+ || test ! -s conftest.err'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; } &&
+ { ac_try='test -s conftest.$ac_objext'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ ax_cv_boost_serialization=yes
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ax_cv_boost_serialization=no
+fi
+rm -f conftest.err conftest.$ac_objext conftest.$ac_ext
+ CXXFLAGS=$CXXFLAGS_SAVE
+ ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+fi
+echo "$as_me:$LINENO: result: $ax_cv_boost_serialization" >&5
+echo "${ECHO_T}$ax_cv_boost_serialization" >&6
+ if test "x$ax_cv_boost_serialization" = "xyes"; then
+
+
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_BOOST_SERIALIZATION
+_ACEOF
+
+ BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/[^\/]*//'`
+
+ LDFLAGS_SAVE=$LDFLAGS
+
+ if test "x$ax_boost_user_serialization_lib" = "x"; then
+ for libextension in `ls $BOOSTLIBDIR/libboost_serialization*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.a*$;\1;'`; do
+ ax_lib=${libextension}
+ as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh`
+echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5
+echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6
+if eval "test \"\${$as_ac_Lib+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-l$ax_lib $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+/* Override any gcc2 internal prototype to avoid an error. */
+#ifdef __cplusplus
+extern "C"
+#endif
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+char exit ();
+int
+main ()
+{
+exit ();
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+ (eval $ac_link) 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } &&
+ { ac_try='test -z "$ac_c_werror_flag"
+ || test ! -s conftest.err'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; } &&
+ { ac_try='test -s conftest$ac_exeext'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ eval "$as_ac_Lib=yes"
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+eval "$as_ac_Lib=no"
+fi
+rm -f conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5
+echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6
+if test `eval echo '${'$as_ac_Lib'}'` = yes; then
+ BOOST_SERIALIZATION_LIB="-l$ax_lib"; link_serialization="yes"; break
+else
+ link_serialization="no"
+fi
+
+ done
+ if test "x$link_serialization" != "xyes"; then
+ for libextension in `ls $BOOSTLIBDIR/boost_serialization*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.a*$;\1;'` ; do
+ ax_lib=${libextension}
+ as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh`
+echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5
+echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6
+if eval "test \"\${$as_ac_Lib+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-l$ax_lib $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+/* Override any gcc2 internal prototype to avoid an error. */
+#ifdef __cplusplus
+extern "C"
+#endif
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+char exit ();
+int
+main ()
+{
+exit ();
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+ (eval $ac_link) 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } &&
+ { ac_try='test -z "$ac_c_werror_flag"
+ || test ! -s conftest.err'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; } &&
+ { ac_try='test -s conftest$ac_exeext'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ eval "$as_ac_Lib=yes"
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+eval "$as_ac_Lib=no"
+fi
+rm -f conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5
+echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6
+if test `eval echo '${'$as_ac_Lib'}'` = yes; then
+ BOOST_SERIALIZATION_LIB="-l$ax_lib"; link_serialization="yes"; break
+else
+ link_serialization="no"
+fi
+
+ done
+ fi
+
+ else
+ BOOST_SERIALIZATION_LIB="$ax_boost_user_serialization_lib";
+
+ link_serialization="yes";
+
+
+ fi
+
+ if test "x$ax_boost_user_serialization_lib" = "x"; then
+ for libextension in `ls $BOOSTLIBDIR/libboost_serialization*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.a*$;\1;'`; do
+ ax_lib=${libextension}
+ as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh`
+echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5
+echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6
+if eval "test \"\${$as_ac_Lib+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-l$ax_lib $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+/* Override any gcc2 internal prototype to avoid an error. */
+#ifdef __cplusplus
+extern "C"
+#endif
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+char exit ();
+int
+main ()
+{
+exit ();
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+ (eval $ac_link) 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } &&
+ { ac_try='test -z "$ac_c_werror_flag"
+ || test ! -s conftest.err'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; } &&
+ { ac_try='test -s conftest$ac_exeext'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ eval "$as_ac_Lib=yes"
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+eval "$as_ac_Lib=no"
+fi
+rm -f conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5
+echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6
+if test `eval echo '${'$as_ac_Lib'}'` = yes; then
+ BOOST_SERIALIZATION_LIB="-l$ax_lib"; link_serialization="yes"; break
+else
+ link_serialization="no"
+fi
+
+ done
+ if test "x$link_serialization" != "xyes"; then
+ for libextension in `ls $BOOSTLIBDIR/boost_serialization*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_serialization*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.a*$;\1;'` ; do
+ ax_lib=${libextension}
+ as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh`
+echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5
+echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6
+if eval "test \"\${$as_ac_Lib+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-l$ax_lib $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+/* Override any gcc2 internal prototype to avoid an error. */
+#ifdef __cplusplus
+extern "C"
+#endif
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+char exit ();
+int
+main ()
+{
+exit ();
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+ (eval $ac_link) 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } &&
+ { ac_try='test -z "$ac_c_werror_flag"
+ || test ! -s conftest.err'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; } &&
+ { ac_try='test -s conftest$ac_exeext'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ eval "$as_ac_Lib=yes"
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+eval "$as_ac_Lib=no"
+fi
+rm -f conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5
+echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6
+if test `eval echo '${'$as_ac_Lib'}'` = yes; then
+ BOOST_SERIALIZATION_LIB="-l$ax_lib"; link_serialization="yes"; break
+else
+ link_serialization="no"
+fi
+
+ done
+ fi
+
+ else
+ BOOST_SERIALIZATION_LIB="$ax_boost_user_serialization_lib";
+
+ link_serialization="yes";
+
+
+ fi
+ fi
+
+ CPPFLAGS="$CPPFLAGS_SAVED"
+ LDFLAGS="$LDFLAGS_SAVED"
+ fi
+
+
+
# Check whether --with-boost-thread or --without-boost-thread was given.
if test "${with_boost_thread+set}" = set; then
withval="$with_boost_thread"
@@ -6966,7 +7410,7 @@ host_os=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
# set CFLAGS and CXXFLAGS
user_CFLAGS=${CFLAGS}
-generic_CFLAGS="-Wall -Wno-strict-aliasing -g -gdwarf-2 -Wunused -Wuninitialized"
+generic_CFLAGS="-Wall -Wno-strict-aliasing -g -gdwarf-2 -Wunused -Wuninitialized -ftemplate-depth-1024"
ext_CFLAGS=""
debug_CFLAGS=""
#echo "${host_cpu}-${host_os}"
@@ -7474,7 +7918,7 @@ fi
# Define the identity of the package.
PACKAGE='cufflinks'
- VERSION='2.1.1'
+ VERSION='2.2.0'
cat >>confdefs.h <<_ACEOF
@@ -8304,7 +8748,7 @@ _ASBOX
} >&5
cat >&5 <<_CSEOF
-This file was extended by cufflinks $as_me 2.1.1, which was
+This file was extended by cufflinks $as_me 2.2.0, which was
generated by GNU Autoconf 2.59. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -8367,7 +8811,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF
ac_cs_version="\\
-cufflinks config.status 2.1.1
+cufflinks config.status 2.2.0
configured by $0, generated by GNU Autoconf 2.59,
with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\"
@@ -8631,6 +9075,7 @@ s, at build_cpu@,$build_cpu,;t t
s, at build_vendor@,$build_vendor,;t t
s, at build_os@,$build_os,;t t
s, at BOOST_SYSTEM_LIB@,$BOOST_SYSTEM_LIB,;t t
+s, at BOOST_SERIALIZATION_LIB@,$BOOST_SERIALIZATION_LIB,;t t
s, at BOOST_THREAD_LIB@,$BOOST_THREAD_LIB,;t t
s, at CPP@,$CPP,;t t
s, at EGREP@,$EGREP,;t t
diff --git a/configure.ac b/configure.ac
index eafc4f8..ee58080 100755
--- a/configure.ac
+++ b/configure.ac
@@ -1,12 +1,13 @@
m4_include([ax_boost_base.m4])
m4_include([ax_boost_thread.m4])
m4_include([ax_boost_system.m4])
+m4_include([ax_boost_serialization.m4])
m4_include([ax_bam.m4])
m4_include([ax_check_zlib.m4])
m4_include([ax_check_eigen.m4])
define([svnversion], esyscmd([sh -c "svnversion|tr -d '\n'"]))dnl
-AC_INIT([cufflinks], [2.1.1], [cole@cs.umd.edu])
+AC_INIT([cufflinks], [2.2.0], [cole@cs.umd.edu])
AC_DEFINE(SVN_REVISION, "svnversion", [SVN Revision])
AC_CONFIG_SRCDIR([config.h.in])
@@ -36,6 +37,7 @@ AM_PATH_PYTHON([2.4])
AX_BOOST_BASE([1.47.0])
AX_BAM
AX_BOOST_SYSTEM
+AX_BOOST_SERIALIZATION
AX_BOOST_THREAD
AX_CHECK_ZLIB()
AX_EIGEN
@@ -59,7 +61,7 @@ AC_CANONICAL_HOST
# set CFLAGS and CXXFLAGS
user_CFLAGS=${CFLAGS}
-generic_CFLAGS="-Wall -Wno-strict-aliasing -g -gdwarf-2 -Wunused -Wuninitialized"
+generic_CFLAGS="-Wall -Wno-strict-aliasing -g -gdwarf-2 -Wunused -Wuninitialized -ftemplate-depth-1024"
ext_CFLAGS=""
debug_CFLAGS=""
#echo "${host_cpu}-${host_os}"
diff --git a/make_bin.sh b/make_bin.sh
index 36efc06..32c2e6e 100755
--- a/make_bin.sh
+++ b/make_bin.sh
@@ -10,10 +10,19 @@ echo "packing up $1.tar.gz, using boost in $2, linking against $3 and using BAM
mkdir $1
#make clean
make distclean
-if [[ $(uname -m) = "x86_64" ]]; then
-echo "Linking statically on x86_64 (only for gcc 4.5+).."
-export LDFLAGS="-static-libgcc -static-libstdc++"
-fi
+if [[ $(uname) = "Darwin" ]]  # Darwin: build against libc++ with a 10.7 minimum deployment target
+then
+ export CFLAGS="-mmacosx-version-min=10.7 -stdlib=libc++"
+ export LDFLAGS="-stdlib=libc++"
+elif [[ $(uname -m) = "x86_64" ]]  # was [ ... "x86_64"] - the missing space before ] made the test always fail
+then
+ echo "Linking statically on x86_64 (only for gcc 4.5+).."
+ export LDFLAGS="-static-libgcc -static-libstdc++"
+else  # any other platform: only warn; CFLAGS and LDFLAGS are left as they were
+ echo "Unrecognized architecture"
+fi
+
+
l2="$2"
l3="$3"
if [[ -z "$l3" ]]; then
@@ -30,7 +39,7 @@ fi
#./configure --enable-intel64 --with-boost=$l2 --with-boost-thread=$l3 --with-bam=$l4 --with-eigen=$l5
-./configure --with-boost=$l2 --with-boost-thread=$l2/lib/libboost_thread.a --with-boost-system=$l2/lib/libboost_system.a --with-bam=$l3 --with-eigen=$l4
+./configure --with-boost=$l2 --with-boost-thread=$l2/lib/libboost_thread.a --with-boost-system=$l2/lib/libboost_system.a --with-bam=$l3 --with-eigen=$l4 --with-boost-serialization=$l2/lib/libboost_serialization.a
make
cp src/cufflinks $1
cp src/cuffcompare $1
@@ -38,6 +47,8 @@ cp src/cuffdiff $1
cp src/cuffmerge $1/cuffmerge
cp src/gffread $1
cp src/gtf_to_sam $1
+cp src/cuffnorm $1
+cp src/cuffquant $1
cp README $1
cp LICENSE $1
cp AUTHORS $1
diff --git a/src/Makefile.am b/src/Makefile.am
index 1cf607b..b46137d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -13,7 +13,9 @@ bin_PROGRAMS = \
cuffdiff \
gtf_to_sam \
compress_gtf \
- gffread
+ gffread \
+ cuffquant \
+ cuffnorm
# cuffcluster
# gtf_reads
@@ -103,6 +105,7 @@ noinst_HEADERS = \
multireads.h \
rounding.h \
negative_binomial_distribution.h \
+ tracking.h \
common.h
noinst_LIBRARIES = libcufflinks.a libgc.a
@@ -177,7 +180,8 @@ libcufflinks_a_SOURCES = \
locfit/weight.c \
replicates.cpp \
multireads.cpp \
- jensen_shannon.cpp
+ jensen_shannon.cpp \
+ tracking.cpp
libgc_a_SOURCES = \
codons.cpp \
@@ -202,7 +206,7 @@ CLEANFILES = $(bin_SCRIPTS)
# (echo '#!$(PYTHON)'; sed '/^#!/d' $<) > $@
cufflinks_SOURCES = cufflinks.cpp
-cufflinks_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+cufflinks_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
cufflinks_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) #$(ZLIB_LDFLAGS)
cuffcompare_SOURCES = cuffcompare.cpp
@@ -212,21 +216,30 @@ gffread_SOURCES = gffread.cpp
gffread_LDADD = libgc.a
cuffdiff_SOURCES = cuffdiff.cpp
-cuffdiff_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+cuffdiff_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
cuffdiff_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
+cuffquant_SOURCES = cuffquant.cpp
+cuffquant_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
+cuffquant_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
+
+cuffnorm_SOURCES = cuffnorm.cpp
+cuffnorm_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
+cuffnorm_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
+
+
gtf_to_sam_SOURCES = gtf_to_sam.cpp
-gtf_to_sam_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+gtf_to_sam_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
gtf_to_sam_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
#cuffcluster_SOURCES = cuffcluster.cpp
-#cuffcluster_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+#cuffcluster_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
#cuffcluster_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
compress_gtf_SOURCES = compress_gtf.cpp
-compress_gtf_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+compress_gtf_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
compress_gtf_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
#gtf_reads_SOURCES = gtf_reads.cpp
-#gtf_reads_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+#gtf_reads_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
#gtf_reads_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) #$(ZLIB_LDFLAGS)
diff --git a/src/Makefile.in b/src/Makefile.in
index 1228939..c066835 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -43,16 +43,17 @@ build_triplet = @build@
host_triplet = @host@
bin_PROGRAMS = cufflinks$(EXEEXT) cuffcompare$(EXEEXT) \
cuffdiff$(EXEEXT) gtf_to_sam$(EXEEXT) compress_gtf$(EXEEXT) \
- gffread$(EXEEXT)
+ gffread$(EXEEXT) cuffquant$(EXEEXT) cuffnorm$(EXEEXT)
subdir = src
DIST_COMMON = $(dist_bin_SCRIPTS) $(noinst_HEADERS) \
$(srcdir)/Makefile.am $(srcdir)/Makefile.in
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/ax_boost_base.m4 \
$(top_srcdir)/ax_boost_thread.m4 \
- $(top_srcdir)/ax_boost_system.m4 $(top_srcdir)/ax_bam.m4 \
- $(top_srcdir)/ax_check_zlib.m4 $(top_srcdir)/ax_check_eigen.m4 \
- $(top_srcdir)/configure.ac
+ $(top_srcdir)/ax_boost_system.m4 \
+ $(top_srcdir)/ax_boost_serialization.m4 \
+ $(top_srcdir)/ax_bam.m4 $(top_srcdir)/ax_check_zlib.m4 \
+ $(top_srcdir)/ax_check_eigen.m4 $(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
mkinstalldirs = $(install_sh) -d
@@ -87,7 +88,7 @@ am_libcufflinks_a_OBJECTS = clustering.$(OBJEXT) \
simul.$(OBJEXT) solve.$(OBJEXT) startlf.$(OBJEXT) \
strings.$(OBJEXT) vari.$(OBJEXT) wdiag.$(OBJEXT) \
weight.$(OBJEXT) replicates.$(OBJEXT) multireads.$(OBJEXT) \
- jensen_shannon.$(OBJEXT)
+ jensen_shannon.$(OBJEXT) tracking.$(OBJEXT)
libcufflinks_a_OBJECTS = $(am_libcufflinks_a_OBJECTS)
libgc_a_AR = $(AR) $(ARFLAGS)
libgc_a_LIBADD =
@@ -104,25 +105,38 @@ compress_gtf_OBJECTS = $(am_compress_gtf_OBJECTS)
am__DEPENDENCIES_1 =
compress_gtf_DEPENDENCIES = libcufflinks.a libgc.a \
$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
- $(am__DEPENDENCIES_1)
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
am_cuffcompare_OBJECTS = cuffcompare.$(OBJEXT)
cuffcompare_OBJECTS = $(am_cuffcompare_OBJECTS)
cuffcompare_DEPENDENCIES = libgc.a
am_cuffdiff_OBJECTS = cuffdiff.$(OBJEXT)
cuffdiff_OBJECTS = $(am_cuffdiff_OBJECTS)
cuffdiff_DEPENDENCIES = libcufflinks.a libgc.a $(am__DEPENDENCIES_1) \
- $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1)
am_cufflinks_OBJECTS = cufflinks.$(OBJEXT)
cufflinks_OBJECTS = $(am_cufflinks_OBJECTS)
cufflinks_DEPENDENCIES = libcufflinks.a libgc.a $(am__DEPENDENCIES_1) \
- $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1)
+am_cuffnorm_OBJECTS = cuffnorm.$(OBJEXT)
+cuffnorm_OBJECTS = $(am_cuffnorm_OBJECTS)
+cuffnorm_DEPENDENCIES = libcufflinks.a libgc.a $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1)
+am_cuffquant_OBJECTS = cuffquant.$(OBJEXT)
+cuffquant_OBJECTS = $(am_cuffquant_OBJECTS)
+cuffquant_DEPENDENCIES = libcufflinks.a libgc.a $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1)
am_gffread_OBJECTS = gffread.$(OBJEXT)
gffread_OBJECTS = $(am_gffread_OBJECTS)
gffread_DEPENDENCIES = libgc.a
am_gtf_to_sam_OBJECTS = gtf_to_sam.$(OBJEXT)
gtf_to_sam_OBJECTS = $(am_gtf_to_sam_OBJECTS)
gtf_to_sam_DEPENDENCIES = libcufflinks.a libgc.a $(am__DEPENDENCIES_1) \
- $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1)
dist_binSCRIPT_INSTALL = $(INSTALL_SCRIPT)
SCRIPTS = $(dist_bin_SCRIPTS)
DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)
@@ -139,12 +153,12 @@ CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
-o $@
SOURCES = $(libcufflinks_a_SOURCES) $(libgc_a_SOURCES) \
$(compress_gtf_SOURCES) $(cuffcompare_SOURCES) \
- $(cuffdiff_SOURCES) $(cufflinks_SOURCES) $(gffread_SOURCES) \
- $(gtf_to_sam_SOURCES)
+ $(cuffdiff_SOURCES) $(cufflinks_SOURCES) $(cuffnorm_SOURCES) \
+ $(cuffquant_SOURCES) $(gffread_SOURCES) $(gtf_to_sam_SOURCES)
DIST_SOURCES = $(libcufflinks_a_SOURCES) $(libgc_a_SOURCES) \
$(compress_gtf_SOURCES) $(cuffcompare_SOURCES) \
- $(cuffdiff_SOURCES) $(cufflinks_SOURCES) $(gffread_SOURCES) \
- $(gtf_to_sam_SOURCES)
+ $(cuffdiff_SOURCES) $(cufflinks_SOURCES) $(cuffnorm_SOURCES) \
+ $(cuffquant_SOURCES) $(gffread_SOURCES) $(gtf_to_sam_SOURCES)
HEADERS = $(noinst_HEADERS)
ETAGS = etags
CTAGS = ctags
@@ -162,6 +176,7 @@ BAM_LDFLAGS = @BAM_LDFLAGS@
BAM_LIB = @BAM_LIB@
BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
BOOST_LDFLAGS = @BOOST_LDFLAGS@
+BOOST_SERIALIZATION_LIB = @BOOST_SERIALIZATION_LIB@
BOOST_SYSTEM_LIB = @BOOST_SYSTEM_LIB@
BOOST_THREAD_LIB = @BOOST_THREAD_LIB@
CC = @CC@
@@ -347,6 +362,7 @@ noinst_HEADERS = \
multireads.h \
rounding.h \
negative_binomial_distribution.h \
+ tracking.h \
common.h
noinst_LIBRARIES = libcufflinks.a libgc.a
@@ -420,7 +436,8 @@ libcufflinks_a_SOURCES = \
locfit/weight.c \
replicates.cpp \
multireads.cpp \
- jensen_shannon.cpp
+ jensen_shannon.cpp \
+ tracking.cpp
libgc_a_SOURCES = \
codons.cpp \
@@ -445,24 +462,30 @@ CLEANFILES = $(bin_SCRIPTS)
#.py:
# (echo '#!$(PYTHON)'; sed '/^#!/d' $<) > $@
cufflinks_SOURCES = cufflinks.cpp
-cufflinks_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+cufflinks_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
cufflinks_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) #$(ZLIB_LDFLAGS)
cuffcompare_SOURCES = cuffcompare.cpp
cuffcompare_LDADD = libgc.a
gffread_SOURCES = gffread.cpp
gffread_LDADD = libgc.a
cuffdiff_SOURCES = cuffdiff.cpp
-cuffdiff_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+cuffdiff_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
cuffdiff_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
+cuffquant_SOURCES = cuffquant.cpp
+cuffquant_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
+cuffquant_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
+cuffnorm_SOURCES = cuffnorm.cpp
+cuffnorm_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
+cuffnorm_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
gtf_to_sam_SOURCES = gtf_to_sam.cpp
-gtf_to_sam_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+gtf_to_sam_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
gtf_to_sam_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
#cuffcluster_SOURCES = cuffcluster.cpp
-#cuffcluster_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+#cuffcluster_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
#cuffcluster_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
compress_gtf_SOURCES = compress_gtf.cpp
-compress_gtf_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+compress_gtf_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
compress_gtf_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS)
all: all-am
@@ -543,6 +566,12 @@ cuffdiff$(EXEEXT): $(cuffdiff_OBJECTS) $(cuffdiff_DEPENDENCIES)
cufflinks$(EXEEXT): $(cufflinks_OBJECTS) $(cufflinks_DEPENDENCIES)
@rm -f cufflinks$(EXEEXT)
$(CXXLINK) $(cufflinks_LDFLAGS) $(cufflinks_OBJECTS) $(cufflinks_LDADD) $(LIBS)
+cuffnorm$(EXEEXT): $(cuffnorm_OBJECTS) $(cuffnorm_DEPENDENCIES)
+ @rm -f cuffnorm$(EXEEXT)
+ $(CXXLINK) $(cuffnorm_LDFLAGS) $(cuffnorm_OBJECTS) $(cuffnorm_LDADD) $(LIBS)
+cuffquant$(EXEEXT): $(cuffquant_OBJECTS) $(cuffquant_DEPENDENCIES)
+ @rm -f cuffquant$(EXEEXT)
+ $(CXXLINK) $(cuffquant_LDFLAGS) $(cuffquant_OBJECTS) $(cuffquant_LDADD) $(LIBS)
gffread$(EXEEXT): $(gffread_OBJECTS) $(gffread_DEPENDENCIES)
@rm -f gffread$(EXEEXT)
$(CXXLINK) $(gffread_LDFLAGS) $(gffread_OBJECTS) $(gffread_LDADD) $(LIBS)
@@ -598,6 +627,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cuffcompare.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cuffdiff.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cufflinks.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cuffnorm.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cuffquant.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dens_haz.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dens_int.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dens_odi.Po@am__quote@
@@ -658,6 +689,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/startlf.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strings.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tokenize.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tracking.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vari.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wdiag.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/weight.Po@am__quote@
@@ -1579,7 +1611,7 @@ uninstall-am: uninstall-binPROGRAMS uninstall-dist_binSCRIPTS \
#gtf_reads_SOURCES = gtf_reads.cpp
-#gtf_reads_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB)
+#gtf_reads_LDADD = libcufflinks.a libgc.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) $(BAM_LIB)
#gtf_reads_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) #$(ZLIB_LDFLAGS)
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/src/abundances.cpp b/src/abundances.cpp
index e1992e9..125a36f 100644
--- a/src/abundances.cpp
+++ b/src/abundances.cpp
@@ -44,6 +44,7 @@
#include "sampling.h"
#include "jensen_shannon.h"
#include "rounding.h"
+#include "clustering.h"
#include "negative_binomial_distribution.h"
@@ -416,7 +417,7 @@ bool fit_negbin_dist(const vector<double> samples, double& r, double& p)
//#define USE_LOG_CACHE
-void compute_compatibilities(const vector<shared_ptr<Abundance> >& transcripts,
+void compute_compatibilities(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& alignments,
vector<vector<char> >& compatibilities)
{
@@ -433,7 +434,7 @@ void compute_compatibilities(const vector<shared_ptr<Abundance> >& transcripts,
for (int j = 0; j < N; ++j)
{
- shared_ptr<Scaffold> transfrag_j = transcripts[j]->transfrag();
+ boost::shared_ptr<Scaffold> transfrag_j = transcripts[j]->transfrag();
for (int i = 0; i < M; ++i)
{
if (transfrag_j->contains(alignment_scaffs[i])
@@ -445,12 +446,12 @@ void compute_compatibilities(const vector<shared_ptr<Abundance> >& transcripts,
}
}
-AbundanceGroup::AbundanceGroup(const vector<shared_ptr<Abundance> >& abundances,
+AbundanceGroup::AbundanceGroup(const vector<boost::shared_ptr<Abundance> >& abundances,
const ublas::matrix<double>& gamma_covariance,
const ublas::matrix<double>& iterated_exp_count_covariance,
const ublas::matrix<double>& count_covariance,
const ublas::matrix<double>& fpkm_covariance,
- const set<shared_ptr<ReadGroupProperties const> >& rg_props) :
+ const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props) :
_abundances(abundances),
_iterated_exp_count_covariance(iterated_exp_count_covariance),
_count_covariance(count_covariance),
@@ -495,7 +496,7 @@ AbundanceGroup::AbundanceGroup(const vector<shared_ptr<Abundance> >& abundances,
//cerr << _fpkm_covariance << endl;
}
- assert (FPKM() == 0 || fpkm_var > 0 || status() != NUMERIC_OK);
+ // assert (FPKM() == 0 || fpkm_var > 0 || status() != NUMERIC_OK);
}
for (size_t i = 0; i < _abundances.size(); ++i)
@@ -524,7 +525,7 @@ AbundanceStatus AbundanceGroup::status() const
{
bool has_lowdata_member = false;
bool has_ok_member = false;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
if (ab->status() == NUMERIC_FAIL)
{
@@ -560,7 +561,7 @@ void TranscriptAbundance::FPKM_variance(double v)
bool AbundanceGroup::has_member_with_status(AbundanceStatus member_status) const
{
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
if (ab->status() == member_status)
{
@@ -574,7 +575,7 @@ double AbundanceGroup::num_fragments() const
{
double num_f = 0;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
num_f += ab->num_fragments();
}
@@ -586,7 +587,7 @@ CountPerReplicateTable AbundanceGroup::num_fragments_by_replicate() const
{
CountPerReplicateTable cpr;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
if (cpr.empty())
{
@@ -614,7 +615,7 @@ FPKMPerReplicateTable AbundanceGroup::FPKM_by_replicate() const
{
FPKMPerReplicateTable fpr;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
FPKMPerReplicateTable ab_fpr = ab->FPKM_by_replicate();
@@ -638,7 +639,7 @@ StatusPerReplicateTable AbundanceGroup::status_by_replicate() const
{
StatusPerReplicateTable fpr;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
if (fpr.empty())
{
@@ -684,7 +685,7 @@ double AbundanceGroup::mass_variance() const
{
double mass_var = 0;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
mass_var += ab->mass_variance();
}
@@ -723,7 +724,7 @@ double AbundanceGroup::FPKM() const
{
double fpkm = 0;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
fpkm += ab->FPKM();
}
@@ -735,7 +736,7 @@ double AbundanceGroup::gamma() const
{
double gamma = 0;
- BOOST_FOREACH(shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
{
gamma += ab->gamma();
}
@@ -743,6 +744,40 @@ double AbundanceGroup::gamma() const
return gamma;
}
+void TranscriptAbundance::clear_non_serialized_data() // Empties this transcript's FPKM samples and conditional probabilities and drops its transfrag.
+{
+ _fpkm_samples.clear();
+ std::vector<double>().swap(_fpkm_samples); // swap with an empty temporary so the old buffer is destroyed with it
+ // NOTE(review): presumably these members are the ones left out of serialization, per the function name -- confirm
+ if (_cond_probs) // the vector is reached through _cond_probs, so skip it when unset
+ {
+ _cond_probs->clear();
+ std::vector<double>().swap(*_cond_probs); // _cond_probs itself is not reassigned; it now refers to an empty vector
+ }
+
+ if (_transfrag)
+ {
+ _transfrag->clear_hits(); // called on the Scaffold before this object lets go of it
+ _transfrag = boost::shared_ptr<Scaffold>(); // reset to an empty shared_ptr
+ }
+}
+
+void AbundanceGroup::clear_non_serialized_data() // Clears every member abundance, then empties the group's own sample vectors.
+{
+ // Members first: each element of _abundances gets its own clear_non_serialized_data() call.
+ for (size_t i = 0; i < _abundances.size(); ++i)
+ {
+ _abundances[i]->clear_non_serialized_data();
+ }
+ // Each clear() below is followed by a swap with an empty temporary, so the old buffer is destroyed with that temporary.
+ _fpkm_samples.clear();
+ std::vector<double>().swap(_fpkm_samples);
+ _member_fpkm_samples.clear();
+ std::vector<Eigen::VectorXd>().swap(_member_fpkm_samples);
+ _assigned_count_samples.clear();
+ std::vector<Eigen::VectorXd>().swap(_assigned_count_samples);
+}
+
void AbundanceGroup::filter_group(const vector<bool>& to_keep,
AbundanceGroup& filtered_group) const
{
@@ -761,9 +796,9 @@ void AbundanceGroup::filter_group(const vector<bool>& to_keep,
ublas::matrix<double> new_count_cov = ublas::zero_matrix<double>(num_kept,num_kept);
ublas::matrix<double> new_fpkm_cov = ublas::zero_matrix<double>(num_kept,num_kept);
- vector<shared_ptr<Abundance> > new_ab;
+ vector<boost::shared_ptr<Abundance> > new_ab;
- vector<vector<double> > new_fpkm_samples(_fpkm_samples.size(), vector<double>(num_kept, 0));
+ //vector<vector<double> > new_fpkm_samples(_fpkm_samples.size(), vector<double>(num_kept, 0));
// rebuild covariance matrix and abundance vector after filtration
@@ -817,12 +852,12 @@ void AbundanceGroup::filter_group(const vector<bool>& to_keep,
filtered_group.description(_description);
}
-void AbundanceGroup::get_transfrags(vector<shared_ptr<Abundance> >& transfrags) const
+void AbundanceGroup::get_transfrags(vector<boost::shared_ptr<Abundance> >& transfrags) const
{
transfrags.clear();
- BOOST_FOREACH(shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> pA, _abundances)
{
- shared_ptr<Scaffold> pS = pA->transfrag();
+ boost::shared_ptr<Scaffold> pS = pA->transfrag();
if (pS)
{
transfrags.push_back(pA);
@@ -834,7 +869,7 @@ set<string> AbundanceGroup::gene_id() const
{
set<string> s;
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
set<string> sub = pA->gene_id();
s.insert(sub.begin(), sub.end());
@@ -847,7 +882,7 @@ set<string> AbundanceGroup::gene_name() const
{
set<string> s;
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
set<string> sub = pA->gene_name();
s.insert(sub.begin(), sub.end());
@@ -861,7 +896,7 @@ set<string> AbundanceGroup::tss_id() const
{
set<string> s;
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
set<string> sub = pA->tss_id();
s.insert(sub.begin(), sub.end());
@@ -874,7 +909,7 @@ set<string> AbundanceGroup::protein_id() const
{
set<string> s;
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
set<string> sub = pA->protein_id();
s.insert(sub.begin(), sub.end());
@@ -887,7 +922,7 @@ const string& AbundanceGroup::locus_tag() const
{
static string default_locus_tag = "-";
const string* pLast = NULL;
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
if (pLast)
{
@@ -903,7 +938,7 @@ const string& AbundanceGroup::locus_tag() const
{
return *pLast;
}
- assert (false);
+ //assert (false);
return default_locus_tag;
}
@@ -911,7 +946,7 @@ const string& AbundanceGroup::reference_tag() const
{
static string default_reference_tag = "-";
const string* pLast = NULL;
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
if (pLast)
{
@@ -927,7 +962,7 @@ const string& AbundanceGroup::reference_tag() const
{
return *pLast;
}
- assert (false);
+ //assert (false);
return default_reference_tag;
}
@@ -937,7 +972,7 @@ double AbundanceGroup::effective_length() const
double group_fpkm = FPKM();
if (group_fpkm == 0)
return 0;
- BOOST_FOREACH (shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
{
eff_len += (ab->effective_length() * (ab->FPKM() / group_fpkm));
}
@@ -952,21 +987,21 @@ double AbundanceGroup::effective_length() const
// {
// if (!alignments[i].left_alignment())
// continue;
-// shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
+// boost::shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
//
// _read_group_props.insert(rg_props;
// }
//}
void AbundanceGroup::collect_per_replicate_mass(const vector<MateHit>& alignments,
- vector<shared_ptr<Abundance> >& transcripts)
+ vector<boost::shared_ptr<Abundance> >& transcripts)
{
size_t M = alignments.size();
size_t N = transcripts.size();
//_count_per_replicate.clear();
- for (map<shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
+ for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
itr != _count_per_replicate.end();
++itr)
{
@@ -976,9 +1011,9 @@ void AbundanceGroup::collect_per_replicate_mass(const vector<MateHit>& alignment
if (transcripts.empty())
return;
- //map<shared_ptr<ReadGroupProperties const>, double> count_per_replicate;
+ //map<boost::shared_ptr<ReadGroupProperties const>, double> count_per_replicate;
- vector<shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
+ vector<boost::shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
compute_cond_probs_and_effective_lengths(alignments, transcripts, mapped_transcripts);
for (size_t i = 0; i < M; ++i)
@@ -997,9 +1032,9 @@ void AbundanceGroup::collect_per_replicate_mass(const vector<MateHit>& alignment
}
if (mapped)
{
- shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
+ boost::shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
//assert (parent != NULL);
- pair<map<shared_ptr<ReadGroupProperties const>, double>::iterator, bool> inserted;
+ pair<map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator, bool> inserted;
inserted = _count_per_replicate.insert(make_pair(rg_props, 0.0));
_read_group_props.insert(rg_props);
@@ -1012,7 +1047,7 @@ void AbundanceGroup::collect_per_replicate_mass(const vector<MateHit>& alignment
}
}
-void AbundanceGroup::calculate_locus_scaled_mass_and_variance(const vector<shared_ptr<Abundance> >& transcripts)
+void AbundanceGroup::calculate_locus_scaled_mass_and_variance(const vector<boost::shared_ptr<Abundance> >& transcripts)
{
size_t N = transcripts.size();
@@ -1029,11 +1064,11 @@ void AbundanceGroup::calculate_locus_scaled_mass_and_variance(const vector<share
vector<double> avg_mass_variances(N, 0.0);
double external_scale_factor = -1.0;
- for (map<shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
+ for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
itr != _count_per_replicate.end();
++itr)
{
- shared_ptr<ReadGroupProperties const> rg_props = itr->first;
+ boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
if (external_scale_factor < 0)
{
@@ -1051,7 +1086,7 @@ void AbundanceGroup::calculate_locus_scaled_mass_and_variance(const vector<share
double scaled_mass = itr->second;
double scaled_total_mass = rg_props->normalized_map_mass();
avg_X_g += scaled_mass;
- shared_ptr<MassDispersionModel const> disperser = rg_props->mass_dispersion_model();
+ boost::shared_ptr<MassDispersionModel const> disperser = rg_props->mass_dispersion_model();
for (size_t j = 0; j < N; ++j)
{
double scaled_variance;
@@ -1103,7 +1138,7 @@ void AbundanceGroup::calculate_locus_scaled_mass_and_variance(const vector<share
int total_cond_prob_calls = 0;
void collapse_equivalent_hits(const vector<MateHit>& alignments,
- vector<shared_ptr<Abundance> >& transcripts,
+ vector<boost::shared_ptr<Abundance> >& transcripts,
vector<MateHit>& nr_alignments,
vector<double>& log_conv_factors,
bool require_overlap = true)
@@ -1141,7 +1176,7 @@ void collapse_equivalent_hits(const vector<MateHit>& alignments,
{
for (int j = 0; j < N; ++j)
{
- shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
+ boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
if (compatibilities[j][i]==1)
{
@@ -1196,7 +1231,7 @@ void collapse_equivalent_hits(const vector<MateHit>& alignments,
cond_probs_k = &cached_cond_probs[k];
for (int j = 0; j < N; ++j)
{
- shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
+ boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
if (compatibilities[j][k]==1)
{
@@ -1283,7 +1318,7 @@ void collapse_equivalent_hits(const vector<MateHit>& alignments,
for (int j = 0; j < N; ++j)
{
- shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
+ boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
vector<double>& cond_probs = *(new vector<double>(nr_alignments.size(),0));
BiasCorrectionHelper& bch = bchs[j];
@@ -1323,7 +1358,7 @@ void collapse_equivalent_hits(const vector<MateHit>& alignments,
}
void collapse_equivalent_hits_helper(const vector<MateHit>& alignments,
- vector<shared_ptr<Abundance> >& transcripts,
+ vector<boost::shared_ptr<Abundance> >& transcripts,
vector<MateHit>& nr_alignments,
vector<double>& log_conv_factors)
{
@@ -1388,7 +1423,7 @@ void collapse_equivalent_hits_helper(const vector<MateHit>& alignments,
}
-AbundanceStatus bootstrap_gamma_mle(const vector<shared_ptr<Abundance> >& transcripts,
+AbundanceStatus bootstrap_gamma_mle(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& nr_alignments,
const vector<double>& log_conv_factors,
ublas::vector<double>& gamma_map_estimate,
@@ -1553,7 +1588,7 @@ AbundanceStatus bootstrap_gamma_mle(const vector<shared_ptr<Abundance> >& transc
-AbundanceStatus bootstrap_gammas(const vector<shared_ptr<Abundance> >& transcripts,
+AbundanceStatus bootstrap_gammas(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& alignments,
const vector<double>& log_conv_factors,
ublas::vector<double>& gamma_estimate,
@@ -1678,7 +1713,7 @@ bool generate_count_assignment_samples(int num_draws,
return true;
}
-void calculate_gamma_mle_covariance(const std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
+void calculate_gamma_mle_covariance(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
ublas::vector<double>& estimated_gamma_mean,
ublas::matrix<double>& estimated_gamma_covariance)
{
@@ -1697,7 +1732,7 @@ void calculate_gamma_mle_covariance(const std::map<shared_ptr<ReadGroupPropertie
return;
}
- for(std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
+ for(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
itr != ab_group_per_replicate.end();
++itr)
{
@@ -1757,7 +1792,7 @@ void calculate_gamma_mle_covariance(const std::map<shared_ptr<ReadGroupPropertie
}
}
-void calculate_fragment_assignment_distribution(const std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
+void calculate_fragment_assignment_distribution(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate,
ublas::vector<double>& estimated_gamma_mean,
ublas::matrix<double>& estimated_gamma_covariance,
vector<ublas::vector<double> >& all_assigned_count_samples)
@@ -1778,7 +1813,7 @@ void calculate_fragment_assignment_distribution(const std::map<shared_ptr<ReadGr
return;
}
- for(std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
+ for(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
itr != ab_group_per_replicate.end();
++itr)
{
@@ -1790,7 +1825,7 @@ void calculate_fragment_assignment_distribution(const std::map<shared_ptr<ReadGr
ublas::matrix<double> count_covariance = itr->second->iterated_count_cov();
ublas::matrix<double> mle_error = ublas::zero_matrix<double>(count_mean.size(), count_mean.size());
- shared_ptr<const MleErrorModel> mle_model = itr->first->mle_error_model();
+ boost::shared_ptr<const MleErrorModel> mle_model = itr->first->mle_error_model();
if (mle_model != NULL)
{
for (size_t i = 0; i < count_mean.size(); ++i)
@@ -1884,8 +1919,9 @@ void calculate_fragment_assignment_distribution(const std::map<shared_ptr<ReadGr
//
//}
-void AbundanceGroup::calculate_abundance_group_variance(const vector<shared_ptr<Abundance> >& transcripts,
- const std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate)
+
+void AbundanceGroup::calculate_abundance_group_variance(const vector<boost::shared_ptr<Abundance> >& transcripts,
+ const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate)
{
if (final_est_run) // Only on last estimation run
{
@@ -1936,22 +1972,22 @@ void AbundanceGroup::calculate_abundance_group_variance(const vector<shared_ptr<
}
- //cerr << _count_covariance << endl;
- for (size_t i = 0; i < _abundances.size(); ++i)
- {
- for (size_t j = 0; j < _abundances.size(); ++j)
- {
- if (i != j)
- {
- assert(!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
- if (_abundances[i]->transfrag()->contains(*_abundances[j]->transfrag()) &&
- Scaffold::compatible(*_abundances[i]->transfrag(),*_abundances[j]->transfrag()))
- {
- _abundances[j]->status(NUMERIC_LOW_DATA);
- }
- }
- }
- }
+// //cerr << _count_covariance << endl;
+// for (size_t i = 0; i < _abundances.size(); ++i)
+// {
+// for (size_t j = 0; j < _abundances.size(); ++j)
+// {
+// if (i != j)
+// {
+// assert(!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
+// if (_abundances[i]->transfrag()->contains(*_abundances[j]->transfrag()) &&
+// Scaffold::compatible(*_abundances[i]->transfrag(),*_abundances[j]->transfrag()))
+// {
+// _abundances[j]->status(NUMERIC_LOW_DATA);
+// }
+// }
+// }
+// }
//assert (FPKM() == 0 || _assigned_count_samples.size() > 0);
@@ -1961,10 +1997,10 @@ void AbundanceGroup::calculate_abundance_group_variance(const vector<shared_ptr<
void AbundanceGroup::calculate_abundance_for_replicate(const vector<MateHit>& alignments, bool perform_collapse)
{
- vector<shared_ptr<Abundance> > transcripts;
+ vector<boost::shared_ptr<Abundance> > transcripts;
get_transfrags(transcripts);
- vector<shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
+ vector<boost::shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
vector<MateHit> nr_alignments;
vector<double> joint_mle_gammas;
@@ -2000,7 +2036,7 @@ void AbundanceGroup::calculate_abundance_for_replicate(const vector<MateHit>& al
calculate_locus_scaled_mass_and_variance(transcripts);
}
-void AbundanceGroup::aggregate_replicate_abundances(const map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate)
+void AbundanceGroup::aggregate_replicate_abundances(const map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate)
{
for (size_t i = 0; i < _abundances.size(); ++i)
{
@@ -2012,6 +2048,7 @@ void AbundanceGroup::aggregate_replicate_abundances(const map<shared_ptr<ReadGro
double avg_num_frags = 0.0;
double avg_gamma = 0.0;
double avg_mass_variance = 0.0;
+ double avg_effective_length = 0.0;
map<AbundanceStatus, int> status_table;
status_table[NUMERIC_OK] = 0;
@@ -2019,36 +2056,47 @@ void AbundanceGroup::aggregate_replicate_abundances(const map<shared_ptr<ReadGro
status_table[NUMERIC_FAIL] = 0;
status_table[NUMERIC_HI_DATA] = 0;
- for (std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
+ for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
itr != ab_group_per_replicate.end();
++itr)
{
status_table[itr->second->abundances()[i]->status()] += 1;
}
- for (std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
+ for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
itr != ab_group_per_replicate.end();
++itr)
{
+ const vector<boost::shared_ptr<Abundance> >& sc_ab = itr->second->abundances();
assert(itr->second->abundances().size() == _abundances.size());
cpr[itr->first] = itr->second->abundances()[i]->num_fragments();
//fprintf(stderr, "FPKM = %lg\n", itr->second->abundances()[i]->FPKM());
fpr[itr->first] = itr->second->abundances()[i]->FPKM();
spr[itr->first] = itr->second->abundances()[i]->status();
+ /*
if (itr->second->abundances()[i]->status() == NUMERIC_OK)
{
avg_fpkm += itr->second->abundances()[i]->FPKM() / (double)status_table[NUMERIC_OK];
avg_num_frags += itr->second->abundances()[i]->num_fragments() / (double)status_table[NUMERIC_OK];
avg_gamma += itr->second->abundances()[i]->gamma() / (double)status_table[NUMERIC_OK];
avg_mass_variance += itr->second->abundances()[i]->mass_variance() / (double)status_table[NUMERIC_OK];
+ avg_effective_length += itr->second->abundances()[i]->effective_length() / (double)status_table[NUMERIC_OK];
}
+ */
+ avg_fpkm += itr->second->abundances()[i]->FPKM() / (double)ab_group_per_replicate.size();
+ avg_num_frags += itr->second->abundances()[i]->num_fragments() / (double)ab_group_per_replicate.size();
+ avg_gamma += itr->second->abundances()[i]->gamma() / (double)ab_group_per_replicate.size();
+ avg_mass_variance += itr->second->abundances()[i]->mass_variance() / (double)ab_group_per_replicate.size();
+ avg_effective_length += itr->second->abundances()[i]->effective_length() / (double)ab_group_per_replicate.size();
+
}
_abundances[i]->FPKM(avg_fpkm);
_abundances[i]->gamma(avg_gamma);
_abundances[i]->num_fragments(avg_num_frags);
_abundances[i]->mass_variance(avg_mass_variance);
+ _abundances[i]->effective_length(avg_effective_length);
// if there was at least one good replicate, set the status to OK. The reduced power will be reflected
@@ -2063,14 +2111,14 @@ void AbundanceGroup::aggregate_replicate_abundances(const map<shared_ptr<ReadGro
{
_abundances[i]->status(NUMERIC_LOW_DATA);
}
- if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_FAIL])
+ else if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_FAIL])
{
_abundances[i]->status(NUMERIC_HI_DATA);
}
- if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_LOW_DATA]) // not sure this ever happens in practice
- {
- _abundances[i]->status(NUMERIC_FAIL);
- }
+// else if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_LOW_DATA]) // not sure this ever happens in practice
+// {
+// _abundances[i]->status(NUMERIC_FAIL);
+// }
else
{
_abundances[i]->status(NUMERIC_FAIL);
@@ -2083,15 +2131,15 @@ void AbundanceGroup::aggregate_replicate_abundances(const map<shared_ptr<ReadGro
}
}
-void AbundanceGroup::calculate_abundance(const vector<MateHit>& alignments, bool perform_collapse)
+void AbundanceGroup::calculate_abundance(const vector<MateHit>& alignments, bool perform_collapse, bool calculate_variance)
{
- vector<shared_ptr<Abundance> > transcripts;
+ vector<boost::shared_ptr<Abundance> > transcripts;
get_transfrags(transcripts);
- map<shared_ptr<ReadGroupProperties const >, vector<MateHit> > alignments_per_read_group;
+ map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> > alignments_per_read_group;
- for(std::set<shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
+ for(std::set<boost::shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
itr != _read_group_props.end();
++itr)
{
@@ -2110,18 +2158,29 @@ void AbundanceGroup::calculate_abundance(const vector<MateHit>& alignments, bool
collect_per_replicate_mass(alignments, transcripts);
- std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> > ab_group_per_replicate;
+ std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> > ab_group_per_replicate;
calculate_per_replicate_abundances(transcripts,
alignments_per_read_group,
ab_group_per_replicate);
-
- aggregate_replicate_abundances(ab_group_per_replicate);
-
- calculate_abundance_group_variance(transcripts, ab_group_per_replicate);
+
+ std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> > const_ab_group_per_replicate;
+
+ for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >::iterator itr = ab_group_per_replicate.begin();
+ itr != ab_group_per_replicate.end(); ++itr)
+ {
+ const_ab_group_per_replicate[itr->first] = itr->second;
+ }
+
+ aggregate_replicate_abundances(const_ab_group_per_replicate);
+
+ if (calculate_variance)
+ {
+ calculate_abundance_group_variance(transcripts, const_ab_group_per_replicate);
+ }
}
-void AbundanceGroup::update_multi_reads(const vector<MateHit>& alignments, vector<shared_ptr<Abundance> > transcripts)
+void AbundanceGroup::update_multi_reads(const vector<MateHit>& alignments, vector<boost::shared_ptr<Abundance> > transcripts)
{
size_t M = alignments.size();
size_t N = transcripts.size();
@@ -2296,7 +2355,7 @@ bool estimate_count_variance(long double& variance,
bool simulate_count_covariance(const vector<double>& num_fragments,
const vector<double>& frag_variances,
const ublas::matrix<double>& iterated_exp_count_covariance,
- const vector<shared_ptr<Abundance> >& transcripts,
+ const vector<boost::shared_ptr<Abundance> >& transcripts,
ublas::matrix<double>& count_covariance,
vector<Eigen::VectorXd>& assigned_count_samples,
vector<ublas::vector<double> >* gamma_samples = NULL)
@@ -2436,6 +2495,13 @@ bool simulate_count_covariance(const vector<double>& num_fragments,
double over_disp_scale = fit_var - frags;
r /= over_disp_scale;
+ double after_decimal = r - (long)r;
+ //fprintf( stderr, "after decimal = %lg\n", after_decimal);
+ if (uniform_gen() < after_decimal)
+ r = floor(r);
+ else
+ r = ceil(r);
+
if (r == 0)
{
generated_and_assigned_counts(j) = 0;
@@ -2569,11 +2635,11 @@ void AbundanceGroup::generate_fpkm_samples()
double M = 0;
- for (set<shared_ptr<ReadGroupProperties const> >::iterator itr = _read_group_props.begin();
+ for (set<boost::shared_ptr<ReadGroupProperties const> >::iterator itr = _read_group_props.begin();
itr != _read_group_props.end();
++itr)
{
- shared_ptr<ReadGroupProperties const> rg_props = *itr;
+ boost::shared_ptr<ReadGroupProperties const> rg_props = *itr;
M += rg_props->normalized_map_mass();
if (external_scale_factor < 0)
@@ -2601,7 +2667,7 @@ void AbundanceGroup::generate_fpkm_samples()
for (size_t j = 0; j < sample.size(); ++j)
{
- double fpkm_sample = sample[j] / M;
+ double fpkm_sample = sample[j] / M;
if (_abundances[j]->effective_length() > 0)
{
@@ -2677,11 +2743,11 @@ void AbundanceGroup::calculate_FPKM_covariance()
double M = 0;
- for (map<shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
+ for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
itr != _count_per_replicate.end();
++itr)
{
- shared_ptr<ReadGroupProperties const> rg_props = itr->first;
+ boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
M += rg_props->normalized_map_mass();
if (external_scale_factor < 0)
@@ -2777,12 +2843,156 @@ void AbundanceGroup::calculate_FPKM_covariance()
void AbundanceGroup::calculate_conf_intervals()
{
+ // We only really ever call this function for primary abundance groups
+ // (i.e. the transcript groups and read bundles with which we calculate
+ // transcript MLE expression levels). Genes, TSS groups, etc. get broken
+ // off of primary bundles, so we should not call this function on those
+ // secondary groups. The group splitting code needs to manage the task
+ // of splitting up all the various covariance matrices we're calculating
+ // here.
+ if (status() == NUMERIC_OK)
+ {
+ // This will compute the transcript level FPKM confidence intervals
+ for (size_t j = 0; j < _abundances.size(); ++j)
+ {
+ long double fpkm_var = _abundances[j]->FPKM_variance();
+ double FPKM_hi = 0.0;
+ double FPKM_lo = 0.0;
+ if (_abundances[j]->status() != NUMERIC_FAIL)
+ {
+ FPKM_hi = _abundances[j]->FPKM() + 2 * sqrt(fpkm_var);
+ FPKM_lo = max(0.0, (double)(_abundances[j]->FPKM() - 2 * sqrt(fpkm_var)));
+ if (!(FPKM_lo <= _abundances[j]->FPKM() && _abundances[j]->FPKM() <= FPKM_hi))
+ {
+ //fprintf(stderr, "Error: confidence intervals are illegal! var = %Lg, fpkm = %lg, lo = %lg, hi %lg, status = %d\n", fpkm_var, _abundances[j]->FPKM(), FPKM_lo, FPKM_hi, _abundances[j]->status());
+ }
+ assert (FPKM_lo <= _abundances[j]->FPKM() && _abundances[j]->FPKM() <= FPKM_hi);
+ ConfidenceInterval conf(FPKM_lo, FPKM_hi);
+ _abundances[j]->FPKM_conf(conf);
+ //_abundances[j]->FPKM_variance(fpkm_var);
+ }
+ else
+ {
+ // we shouldn't be able to get here
+ assert(false);
+ // TODO: nothing to do here?
+ }
+ }
+
+ // Now build a confidence interval for the whole abundance group
+ double group_fpkm = FPKM();
+ if (group_fpkm > 0.0)
+ {
+ double FPKM_hi = FPKM() + 2 * sqrt(FPKM_variance());
+ double FPKM_lo = max(0.0, FPKM() - 2 * sqrt(FPKM_variance()));
+ ConfidenceInterval conf(FPKM_lo, FPKM_hi);
+ FPKM_conf(conf);
+ }
+ else
+ {
+ _FPKM_variance = 0.0;
+ ConfidenceInterval conf(0.0, 0.0);
+ FPKM_conf(conf);
+ }
+ }
+ else
+ {
+ double sum_transfrag_FPKM_hi = 0;
+ double max_fpkm = 0.0;
+ //double min_fpkm = 1e100;
+
+ double avg_X_g = 0.0;
+ double avg_mass_fraction = 0.0;
+
+ int N = _abundances.size();
+
+ vector<double> avg_mass_variances(_abundances.size(), 0.0);
+
+ for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
+ itr != _count_per_replicate.end();
+ ++itr)
+ {
+ boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
+ double scaled_mass = itr->second;
+ double scaled_total_mass = rg_props->normalized_map_mass();
+ avg_X_g += scaled_mass;
+ boost::shared_ptr<MassDispersionModel const> disperser = rg_props->mass_dispersion_model();
+ for (size_t j = 0; j < N; ++j)
+ {
+ double scaled_variance;
+ //scaled_variance = disperser->scale_mass_variance(scaled_mass * _abundances[j]->gamma());
+ scaled_variance = _abundances[j]->gamma() * disperser->scale_mass_variance(scaled_mass);
+ avg_mass_variances[j] += scaled_variance;
+ }
+ assert (disperser->scale_mass_variance(scaled_mass) != 0 || scaled_mass == 0);
+
+ //assert (scaled_total_mass != 0.0);
+ avg_mass_fraction += (scaled_mass / scaled_total_mass);
+ }
+
+ double num_replicates = _count_per_replicate.size();
+
+ if (num_replicates)
+ {
+ avg_X_g /= num_replicates;
+ avg_mass_fraction /= num_replicates;
+ for (size_t j = 0; j < N; ++j)
+ {
+ avg_mass_variances[j] /= num_replicates;
+ }
+ }
+
+ BOOST_FOREACH(boost::shared_ptr<Abundance> pA, _abundances)
+ {
+ double FPKM_hi;
+ double FPKM_lo;
+ if (pA->effective_length() > 0 && avg_mass_fraction > 0)
+ {
+ double norm_frag_density = 1000000000;
+ norm_frag_density /= pA->effective_length();
+
+ norm_frag_density *= avg_mass_fraction;
+ double fpkm_high = norm_frag_density;
+
+ double total_mass = (num_fragments() / avg_mass_fraction);
+ double fpkm_constant = 1000000000 / pA->effective_length() / total_mass;
+ double var_fpkm = mass_variance() * (fpkm_constant * fpkm_constant);
+
+ FPKM_hi = fpkm_high + 2 * sqrt(var_fpkm);
+ FPKM_lo = 0.0;
+ ConfidenceInterval conf(FPKM_lo, FPKM_hi);
+ //assert (FPKM_lo <= pA->FPKM() && pA->FPKM() <= FPKM_hi);
+ pA->FPKM_conf(conf);
+ //pA->FPKM_variance(var_fpkm);
+ max_fpkm = max(sum_transfrag_FPKM_hi, FPKM_hi);
+ }
+ else
+ {
+ FPKM_hi = 0.0;
+ FPKM_lo = 0.0;
+ ConfidenceInterval conf(0.0, 0.0);
+ pA->FPKM_conf(conf);
+ //pA->FPKM_variance(0.0);
+ }
+
+ }
+ // In the case of a numeric failure, the group's error bars need to be
+ // set such that they span from 0 up to max_fpkm + 2 * sqrt(FPKM_variance()).
+ FPKM_conf(ConfidenceInterval(0.0, max_fpkm + 2 * sqrt(FPKM_variance())));
+ }
+}
+
+
+/*
+void AbundanceGroup::calculate_conf_intervals()
+{
for (size_t j = 0; j < _abundances.size(); ++j)
{
double FPKM_hi = 0.0;
double FPKM_lo = numeric_limits<double>::max();
const vector<double> ab_j_samples = _abundances[j]->fpkm_samples();
vector<pair<double, double> > fpkm_samples;
+ double target_fpkm = _abundances[j]->FPKM();
for (size_t i = 0; i < ab_j_samples.size(); ++i)
fpkm_samples.push_back(make_pair(abs(ab_j_samples[i] - _abundances[j]->FPKM()), ab_j_samples[i]));
@@ -2796,7 +3006,15 @@ void AbundanceGroup::calculate_conf_intervals()
FPKM_hi = fpkm_samples[i].second;
}
+ if (fpkm_samples.size() > 0 && FPKM_lo > target_fpkm)
+ {
+ fprintf(stderr, "Warning: transcript confidence interval lower bound is > FPKM\n");
+ }
+ if (fpkm_samples.size() > 0 && FPKM_hi < target_fpkm)
+ {
+ fprintf(stderr, "Warning: transcript confidence interval upper bound is < FPKM\n");
+ }
ConfidenceInterval conf(FPKM_lo, FPKM_hi);
_abundances[j]->FPKM_conf(conf);
@@ -2804,7 +3022,7 @@ void AbundanceGroup::calculate_conf_intervals()
double FPKM_hi = 0.0;
double FPKM_lo = numeric_limits<double>::max();
- const vector<double> ab_j_samples = _fpkm_samples;
+ const vector<double>& ab_j_samples = _fpkm_samples;
vector<pair<double, double> > fpkm_samples;
for (size_t i = 0; i < ab_j_samples.size(); ++i)
fpkm_samples.push_back(make_pair(abs(ab_j_samples[i] - FPKM()), ab_j_samples[i]));
@@ -2819,15 +3037,28 @@ void AbundanceGroup::calculate_conf_intervals()
FPKM_hi = fpkm_samples[i].second;
}
+ double target_fpkm = FPKM();
+
+ if (fpkm_samples.size() > 0 && FPKM_lo > target_fpkm)
+ {
+ fprintf(stderr, "Warning: group confidence interval lower bound is > FPKM\n");
+ }
+
+ if (fpkm_samples.size() > 0 && FPKM_hi < target_fpkm)
+ {
+ fprintf(stderr, "Warning: group confidence interval upper bound is < FPKM\n");
+ }
+
ConfidenceInterval conf(FPKM_lo, FPKM_hi);
FPKM_conf(conf);
return;
}
+*/
void compute_cond_probs_and_effective_lengths(const vector<MateHit>& alignments,
- vector<shared_ptr<Abundance> >& transcripts,
- vector<shared_ptr<Abundance> >& mapped_transcripts)
+ vector<boost::shared_ptr<Abundance> >& transcripts,
+ vector<boost::shared_ptr<Abundance> >& mapped_transcripts)
{
int N = transcripts.size();
int M = alignments.size();
@@ -2837,7 +3068,7 @@ void compute_cond_probs_and_effective_lengths(const vector<MateHit>& alignments,
for (int j = 0; j < N; ++j)
{
- shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
+ boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
vector<double>& cond_probs = *(new vector<double>(M,0));
BiasCorrectionHelper bch(transfrag);
@@ -2877,13 +3108,13 @@ double trace(const ublas::matrix<double>& m)
// the cluster. Needs refactoring
bool AbundanceGroup::calculate_gammas(const vector<MateHit>& nr_alignments,
const vector<double>& log_conv_factors,
- const vector<shared_ptr<Abundance> >& transcripts,
- const vector<shared_ptr<Abundance> >& mapped_transcripts)
+ const vector<boost::shared_ptr<Abundance> >& transcripts,
+ const vector<boost::shared_ptr<Abundance> >& mapped_transcripts)
{
if (mapped_transcripts.empty())
{
//gammas = vector<double>(transfrags.size(), 0.0);
- BOOST_FOREACH (shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
{
ab->gamma(0);
}
@@ -2926,14 +3157,14 @@ bool AbundanceGroup::calculate_gammas(const vector<MateHit>& nr_alignments,
locus_mass += alignment.collapse_mass();
}
- vector<shared_ptr<Abundance> > filtered_transcripts = mapped_transcripts;
+ vector<boost::shared_ptr<Abundance> > filtered_transcripts = mapped_transcripts;
vector<double> filtered_gammas = gammas;
filter_junk_isoforms(filtered_transcripts, filtered_gammas, mapped_transcripts, locus_mass);
if (filtered_transcripts.empty())
{
//gammas = vector<double>(transfrags.size(), 0.0);
- BOOST_FOREACH (shared_ptr<Abundance> ab, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
{
ab->gamma(0);
}
@@ -2971,7 +3202,7 @@ bool AbundanceGroup::calculate_gammas(const vector<MateHit>& nr_alignments,
size_t N = transcripts.size();
- set<shared_ptr<ReadGroupProperties const> > rg_props;
+ set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
for (size_t i = 0; i < nr_alignments.size(); ++i)
{
rg_props.insert(nr_alignments[i].read_group_props());
@@ -3009,13 +3240,13 @@ bool AbundanceGroup::calculate_gammas(const vector<MateHit>& nr_alignments,
updated_fpkm_cov = ublas::zero_matrix<double>(N, N);
size_t cfs = 0;
- shared_ptr<Scaffold> curr_filtered_scaff = filtered_transcripts[cfs]->transfrag();
+ boost::shared_ptr<Scaffold> curr_filtered_scaff = filtered_transcripts[cfs]->transfrag();
StructurallyEqualScaffolds se;
vector<size_t> scaff_present(N, N);
for (size_t i = 0; i < N; ++i)
{
- shared_ptr<Scaffold> scaff_i = transcripts[i]->transfrag();
+ boost::shared_ptr<Scaffold> scaff_i = transcripts[i]->transfrag();
if (cfs < filtered_transcripts.size())
{
curr_filtered_scaff = filtered_transcripts[cfs]->transfrag();
@@ -3191,7 +3422,7 @@ void calculate_average_assignment_probs(const Eigen::VectorXd& alignment_multipl
void calculate_iterated_exp_count_covariance(const vector<double>& gammas,
const vector<MateHit>& nr_alignments,
- const vector<shared_ptr<Abundance> >& transcripts,
+ const vector<boost::shared_ptr<Abundance> >& transcripts,
ublas::matrix<double>& count_covariance)
{
// Now calculate the _iterated_exp_count_covariance matrix via iterated expectation
@@ -3334,7 +3565,7 @@ void AbundanceGroup::calculate_kappas()
//tss_group.sub_quants = vector<QuantGroup>(isos_in_tss);
double S_FPKM = 0.0;
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
if (pA->effective_length() > 0)
{
@@ -3343,7 +3574,7 @@ void AbundanceGroup::calculate_kappas()
}
//fprintf (stderr, "*********\n");
- BOOST_FOREACH (shared_ptr<Abundance> pA, _abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
{
if (S_FPKM > 0)
{
@@ -3437,14 +3668,14 @@ void AbundanceGroup::calculate_kappas()
}
}
-void get_alignments_from_scaffolds(const vector<shared_ptr<Abundance> >& abundances,
+void get_alignments_from_scaffolds(const vector<boost::shared_ptr<Abundance> >& abundances,
vector<MateHit>& alignments)
{
set<const MateHit*> hits_in_gene_set;
- BOOST_FOREACH(shared_ptr<Abundance> pA, abundances)
+ BOOST_FOREACH(boost::shared_ptr<Abundance> pA, abundances)
{
- shared_ptr<Scaffold> pS = pA->transfrag();
+ boost::shared_ptr<Scaffold> pS = pA->transfrag();
assert (pS);
hits_in_gene_set.insert(pS->mate_hits().begin(),
pS->mate_hits().end());
@@ -3682,7 +3913,7 @@ double EM(int N, int M,
return newEll;
}
-void compute_fisher(const vector<shared_ptr<Abundance> >& transcripts,
+void compute_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts,
const ublas::vector<double>& abundances,
const vector<MateHit>& alignments,
const vector<double>& u,
@@ -3806,33 +4037,33 @@ AbundanceStatus compute_posterior_expectation(const vector<ublas::vector<double>
return NUMERIC_OK;
}
-AbundanceStatus AbundanceGroup::calculate_per_replicate_abundances(vector<shared_ptr<Abundance> >& transcripts,
- const map<shared_ptr<ReadGroupProperties const >, vector<MateHit> >& alignments_per_read_group,
- std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
+AbundanceStatus AbundanceGroup::calculate_per_replicate_abundances(vector<boost::shared_ptr<Abundance> >& transcripts,
+ const map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >& alignments_per_read_group,
+ std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
bool perform_collapse)
{
- for(std::set<shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
+ for(std::set<boost::shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
itr != _read_group_props.end();
++itr)
{
- vector<shared_ptr<Abundance> > new_transcripts;
- BOOST_FOREACH(shared_ptr<Abundance> ab, transcripts)
+ vector<boost::shared_ptr<Abundance> > new_transcripts;
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, transcripts)
{
boost::shared_ptr<TranscriptAbundance> d = boost::static_pointer_cast<TranscriptAbundance>(ab);
- //new_transcripts.push_back(shared_ptr<Abundance>(new TranscriptAbundance(*boost::static_pointer_cast<TranscriptAbundance>(ab))));
+ //new_transcripts.push_back(boost::shared_ptr<Abundance>(new TranscriptAbundance(*boost::static_pointer_cast<TranscriptAbundance>(ab))));
TranscriptAbundance* pT = new TranscriptAbundance;
pT->transfrag(d->transfrag());
- shared_ptr<Abundance> ab_new(pT);
+ boost::shared_ptr<Abundance> ab_new(pT);
ab_new->description(ab_new->description());
ab_new->locus_tag("");
new_transcripts.push_back(ab_new);
}
- shared_ptr<AbundanceGroup> ab_group(new AbundanceGroup(new_transcripts));
- std::set<shared_ptr<ReadGroupProperties const > > rg_props;
+ boost::shared_ptr<AbundanceGroup> ab_group(new AbundanceGroup(new_transcripts));
+ std::set<boost::shared_ptr<ReadGroupProperties const > > rg_props;
rg_props.insert(*itr);
ab_group->init_rg_props(rg_props);
- map<shared_ptr<ReadGroupProperties const >, vector<MateHit> >::const_iterator al_itr =
+ map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >::const_iterator al_itr =
alignments_per_read_group.find(*itr);
assert(al_itr != alignments_per_read_group.end());
@@ -3841,7 +4072,7 @@ AbundanceStatus AbundanceGroup::calculate_per_replicate_abundances(vector<shared
vector<MateHit> nr_alignments;
vector<MateHit> non_equiv_alignments;
vector<double> log_conv_factors;
- vector<shared_ptr<Abundance> > mapped_transcripts;
+ vector<boost::shared_ptr<Abundance> > mapped_transcripts;
if (perform_collapse)
{
@@ -3858,8 +4089,6 @@ AbundanceStatus AbundanceGroup::calculate_per_replicate_abundances(vector<shared
log_conv_factors = vector<double>(nr_alignments.size(), 0);
}
- //rep_hit_counts.push_back(count_per_replicate.find(*itr)->second);
-
ab_group->calculate_abundance_for_replicate(non_equiv_alignments, false);
//fprintf (stderr, "FPKM = %lg\n", ab_group->FPKM());
@@ -3869,7 +4098,7 @@ AbundanceStatus AbundanceGroup::calculate_per_replicate_abundances(vector<shared
return NUMERIC_OK;
}
-AbundanceStatus calculate_inverse_fisher(const vector<shared_ptr<Abundance> >& transcripts,
+AbundanceStatus calculate_inverse_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& alignments,
const ublas::vector<double>& gamma_mean,
ublas::matrix<double>& inverse_fisher)
@@ -4033,7 +4262,7 @@ bool is_identifiable(M &m, PM &pm)
for (size_type i = 0; i < size; ++ i) {
matrix_column<M> mci (column (m, i));
matrix_row<M> mri (row (m, i));
- size_type i_norm_inf = i + index_norm_inf (project (mci, range (i, size1)));
+ size_type i_norm_inf = i + index_norm_inf (project (mci, boost::numeric::ublas::range (i, size1)));
if (m (i_norm_inf, i) != value_type/*zero*/()) {
if (i_norm_inf != i) {
pm (i) = i_norm_inf;
@@ -4041,17 +4270,17 @@ bool is_identifiable(M &m, PM &pm)
} else {
//BOOST_UBLAS_CHECK (pm (i) == i_norm_inf, external_logic ());
}
- project (mci, range (i + 1, size1)) *= value_type (1) / m (i, i);
+ project (mci, boost::numeric::ublas::range (i + 1, size1)) *= value_type (1) / m (i, i);
} else if (singular == 0) {
singular = i + 1;
}
- project (m, range (i + 1, size1), range (i + 1, size2)).minus_assign (outer_prod (project (mci, range (i + 1, size1)),
- project (mri, range (i + 1, size2))));
+ project (m, boost::numeric::ublas::range (i + 1, size1), boost::numeric::ublas::range (i + 1, size2)).minus_assign (outer_prod (project (mci, boost::numeric::ublas::range (i + 1, size1)),
+ project (mri, boost::numeric::ublas::range (i + 1, size2))));
}
return singular == 0;
}
-AbundanceStatus gamma_mle(const vector<shared_ptr<Abundance> >& transcripts,
+AbundanceStatus gamma_mle(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& nr_alignments,
const vector<double>& log_conv_factors,
vector<double>& gammas,
@@ -4515,4 +4744,418 @@ double get_scaffold_min_doc(int bundle_origin,
return min_doc;
}
+void tss_analysis(const string& locus_tag, SampleAbundances& sample)
+{
+ // Cluster transcripts by start site (TSS)
+ vector<AbundanceGroup> transcripts_by_tss;
+
+ ublas::matrix<double> tss_gamma_cov;
+ ublas::matrix<double> tss_count_cov;
+ ublas::matrix<double> tss_iterated_exp_count_cov;
+ ublas::matrix<double> tss_fpkm_cov;
+ vector<Eigen::VectorXd> tss_assigned_counts;
+
+ vector<bool> mask(sample.transcripts.abundances().size(), true);
+ for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
+ {
+ if (*(sample.transcripts.abundances()[i]->tss_id().begin()) == "")
+ {
+ mask[i] = false;
+ }
+ }
+
+ AbundanceGroup trans_with_tss;
+ sample.transcripts.filter_group(mask, trans_with_tss);
+
+ cluster_transcripts<ConnectByAnnotatedTssId>(trans_with_tss,
+ transcripts_by_tss,
+ &tss_gamma_cov,
+ &tss_iterated_exp_count_cov,
+ &tss_count_cov,
+ &tss_fpkm_cov);
+
+
+ BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_tss)
+ {
+ ab_group.locus_tag(locus_tag);
+ set<string> tss_ids = ab_group.tss_id();
+ assert (tss_ids.size() == 1);
+ string desc = *(tss_ids.begin());
+ assert (desc != "");
+ ab_group.description(*(tss_ids.begin()));
+ }
+
+ sample.primary_transcripts = transcripts_by_tss;
+
+ // Group TSS clusters by gene
+ vector<boost::shared_ptr<Abundance> > primary_transcript_abundances;
+ set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
+ BOOST_FOREACH (AbundanceGroup& ab_group, sample.primary_transcripts)
+ {
+ primary_transcript_abundances.push_back(boost::shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
+ rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
+ }
+
+ AbundanceGroup primary_transcripts(primary_transcript_abundances,
+ tss_gamma_cov,
+ tss_iterated_exp_count_cov,
+ tss_count_cov,
+ tss_fpkm_cov,
+ rg_props);
+
+ vector<AbundanceGroup> primary_transcripts_by_gene;
+
+ cluster_transcripts<ConnectByAnnotatedGeneId>(primary_transcripts,
+ primary_transcripts_by_gene);
+
+ BOOST_FOREACH(AbundanceGroup& ab_group, primary_transcripts_by_gene)
+ {
+ ab_group.locus_tag(locus_tag);
+ set<string> gene_ids = ab_group.gene_id();
+ if (gene_ids.size() > 1)
+ {
+ BOOST_FOREACH (string st, gene_ids)
+ {
+ fprintf(stderr, "%s\n", st.c_str());
+ }
+ ab_group.gene_id();
+ }
+ assert (gene_ids.size() == 1);
+ ab_group.description(*(gene_ids.begin()));
+ }
+
+ sample.gene_primary_transcripts = primary_transcripts_by_gene;
+}
+
+void cds_analyis(const string& locus_tag, SampleAbundances& sample)
+{
+ // Cluster transcripts by CDS
+ vector<AbundanceGroup> transcripts_by_cds;
+ ublas::matrix<double> cds_gamma_cov;
+ ublas::matrix<double> cds_count_cov;
+ ublas::matrix<double> cds_iterated_exp_count_cov;
+ ublas::matrix<double> cds_fpkm_cov;
+
+ vector<bool> mask(sample.transcripts.abundances().size(), true);
+ for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
+ {
+ if (*(sample.transcripts.abundances()[i]->protein_id().begin()) == "")
+ {
+ mask[i] = false;
+ }
+ }
+
+ AbundanceGroup trans_with_p_id;
+ sample.transcripts.filter_group(mask, trans_with_p_id);
+
+ cluster_transcripts<ConnectByAnnotatedProteinId>(trans_with_p_id,
+ transcripts_by_cds,
+ &cds_gamma_cov,
+ &cds_iterated_exp_count_cov,
+ &cds_count_cov,
+ &cds_fpkm_cov);
+
+ BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_cds)
+ {
+ ab_group.locus_tag(locus_tag);
+ set<string> protein_ids = ab_group.protein_id();
+ assert (protein_ids.size() == 1);
+ string desc = *(protein_ids.begin());
+ //if (desc != "")
+ //{
+ assert (desc != "");
+ ab_group.description(*(protein_ids.begin()));
+ //}
+ }
+
+ sample.cds = transcripts_by_cds;
+
+ // Group the CDS clusters by gene
+ vector<boost::shared_ptr<Abundance> > cds_abundances;
+
+ set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
+ BOOST_FOREACH (AbundanceGroup& ab_group, sample.cds)
+ {
+ //if (ab_group.description() != "")
+ {
+ cds_abundances.push_back(boost::shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
+ rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
+ }
+ }
+ AbundanceGroup cds(cds_abundances,
+ cds_gamma_cov,
+ cds_iterated_exp_count_cov,
+ cds_count_cov,
+ cds_fpkm_cov,
+ rg_props);
+
+ vector<AbundanceGroup> cds_by_gene;
+
+ cluster_transcripts<ConnectByAnnotatedGeneId>(cds,
+ cds_by_gene);
+
+ BOOST_FOREACH(AbundanceGroup& ab_group, cds_by_gene)
+ {
+ ab_group.locus_tag(locus_tag);
+ set<string> gene_ids = ab_group.gene_id();
+ assert (gene_ids.size() == 1);
+ ab_group.description(*(gene_ids.begin()));
+ }
+
+ sample.gene_cds = cds_by_gene;
+
+}
+
+void sample_abundance_worker(const string& locus_tag,
+ const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props,
+ SampleAbundances& sample,
+ boost::shared_ptr<HitBundle> sample_bundle,
+ bool perform_cds_analysis,
+ bool perform_tss_analysis,
+ bool calculate_variance)
+{
+ vector<boost::shared_ptr<Abundance> > abundances;
+
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> s, sample_bundle->ref_scaffolds())
+ {
+ TranscriptAbundance* pT = new TranscriptAbundance;
+ pT->transfrag(s);
+ boost::shared_ptr<Abundance> ab(pT);
+ ab->description(s->annotated_trans_id());
+ ab->locus_tag(locus_tag);
+ abundances.push_back(ab);
+ }
+
+ sample.transcripts = AbundanceGroup(abundances);
+
+ sample.transcripts.init_rg_props(rg_props);
+
+ vector<MateHit> hits_in_cluster;
+
+ if (sample_bundle->hits().size() < (size_t)max_frags_per_bundle)
+ {
+ get_alignments_from_scaffolds(sample.transcripts.abundances(),
+ hits_in_cluster);
+
+ // Compute the individual transcript FPKMs via each sample's
+ // AbundanceGroup for this locus.
+
+ sample.transcripts.calculate_abundance(hits_in_cluster, true, calculate_variance);
+ }
+ else
+ {
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, abundances)
+ {
+ ab->status(NUMERIC_HI_DATA);
+
+ CountPerReplicateTable cpr;
+ FPKMPerReplicateTable fpr;
+ StatusPerReplicateTable spr;
+ for (set<boost::shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg_props.begin();
+ itr != rg_props.end();
+ ++itr)
+ {
+ cpr[*itr] = 0;
+ fpr[*itr] = 0;
+ spr[*itr] = NUMERIC_HI_DATA;
+ }
+ ab->num_fragments_by_replicate(cpr);
+ ab->FPKM_by_replicate(fpr);
+ ab->status_by_replicate(spr);
+ }
+ }
+
+ // Cluster transcripts by gene_id
+ vector<AbundanceGroup> transcripts_by_gene_id;
+ cluster_transcripts<ConnectByAnnotatedGeneId>(sample.transcripts,
+ transcripts_by_gene_id);
+
+ BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_gene_id)
+ {
+ ab_group.locus_tag(locus_tag);
+ set<string> gene_ids = ab_group.gene_id();
+ assert (gene_ids.size() == 1);
+ ab_group.description(*(gene_ids.begin()));
+ }
+
+ sample.genes = transcripts_by_gene_id;
+
+ if (perform_cds_analysis)
+ {
+ cds_analyis(locus_tag, sample);
+ }
+
+ if (perform_tss_analysis)
+ {
+ tss_analysis(locus_tag, sample);
+ }
+}
+
+// Applies library size factors to pre-computed (unnormalized) expression entries.
+//
+// For every (read group, abundance group) entry of unnormalized_ab_group_per_replicate
+// a copy of the group is made and then:
+//   * each member's fragment count is divided by the read group's internal scale factor,
+//   * each member's FPKM is divided by the same factor and multiplied by the ratio of
+//     the stored group's normalized map mass to the read group's normalized map mass,
+//   * the count / FPKM covariance matrices are rescaled to match,
+//   * each member's mass variance is recomputed from the read group's mass dispersion
+//     model, evaluated at the total rescaled mass and weighted by the member's gamma.
+// The copy is stored in normalized_ab_group_per_replicate under the same read group.
+//
+// Entries whose abundance group pointer is null are skipped; no output entry is
+// produced for them.
+void AbundanceGroup::apply_normalization_to_abundances(const map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> >& unnormalized_ab_group_per_replicate,
+                                                       map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> >& normalized_ab_group_per_replicate)
+{
+    typedef map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> > UnnormalizedTable;
+
+    for (UnnormalizedTable::const_iterator itr = unnormalized_ab_group_per_replicate.begin();
+         itr != unnormalized_ab_group_per_replicate.end(); ++itr)
+    {
+        // A replicate without a stored group cannot be normalized; skip it rather
+        // than dereferencing a null pointer below.
+        if (!itr->second)
+            continue;
+
+        boost::shared_ptr<AbundanceGroup> norm_ab(new AbundanceGroup(*itr->second));
+        boost::shared_ptr<const ReadGroupProperties> rg_props = itr->first;
+        boost::shared_ptr<const MassDispersionModel> disp_model = rg_props->mass_dispersion_model();
+
+        // The first read group recorded inside the stored group supplies the map
+        // mass the stored FPKMs are corrected from.
+        assert (!itr->second->rg_props().empty());
+        boost::shared_ptr<const ReadGroupProperties> old_rg_props = *(itr->second->rg_props().begin());
+
+        const double fpkm_correction_factor = old_rg_props->normalized_map_mass() / rg_props->normalized_map_mass();
+        const double internal_scale_factor = rg_props->internal_scale_factor();
+        const double count_cov_scale = internal_scale_factor * internal_scale_factor;
+
+        double total_mass = 0.0;
+
+        for (size_t i = 0; i < norm_ab->_abundances.size(); ++i)
+        {
+            norm_ab->_abundances[i]->num_fragments(itr->second->_abundances[i]->num_fragments() / internal_scale_factor);
+
+            total_mass += norm_ab->_abundances[i]->num_fragments();
+
+            norm_ab->_abundances[i]->FPKM(fpkm_correction_factor * itr->second->_abundances[i]->FPKM() / internal_scale_factor);
+        }
+
+        // Rescale the covariance matrices exactly once per group. Doing this inside
+        // the member loop would compound the factor once per member.
+        norm_ab->_iterated_exp_count_covariance = norm_ab->iterated_count_cov() / count_cov_scale;
+        norm_ab->_fpkm_covariance = norm_ab->_fpkm_covariance * (fpkm_correction_factor * fpkm_correction_factor) / count_cov_scale;
+        norm_ab->_count_covariance = norm_ab->_count_covariance / count_cov_scale;
+
+        double locus_mass_variance = disp_model->scale_mass_variance(total_mass);
+
+        for (size_t i = 0; i < norm_ab->_abundances.size(); ++i)
+        {
+            norm_ab->_abundances[i]->mass_variance(locus_mass_variance * norm_ab->_abundances[i]->gamma());
+        }
+
+        normalized_ab_group_per_replicate[itr->first] = norm_ab;
+    }
+}
+
+// Builds the abundances of one locus for a sample from expression values that were
+// pre-computed per replicate, rather than from alignments.
+//
+//   locus_tag             label applied to every transcript and gene-level group
+//   expression_factories  one factory per replicate; each supplies the replicate's
+//                         read group properties and its stored group for the locus
+//   sample                output; sample.transcripts and sample.genes are filled in
+//   sample_bundle         supplies the locus id, the reference scaffolds and the hits
+//   perform_cds_analysis  when true, cds_analyis() is run on the sample
+//   perform_tss_analysis  when true, tss_analysis() is run on the sample
+//   calculate_variance    when true, the group variance is computed from replicates
+//
+// A replicate whose factory has no entry for the locus is reported on stderr and
+// left out of the merge.
+void merge_precomputed_expression_worker(const string& locus_tag,
+                                         const vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> >& expression_factories,
+                                         SampleAbundances& sample,
+                                         boost::shared_ptr<HitBundle> sample_bundle,
+                                         bool perform_cds_analysis,
+                                         bool perform_tss_analysis,
+                                         bool calculate_variance)
+{
+    typedef boost::shared_ptr<const ReadGroupProperties> RGPropsPtr;
+    typedef map<RGPropsPtr, boost::shared_ptr<const AbundanceGroup> > ConstGroupTable;
+    typedef map<RGPropsPtr, boost::shared_ptr<AbundanceGroup> > GroupTable;
+
+    ConstGroupTable unnormalized_ab_group_per_replicate;
+    GroupTable normalized_ab_group_per_replicate;
+    ConstGroupTable const_ab_group_per_replicate;
+
+    // Gather the stored abundance group of every replicate for this locus.
+    set<RGPropsPtr> rg_props;
+    for (size_t i = 0; i < expression_factories.size(); ++i)
+    {
+        boost::shared_ptr<PrecomputedExpressionBundleFactory> pBundleFac = expression_factories[i];
+        boost::shared_ptr<const PrecomputedExpressionHitFactory> pHitFac = dynamic_pointer_cast<const PrecomputedExpressionHitFactory> (pBundleFac->hit_factory());
+        assert (pHitFac);
+
+        RGPropsPtr rg_prop = pBundleFac->read_group_properties();
+        rg_props.insert(rg_prop);
+
+        boost::shared_ptr<const AbundanceGroup> ab = pBundleFac->get_abundance_for_locus(sample_bundle->id());
+        pBundleFac->clear_abundance_for_locus(sample_bundle->id());
+        if (!ab)
+        {
+            fprintf(stderr, "Error: no bundle with id %d in precomputed expression file\n", sample_bundle->id());
+            // Nothing to merge for this replicate. Storing the null pointer would
+            // only get it dereferenced during normalization.
+            continue;
+        }
+        else if(ab->abundances().size() != sample_bundle->ref_scaffolds().size())
+        {
+            fprintf(stderr, "Error: bad bundle merge %s != %s\n", ab->description().c_str(), locus_tag.c_str());
+        }
+        unnormalized_ab_group_per_replicate[rg_prop] = ab;
+    }
+
+    AbundanceGroup::apply_normalization_to_abundances(unnormalized_ab_group_per_replicate, normalized_ab_group_per_replicate);
+
+    // The aggregation routines take pointers-to-const, so re-key the normalized
+    // groups into a table of that type.
+    for (GroupTable::const_iterator itr = normalized_ab_group_per_replicate.begin();
+         itr != normalized_ab_group_per_replicate.end(); ++itr)
+    {
+        const_ab_group_per_replicate[itr->first] = itr->second;
+    }
+
+    // One fresh TranscriptAbundance per reference scaffold of the bundle.
+    vector<boost::shared_ptr<Abundance> > abundances;
+
+    BOOST_FOREACH(boost::shared_ptr<Scaffold> s, sample_bundle->ref_scaffolds())
+    {
+        TranscriptAbundance* pT = new TranscriptAbundance;
+        pT->transfrag(s);
+        boost::shared_ptr<Abundance> ab(pT);
+        ab->description(s->annotated_trans_id());
+        ab->locus_tag(locus_tag);
+        abundances.push_back(ab);
+    }
+
+    sample.transcripts = AbundanceGroup(abundances);
+
+    sample.transcripts.init_rg_props(rg_props);
+
+    if (sample_bundle->hits().size() < (size_t)max_frags_per_bundle)
+    {
+        sample.transcripts.collect_per_replicate_mass(const_ab_group_per_replicate);
+        sample.transcripts.aggregate_replicate_abundances(const_ab_group_per_replicate);
+        if (calculate_variance)
+        {
+            sample.transcripts.calculate_abundance_group_variance(abundances, const_ab_group_per_replicate);
+        }
+    }
+    else // FIXME: THIS needs to do the right thing with sample.transcripts...
+    {
+        // Too many fragments in the bundle: flag every transcript as HI_DATA and
+        // give it zeroed per-replicate tables.
+        BOOST_FOREACH(boost::shared_ptr<Abundance> ab, abundances)
+        {
+            ab->status(NUMERIC_HI_DATA);
+
+            CountPerReplicateTable cpr;
+            FPKMPerReplicateTable fpr;
+            StatusPerReplicateTable spr;
+            for (set<RGPropsPtr>::const_iterator itr = rg_props.begin();
+                 itr != rg_props.end();
+                 ++itr)
+            {
+                cpr[*itr] = 0;
+                fpr[*itr] = 0;
+                spr[*itr] = NUMERIC_HI_DATA;
+            }
+            ab->num_fragments_by_replicate(cpr);
+            ab->FPKM_by_replicate(fpr);
+            ab->status_by_replicate(spr);
+        }
+    }
+
+    // Cluster transcripts by gene_id
+    vector<AbundanceGroup> transcripts_by_gene_id;
+    cluster_transcripts<ConnectByAnnotatedGeneId>(sample.transcripts,
+                                                  transcripts_by_gene_id);
+
+    BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_gene_id)
+    {
+        ab_group.locus_tag(locus_tag);
+        set<string> gene_ids = ab_group.gene_id();
+        assert (gene_ids.size() == 1);
+        ab_group.description(*(gene_ids.begin()));
+    }
+
+    sample.genes = transcripts_by_gene_id;
+
+    if (perform_cds_analysis)
+    {
+        cds_analyis(locus_tag, sample);
+    }
+
+    if (perform_tss_analysis)
+    {
+        tss_analysis(locus_tag, sample);
+    }
+}
+
+
+// Boost.Serialization registration for the abundance classes. Export of the base class is left disabled: BOOST_CLASS_EXPORT(Abundance)
+BOOST_CLASS_EXPORT(TranscriptAbundance) // registers the transcript-level class with Boost.Serialization
+BOOST_SERIALIZATION_SHARED_PTR(TranscriptAbundance); // shared_ptr serialization support for TranscriptAbundance
+BOOST_CLASS_EXPORT(AbundanceGroup); // registers the group-level class as well
+BOOST_SERIALIZATION_SHARED_PTR(AbundanceGroup); // shared_ptr serialization support for AbundanceGroup
+BOOST_SERIALIZATION_ASSUME_ABSTRACT(Abundance); // marks the base class abstract for Boost.Serialization; NOTE(review): abundances.h in this same commit carries the same macro - confirm both are needed
diff --git a/src/abundances.h b/src/abundances.h
index 7ae52f2..a3f8469 100644
--- a/src/abundances.h
+++ b/src/abundances.h
@@ -33,18 +33,28 @@ struct ConfidenceInterval
: low(Low), high(High) {}
double low;
double high;
+
+private:
+ friend std::ostream & operator<<(std::ostream &os, const ConfidenceInterval &gp);
+ friend class boost::serialization::access;
+    // Boost.Serialization hook: archives the lower bound, then the upper bound.
+    template<class Archive>
+    void serialize(Archive & ar, const unsigned int /* file_version */)
+    {
+        ar & low;
+        ar & high;
+    }
};
enum AbundanceStatus { NUMERIC_OK, NUMERIC_FAIL, NUMERIC_LOW_DATA, NUMERIC_HI_DATA };
-typedef map<shared_ptr<ReadGroupProperties const>, double> CountPerReplicateTable;
-typedef map<shared_ptr<ReadGroupProperties const>, double> FPKMPerReplicateTable;
-typedef map<shared_ptr<ReadGroupProperties const>, AbundanceStatus> StatusPerReplicateTable;
+typedef map<boost::shared_ptr<ReadGroupProperties const>, double> CountPerReplicateTable;
+typedef map<boost::shared_ptr<ReadGroupProperties const>, double> FPKMPerReplicateTable;
+typedef map<boost::shared_ptr<ReadGroupProperties const>, AbundanceStatus> StatusPerReplicateTable;
bool fit_negbin_dist(const vector<double> samples, double& r, double& p);
long double negbin_log_likelihood(const vector<double>& samples, long double r, long double p);
long double poisson_log_likelihood(const vector<double>& samples, long double lambda);
+namespace bser = boost::serialization;
+
class Abundance
{
public:
@@ -110,7 +120,7 @@ public:
virtual void cond_probs(vector<double>* cp) = 0;
// The structural information for the object, if defined.
- virtual shared_ptr<Scaffold> transfrag() const { return shared_ptr<Scaffold>(); }
+ virtual boost::shared_ptr<Scaffold> transfrag() const { return boost::shared_ptr<Scaffold>(); }
virtual const vector<double>& fpkm_samples() const = 0;
virtual void fpkm_samples(const vector<double>& s) = 0;
@@ -129,15 +139,25 @@ public:
virtual const string& reference_tag() const = 0;
virtual void reference_tag(const string& r) = 0;
+
+    // Boost.Serialization hook for the base class: nothing is archived here.
+    template<class Archive>
+    void serialize(Archive & /* ar */, const unsigned int /* file_version */)
+    {
+    }
+
+ virtual void clear_non_serialized_data() = 0;
};
+BOOST_SERIALIZATION_ASSUME_ABSTRACT(Abundance);
+
class TranscriptAbundance : public Abundance
{
public:
TranscriptAbundance() :
_status(NUMERIC_OK),
- _transfrag(shared_ptr<Scaffold>()),
+ _transfrag(boost::shared_ptr<Scaffold>()),
_FPKM(0),
_FPKM_variance(0),
_gamma(0),
@@ -147,8 +167,6 @@ public:
_num_fragment_uncertainty_var(0),
_eff_len(0),
_cond_probs(NULL),
- _fpkm_gamma_k(0.0),
- _fpkm_gamma_theta(0.0),
_sample_mass_variance(0.0){}
~TranscriptAbundance()
@@ -167,7 +185,8 @@ public:
void FPKM(double fpkm)
{
_FPKM = fpkm;
- _transfrag->fpkm(fpkm);
+ if (_transfrag)
+ _transfrag->fpkm(fpkm);
}
double FPKM_variance() const { return _FPKM_variance; }
void FPKM_variance(double v);
@@ -187,7 +206,8 @@ public:
{
assert (!isnan(nf));
_num_fragments = nf;
- _transfrag->num_fragments(nf);
+ if (_transfrag)
+ _transfrag->num_fragments(nf);
}
// This tracks the final modeled variance in the assigned counts.
@@ -211,8 +231,8 @@ public:
double mass_variance() const { return _sample_mass_variance; }
void mass_variance(double mv) { _sample_mass_variance = mv; }
- void transfrag(shared_ptr<Scaffold> tf) { _transfrag = tf; }
- shared_ptr<Scaffold> transfrag() const { return _transfrag; }
+ void transfrag(boost::shared_ptr<Scaffold> tf) { _transfrag = tf; }
+ boost::shared_ptr<Scaffold> transfrag() const { return _transfrag; }
double effective_length() const { return _eff_len; }
void effective_length(double el) { _eff_len = el; }
@@ -296,13 +316,39 @@ public:
virtual const string& reference_tag() const { return _ref_tag; }
virtual void reference_tag(const string& r) { _ref_tag = r; }
-
+ void clear_non_serialized_data();
private:
+ friend std::ostream & operator<<(std::ostream &os, const TranscriptAbundance &gp);
+ friend class boost::serialization::access;
+
+    // Boost.Serialization hook used for both saving and loading.  The order of the
+    // fields below is the archive layout, so it must stay as it is.  _transfrag and
+    // _cond_probs are not archived.
+    template<class Archive>
+    void serialize(Archive & ar, const unsigned int /* file_version */)
+    {
+        // Status and point estimates.
+        ar & _status
+           & _FPKM
+           & _FPKM_conf
+           & _gamma
+           & _kappa;
+
+        // Fragment counts, their variances and the effective length.
+        ar & _num_fragments
+           & _num_fragment_var
+           & _num_fragment_uncertainty_var
+           & _eff_len;
+
+        // Identifying strings and the sample mass variance.
+        ar & _description
+           & _locus_tag
+           & _ref_tag
+           & _sample_mass_variance;
+
+        // Per-replicate tables.
+        ar & _num_fragments_per_replicate
+           & _fpkm_per_replicate
+           & _status_per_replicate;
+    }
+
void calculate_FPKM_err_bar(double variance);
AbundanceStatus _status;
- shared_ptr<Scaffold> _transfrag;
+ boost::shared_ptr<Scaffold> _transfrag;
double _FPKM;
double _FPKM_variance;
ConfidenceInterval _FPKM_conf;
@@ -314,9 +360,6 @@ private:
double _eff_len;
vector<double>* _cond_probs;
- double _fpkm_gamma_k;
- double _fpkm_gamma_theta;
-
vector<double> _fpkm_samples;
string _description;
@@ -330,6 +373,9 @@ private:
StatusPerReplicateTable _status_per_replicate;
};
+//BOOST_CLASS_EXPORT_GUID(TranscriptAbundance, "TranscriptAbundance");
+//BOOST_SERIALIZATION_boost::shared_ptr(TranscriptAbundance)
+
class AbundanceGroup : public Abundance
{
public:
@@ -339,7 +385,7 @@ public:
_salient_frags(0.0),
_total_frags(0.0) {}
- AbundanceGroup(const vector<shared_ptr<Abundance> >& abundances) :
+ AbundanceGroup(const vector<boost::shared_ptr<Abundance> >& abundances) :
_abundances(abundances),
_iterated_exp_count_covariance(ublas::zero_matrix<double>(abundances.size(), abundances.size())),
_count_covariance(ublas::zero_matrix<double>(abundances.size(), abundances.size())),
@@ -351,12 +397,12 @@ public:
_salient_frags(0.0),
_total_frags(0.0) {}
- AbundanceGroup(const vector<shared_ptr<Abundance> >& abundances,
+ AbundanceGroup(const vector<boost::shared_ptr<Abundance> >& abundances,
const ublas::matrix<double>& gamma_covariance,
const ublas::matrix<double>& iterated_exp_count_covariance,
const ublas::matrix<double>& count_covariance,
const ublas::matrix<double>& fpkm_covariance,
- const std::set<shared_ptr<ReadGroupProperties const > >& rg_props);
+ const std::set<boost::shared_ptr<ReadGroupProperties const > >& rg_props);
AbundanceStatus status() const;
void status(AbundanceStatus s) { }
@@ -423,10 +469,10 @@ public:
void filter_group(const vector<bool>& to_keep,
AbundanceGroup& filtered_group) const;
- void get_transfrags(vector<shared_ptr<Abundance> >& transfrags) const;
+ void get_transfrags(vector<boost::shared_ptr<Abundance> >& transfrags) const;
- vector<shared_ptr<Abundance> >& abundances() { return _abundances; }
- const vector<shared_ptr<Abundance> >& abundances() const { return _abundances; }
+ vector<boost::shared_ptr<Abundance> >& abundances() { return _abundances; }
+ const vector<boost::shared_ptr<Abundance> >& abundances() const { return _abundances; }
const ublas::matrix<double>& gamma_cov() const { return _gamma_covariance; }
@@ -456,7 +502,8 @@ public:
void member_fpkm_samples(const vector<Eigen::VectorXd>& s) { _member_fpkm_samples = s; }
void calculate_abundance(const vector<MateHit>& alignments,
- bool perform_collapse = true);
+ bool perform_collapse = true,
+ bool calculate_variance = true);
double salient_frags() const { return _salient_frags; }
void salient_frags(double nf) { _salient_frags = nf; }
@@ -464,13 +511,13 @@ public:
double total_frags() const { return _total_frags; }
void total_frags(double nf) { _total_frags = nf; }
- const std::set<shared_ptr<ReadGroupProperties const> >& rg_props() const { return _read_group_props; }
+ const std::set<boost::shared_ptr<ReadGroupProperties const> >& rg_props() const { return _read_group_props; }
- void init_rg_props(const std::set<shared_ptr<ReadGroupProperties const> >& rg)
+ void init_rg_props(const std::set<boost::shared_ptr<ReadGroupProperties const> >& rg)
{
_read_group_props = rg;
_count_per_replicate.clear();
- for ( std::set<shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg.begin();
+ for ( std::set<boost::shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg.begin();
itr != rg.end();
++itr)
{
@@ -480,47 +527,130 @@ public:
void fit_gamma_distributions();
-private:
+ void clear_non_serialized_data();
+
+ void aggregate_replicate_abundances(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate);
+
+ void calculate_abundance_group_variance(const vector<boost::shared_ptr<Abundance> >& transcripts,
+ const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate);
+
+    // Rebuilds the per-replicate fragment count table: it is emptied and then given,
+    // for each read group in ab_group_per_replicate, the num_fragments() of that
+    // replicate's abundance group.
+    void collect_per_replicate_mass(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate)
+    {
+        typedef std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> > ReplicateTable;
+
+        _count_per_replicate.clear();
+
+        ReplicateTable::const_iterator rep = ab_group_per_replicate.begin();
+        while (rep != ab_group_per_replicate.end())
+        {
+            _count_per_replicate[rep->first] = rep->second->num_fragments();
+            ++rep;
+        }
+    }
- void aggregate_replicate_abundances(const std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate);
+ static void apply_normalization_to_abundances(const map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> >& unnormalized_ab_group_per_replicate,
+ map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> >& normalized_ab_group_per_replicate);
+
+private:
void calculate_abundance_for_replicate(const vector<MateHit>& alignments, bool perform_collapse);
- void calculate_abundance_group_variance(const vector<shared_ptr<Abundance> >& transcripts,
- const std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate);
-
+
void FPKM_conf(const ConfidenceInterval& cf) { _FPKM_conf = cf; }
bool calculate_gammas(const vector<MateHit>& nr_alignments,
const vector<double>& log_conv_factors,
- const vector<shared_ptr<Abundance> >& transcripts,
- const vector<shared_ptr<Abundance> >& mapped_transcripts);
+ const vector<boost::shared_ptr<Abundance> >& transcripts,
+ const vector<boost::shared_ptr<Abundance> >& mapped_transcripts);
void calculate_FPKM_covariance();
void calculate_conf_intervals();
- void calculate_locus_scaled_mass_and_variance(const vector<shared_ptr<Abundance> >& transcripts);
+ void calculate_locus_scaled_mass_and_variance(const vector<boost::shared_ptr<Abundance> >& transcripts);
- AbundanceStatus calculate_per_replicate_abundances(vector<shared_ptr<Abundance> >& transcripts,
- const std::map<shared_ptr<ReadGroupProperties const >, vector<MateHit> >& alignments_per_read_group,
- std::map<shared_ptr<ReadGroupProperties const >, shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
+ AbundanceStatus calculate_per_replicate_abundances(vector<boost::shared_ptr<Abundance> >& transcripts,
+ const std::map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >& alignments_per_read_group,
+ std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
bool perform_collapse = true);
void calculate_kappas();
- void update_multi_reads(const vector<MateHit>& alignments, vector<shared_ptr<Abundance> > transcripts);
+ void update_multi_reads(const vector<MateHit>& alignments, vector<boost::shared_ptr<Abundance> > transcripts);
void collect_per_replicate_mass(const vector<MateHit>& alignments,
- vector<shared_ptr<Abundance> >& transcripts);
+ vector<boost::shared_ptr<Abundance> >& transcripts);
void generate_fpkm_samples();
+ friend std::ostream & operator<<(std::ostream &os, const AbundanceGroup &gp);
+ friend class boost::serialization::access;
+
+    // Boost.Serialization "save" half of the split save()/load() pair.  The member
+    // abundances are written as a vector of raw TranscriptAbundance pointers instead
+    // of through _abundances itself.  The field order is the archive layout and has
+    // to match load().  Not archived: _FPKM_conf, _kappa_covariance, _assign_probs,
+    // _salient_frags, _fpkm_samples and _member_fpkm_samples.
+    template<class Archive>
+    void save(Archive & ar, const unsigned int version) const
+    {
+        vector<TranscriptAbundance*> member_ptrs;
+        BOOST_FOREACH(boost::shared_ptr<Abundance> member, _abundances)
+        {
+            member_ptrs.push_back(static_cast<TranscriptAbundance*>(&(*member)));
+        }
+        ar & member_ptrs;
+
+        // Covariance matrices.
+        ar & _iterated_exp_count_covariance
+           & _count_covariance
+           & _fpkm_covariance
+           & _gamma_covariance;
+
+        // Scalars and the description.
+        ar & _kappa
+           & _FPKM_variance
+           & _description
+           & _total_frags;
+
+        // Read group bookkeeping.
+        ar & _read_group_props
+           & _count_per_replicate;
+    }
+    // Boost.Serialization "load" half of the split save()/load() pair.  Reads the
+    // fields in the order save() wrote them.  The member abundances arrive as raw
+    // TranscriptAbundance pointers and are handed over to shared_ptr ownership.
+    // Not archived, and therefore left untouched here: _FPKM_conf,
+    // _kappa_covariance, _assign_probs, _salient_frags, _fpkm_samples and
+    // _member_fpkm_samples.
+    template<class Archive>
+    void load(Archive & ar, const unsigned int version)
+    {
+        vector<TranscriptAbundance*> loaded_members;
+        ar & loaded_members;
+
+        // Replace, rather than extend, the current member list so that it always
+        // matches the covariance matrices read below, even when loading into a
+        // group that already holds members.
+        _abundances.clear();
+        BOOST_FOREACH(TranscriptAbundance* ab, loaded_members)
+        {
+            _abundances.push_back(boost::shared_ptr<Abundance>(ab));
+        }
+
+        ar & _iterated_exp_count_covariance;
+        ar & _count_covariance;
+        ar & _fpkm_covariance;
+        ar & _gamma_covariance;
+
+        ar & _kappa;
+        ar & _FPKM_variance;
+        ar & _description;
+        ar & _total_frags;
+        ar & _read_group_props;
+        ar & _count_per_replicate;
+    }
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
+
//void collect_read_group_props();
- vector<shared_ptr<Abundance> > _abundances;
+ vector<boost::shared_ptr<Abundance> > _abundances;
// _iterated_exp_count_covariance is the ITERATED EXPECTATION count covariance matrix. It's not the
// estimated count covariance matrix (i.e. it doesn't include biological variability from
@@ -548,32 +678,44 @@ private:
vector<double> _fpkm_samples;
vector<Eigen::VectorXd> _member_fpkm_samples;
- std::set<shared_ptr<ReadGroupProperties const > > _read_group_props;
+ std::set<boost::shared_ptr<ReadGroupProperties const > > _read_group_props;
vector<Eigen::VectorXd> _assigned_count_samples;
- map<shared_ptr<ReadGroupProperties const>, double> _count_per_replicate;
- //std::map<shared_ptr<ReadGroupProperties const >, ublas::vector<double> > _mles_for_read_groups;
+ map<boost::shared_ptr<ReadGroupProperties const>, double> _count_per_replicate;
+ //std::map<boost::shared_ptr<ReadGroupProperties const >, ublas::vector<double> > _mles_for_read_groups;
+};
+
+struct SampleAbundances // per-sample abundance results for one locus, filled in by the *_worker functions
+{
+ string locus_tag; // presumably the label of the locus these results belong to - TODO confirm against callers
+ AbundanceGroup transcripts; // transcript-level group; merge_precomputed_expression_worker builds it with one member per reference scaffold
+ vector<AbundanceGroup> primary_transcripts; // NOTE(review): presumably filled by tss_analysis() - confirm
+ vector<AbundanceGroup> gene_primary_transcripts; // NOTE(review): presumably primary transcripts grouped per gene - confirm
+ vector<AbundanceGroup> cds; // NOTE(review): presumably filled by cds_analyis() - confirm
+ vector<AbundanceGroup> gene_cds; // NOTE(review): presumably CDS groups collected per gene - confirm
+ vector<AbundanceGroup> genes; // the transcripts clustered by annotated gene id, one group per gene
+ double cluster_mass; // NOTE(review): no initializer here; make sure it is assigned before it is read
};
void compute_cond_probs_and_effective_lengths(const vector<MateHit>& alignments,
- vector<shared_ptr<Abundance> >& transcripts,
- vector<shared_ptr<Abundance> >& mapped_transcripts);
+ vector<boost::shared_ptr<Abundance> >& transcripts,
+ vector<boost::shared_ptr<Abundance> >& mapped_transcripts);
-void compute_compatibilities(const vector<shared_ptr<Abundance> >& transcripts,
+void compute_compatibilities(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& alignments,
vector<vector<char> >& compatibilities);
-void get_alignments_from_scaffolds(const vector<shared_ptr<Abundance> >& abundances,
+void get_alignments_from_scaffolds(const vector<boost::shared_ptr<Abundance> >& abundances,
vector<MateHit>& alignments);
-AbundanceStatus empirical_replicate_gammas(vector<shared_ptr<Abundance> >& transcripts,
+AbundanceStatus empirical_replicate_gammas(vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& nr_alignments,
const vector<double>& log_conv_factors,
ublas::vector<double>& gamma_map_estimate,
ublas::matrix<double>& gamma_map_covariance,
- std::map<shared_ptr<ReadGroupProperties const >, ublas::vector<double> >& mles_for_read_groups);
+ std::map<boost::shared_ptr<ReadGroupProperties const >, ublas::vector<double> >& mles_for_read_groups);
-AbundanceStatus gamma_mle(const vector<shared_ptr<Abundance> >& transcripts,
+AbundanceStatus gamma_mle(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& nr_alignments,
const vector<double>& log_conv_factors,
vector<double>& gammas,
@@ -618,7 +760,7 @@ double get_scaffold_min_doc(int bundle_origin,
const Scaffold& s,
const vector<float>& depth_of_coverage);
-AbundanceStatus calculate_inverse_fisher(const vector<shared_ptr<Abundance> >& transcripts,
+AbundanceStatus calculate_inverse_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts,
const vector<MateHit>& alignments,
const ublas::vector<double>& gamma_mean,
ublas::matrix<double>& inverse_fisher);
@@ -635,14 +777,30 @@ void calculate_average_assignment_probs(const Eigen::VectorXd& alignment_multipl
void calculate_iterated_exp_count_covariance(const vector<double>& gammas,
const vector<MateHit>& nr_alignments,
- const vector<shared_ptr<Abundance> >& transcripts,
+ const vector<boost::shared_ptr<Abundance> >& transcripts,
ublas::matrix<double>& count_covariance);
bool simulate_count_covariance(const vector<double>& num_fragments,
const vector<double>& frag_variances,
const ublas::matrix<double>& iterated_exp_count_covariances,
- const vector<shared_ptr<Abundance> >& transcripts,
+ const vector<boost::shared_ptr<Abundance> >& transcripts,
ublas::matrix<double>& count_covariances,
vector<Eigen::VectorXd>& assigned_count_samples,
vector<ublas::vector<double> >* gamma_samples);
+
+void sample_abundance_worker(const string& locus_tag,
+ const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props,
+ SampleAbundances& sample,
+ boost::shared_ptr<HitBundle> sample_bundle,
+ bool perform_cds_analysis,
+ bool perform_tss_analysis,
+ bool calculate_variance);
+
+void merge_precomputed_expression_worker(const string& locus_tag,
+ const vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> >& expression_factories,
+ SampleAbundances& sample,
+ boost::shared_ptr<HitBundle> sample_bundle,
+ bool perform_cds_analysis,
+ bool perform_tss_analysis,
+ bool calculate_variance);
#endif
diff --git a/src/assemble.h b/src/assemble.h
index 0accd39..4bb89f1 100644
--- a/src/assemble.h
+++ b/src/assemble.h
@@ -23,7 +23,7 @@
class BiasLearner;
-bool assemble_hits(BundleFactory& bundle_factory, shared_ptr<BiasLearner> bl_ptr);
+bool assemble_hits(BundleFactory& bundle_factory, boost::shared_ptr<BiasLearner> bl_ptr);
//bool intron_compatible(const MateHit& lhs, const MateHit& rhs);
bool read_hits_overlap(const ReadHit* lhs, const ReadHit* rhs);
diff --git a/src/biascorrection.cpp b/src/biascorrection.cpp
index 852330c..e44a0ae 100644
--- a/src/biascorrection.cpp
+++ b/src/biascorrection.cpp
@@ -65,7 +65,7 @@ void ones(ublas::matrix<long double>& A)
A(i,j) = 1;
}
-void get_compatibility_list(const vector<shared_ptr<Scaffold> >& transcripts,
+void get_compatibility_list(const vector<boost::shared_ptr<Scaffold> >& transcripts,
const vector<MateHit>& alignments,
vector<list<int> >& compatibilities)
{
@@ -155,7 +155,7 @@ const int BiasLearner::_n = 64; //Length of maximum connection in VLMM
const int BiasLearner::lengthBins[] = {791,1265,1707,2433}; //Quantiles derived from human mRNA length distribution in UCSC genome browser
const double BiasLearner::positionBins[] = {.02,.04,.06,.08,.10,.15,.2,.3,.4,.5,.6,.7,.8,.85,.9,.92,.94,.96,.98,1};
-BiasLearner::BiasLearner(shared_ptr<EmpDist const> frag_len_dist)
+BiasLearner::BiasLearner(boost::shared_ptr<EmpDist const> frag_len_dist)
{
paramTypes = vlmmSpec;
if (bias_mode==SITE || bias_mode==POS_SITE)
@@ -615,7 +615,7 @@ void BiasLearner::output(FILE* output_file, const string& condition_name, int re
-int BiasCorrectionHelper::add_read_group(shared_ptr<ReadGroupProperties const> rgp)
+int BiasCorrectionHelper::add_read_group(boost::shared_ptr<ReadGroupProperties const> rgp)
{
int trans_len = _transcript->length();
_rg_index.insert(make_pair(rgp, _size));
@@ -625,7 +625,7 @@ int BiasCorrectionHelper::add_read_group(shared_ptr<ReadGroupProperties const> r
vector<double> end_bias(trans_len+1, 1.0);
double eff_len = 0.0;
- shared_ptr<EmpDist const> fld = rgp->frag_len_dist();
+ boost::shared_ptr<EmpDist const> fld = rgp->frag_len_dist();
vector<double> tot_bias_for_len(trans_len+1, 0);
vector<double> start_bias_for_len(trans_len+1, 0);
@@ -677,9 +677,9 @@ int BiasCorrectionHelper::add_read_group(shared_ptr<ReadGroupProperties const> r
}
int num_adds = 0;
-int BiasCorrectionHelper::get_index(shared_ptr<ReadGroupProperties const> rgp)
+int BiasCorrectionHelper::get_index(boost::shared_ptr<ReadGroupProperties const> rgp)
{
- boost::unordered_map<shared_ptr<ReadGroupProperties const>, int>::iterator iter;
+ boost::unordered_map<boost::shared_ptr<ReadGroupProperties const>, int>::iterator iter;
iter = _rg_index.find(rgp);
if (iter==_rg_index.end()) //This rg is not yet in the index, so add it.
@@ -694,7 +694,7 @@ int BiasCorrectionHelper::get_index(shared_ptr<ReadGroupProperties const> rgp)
// Hit needs to be from the collapsed (non_redundant) list to match indexing
double BiasCorrectionHelper::get_cond_prob(const MateHit& hit)
{
- shared_ptr<ReadGroupProperties const> rgp = hit.read_group_props();
+ boost::shared_ptr<ReadGroupProperties const> rgp = hit.read_group_props();
int i = get_index(rgp);
@@ -705,7 +705,7 @@ double BiasCorrectionHelper::get_cond_prob(const MateHit& hit)
_transcript->map_frag(hit, start, end, frag_len);
- shared_ptr<const EmpDist> fld = rgp->frag_len_dist();
+ boost::shared_ptr<const EmpDist> fld = rgp->frag_len_dist();
double cond_prob = 1.0;
cond_prob *= _start_biases[i][start];
@@ -815,7 +815,7 @@ double BiasCorrectionHelper::get_effective_length()
if (tot_mass==0)
return _transcript->length();
- for (boost::unordered_map<shared_ptr<ReadGroupProperties const>, int>::iterator itr = _rg_index.begin();
+ for (boost::unordered_map<boost::shared_ptr<ReadGroupProperties const>, int>::iterator itr = _rg_index.begin();
itr != _rg_index.end();
++itr)
{
diff --git a/src/biascorrection.h b/src/biascorrection.h
index 43b0df9..962426d 100644
--- a/src/biascorrection.h
+++ b/src/biascorrection.h
@@ -60,7 +60,7 @@ class BiasLearner{
#if ENABLE_THREADS
boost::mutex _bl_lock;
#endif
-
+
public:
BiasLearner(boost::shared_ptr<EmpDist const> frag_len_dist);
diff --git a/src/bundles.cpp b/src/bundles.cpp
index b6d99f2..c9e42d1 100644
--- a/src/bundles.cpp
+++ b/src/bundles.cpp
@@ -11,18 +11,21 @@
#include <map>
#include <numeric>
#include <boost/math/distributions/binomial.hpp>
+#include <boost/crc.hpp>
#include "common.h"
#include "bundles.h"
#include "scaffolds.h"
+#include "abundances.h"
+
using namespace std;
using boost::math::binomial;
//struct ScaffoldSorter
//{
// ScaffoldSorter(RefSequenceTable& _rt) : rt(_rt) {}
-// bool operator()(shared_ptr<Scaffold const> lhs, shared_ptr<Scaffold const> rhs)
+// bool operator()(boost::shared_ptr<Scaffold const> lhs, boost::shared_ptr<Scaffold const> rhs)
// {
// assert (lhs);
// assert (rhs);
@@ -46,7 +49,7 @@ using boost::math::binomial;
struct ScaffoldSorter
{
ScaffoldSorter(RefSequenceTable& _rt) : rt(_rt) {}
- bool operator()(shared_ptr<Scaffold const> lhs, shared_ptr<Scaffold const> rhs)
+ bool operator()(boost::shared_ptr<Scaffold const> lhs, boost::shared_ptr<Scaffold const> rhs)
{
//assert (lhs);
//assert (rhs);
@@ -74,7 +77,8 @@ struct ScaffoldSorter
//FIXME: needs refactoring
void load_ref_rnas(FILE* ref_mRNA_file,
RefSequenceTable& rt,
- vector<shared_ptr<Scaffold> >& ref_mRNAs,
+ vector<boost::shared_ptr<Scaffold> >& ref_mRNAs,
+ boost::crc_32_type& gtf_crc_result,
bool loadSeqs,
bool loadFPKM)
{
@@ -97,7 +101,7 @@ void load_ref_rnas(FILE* ref_mRNA_file,
if (ref_mRNA_file)
{
gtf_tracking_verbose=cuff_verbose;
- read_transcripts(ref_mRNA_file, ref_rnas, true);
+ read_transcripts(ref_mRNA_file, ref_rnas, gtf_crc_result, true);
}
int last_gseq_id = -1;
@@ -221,14 +225,14 @@ void load_ref_rnas(FILE* ref_mRNA_file,
GFREE(rna_seq);
}
- shared_ptr<Scaffold> scaff(new Scaffold());
+ boost::shared_ptr<Scaffold> scaff(new Scaffold());
*scaff = ref_scaff;
assert (scaff);
ref_mRNAs.push_back(scaff);
}
}
- BOOST_FOREACH (shared_ptr<Scaffold> s, ref_mRNAs)
+ BOOST_FOREACH (boost::shared_ptr<Scaffold> s, ref_mRNAs)
{
assert (s);
}
@@ -268,7 +272,7 @@ bool HitBundle::add_hit(const MateHit& hit)
struct HitlessScaffold
{
- bool operator()(shared_ptr<Scaffold> x)
+ bool operator()(boost::shared_ptr<Scaffold> x)
{
return x->mate_hits().empty();
}
@@ -280,7 +284,7 @@ bool unmapped_hit(const MateHit& x)
}
-bool HitBundle::add_open_hit(shared_ptr<ReadGroupProperties const> rg_props,
+bool HitBundle::add_open_hit(boost::shared_ptr<ReadGroupProperties const> rg_props,
const ReadHit* bh,
bool expand_by_partner)
{
@@ -432,7 +436,7 @@ void HitBundle::finalize_open_mates()
void HitBundle::remove_hitless_scaffolds()
{
- vector<shared_ptr<Scaffold> >::iterator new_end = remove_if(_ref_scaffs.begin(),
+ vector<boost::shared_ptr<Scaffold> >::iterator new_end = remove_if(_ref_scaffs.begin(),
_ref_scaffs.end(),
HitlessScaffold());
_ref_scaffs.erase(new_end, _ref_scaffs.end());
@@ -518,20 +522,21 @@ void HitBundle::combine(const vector<HitBundle*>& in_bundles,
}
}
+ /*
// Merge ref scaffolds
indices = vector<size_t>(in_bundles.size(), 0);
while(true)
{
int next_bundle = -1;
- shared_ptr<Scaffold> next_scaff;
+ boost::shared_ptr<Scaffold> next_scaff;
for(size_t i = 0; i < in_bundles.size(); ++i)
{
- const vector<shared_ptr<Scaffold> >& curr_scaffs = in_bundles[i]->_ref_scaffs;
+ const vector<boost::shared_ptr<Scaffold> >& curr_scaffs = in_bundles[i]->_ref_scaffs;
if (indices[i] == curr_scaffs.size())
continue;
- shared_ptr<Scaffold> curr_scaff = curr_scaffs[indices[i]];
+ boost::shared_ptr<Scaffold> curr_scaff = curr_scaffs[indices[i]];
if (next_bundle == -1 || scaff_lt_rt_oplt(*curr_scaff, *next_scaff))
{
@@ -547,7 +552,23 @@ void HitBundle::combine(const vector<HitBundle*>& in_bundles,
out_bundle.add_ref_scaffold(next_scaff);
indices[next_bundle]++;
}
-
+ */
+
+ for (size_t i = 0; i < in_bundles.size(); ++i)
+ {
+ for (size_t j = 0; j < in_bundles[i]->ref_scaffolds().size(); ++j)
+ {
+ out_bundle.add_ref_scaffold(in_bundles[i]->ref_scaffolds()[j]);
+ }
+ }
+
+ sort(out_bundle._ref_scaffs.begin(), out_bundle._ref_scaffs.end(), scaff_lt_rt_oplt_sp);
+ vector<boost::shared_ptr<Scaffold> >::iterator new_end = unique(out_bundle._ref_scaffs.begin(),
+ out_bundle._ref_scaffs.end(),
+ StructurallyEqualScaffolds());
+ out_bundle._ref_scaffs.erase(new_end, out_bundle._ref_scaffs.end());
+ vector<boost::shared_ptr<Scaffold> >(out_bundle._ref_scaffs).swap(out_bundle._ref_scaffs);
+
out_bundle.finalize(true); // true means everything is already sorted, etc.
out_bundle._num_replicates = (int)in_bundles.size();
}
@@ -600,11 +621,11 @@ void HitBundle::finalize(bool is_combined)
}
sort(_ref_scaffs.begin(), _ref_scaffs.end(), scaff_lt_rt_oplt_sp);
- vector<shared_ptr<Scaffold> >::iterator new_end = unique(_ref_scaffs.begin(),
+ vector<boost::shared_ptr<Scaffold> >::iterator new_end = unique(_ref_scaffs.begin(),
_ref_scaffs.end(),
StructurallyEqualScaffolds());
_ref_scaffs.erase(new_end, _ref_scaffs.end());
- vector<shared_ptr<Scaffold> >(_ref_scaffs).swap(_ref_scaffs);
+ vector<boost::shared_ptr<Scaffold> >(_ref_scaffs).swap(_ref_scaffs);
}
for (size_t j = 0; j < _ref_scaffs.size(); ++j)
@@ -718,7 +739,7 @@ double BundleFactory::next_valid_alignment(const ReadHit*& bh)
(*next_mask_scaff)->ref_id() != tmp.ref_id())
{
bool found_scaff = false;
- vector<shared_ptr<Scaffold> >::iterator curr_mask_scaff = mask_gtf_recs.begin();
+ vector<boost::shared_ptr<Scaffold> >::iterator curr_mask_scaff = mask_gtf_recs.begin();
for (size_t i = 0; i < _mask_scaff_offsets.size(); ++i)
{
if (_mask_scaff_offsets[i].first == tmp.ref_id())
@@ -859,10 +880,11 @@ bool BundleFactory::next_bundle_ref_driven(HitBundle& bundle)
}
bundle.add_ref_scaffold(*next_ref_scaff);
+
++next_ref_scaff;
_expand_by_refs(bundle);
-
+
// The most recent RefID and position we've seen in the hit stream
RefID last_hit_ref_id_seen = 0;
int last_hit_pos_seen = 0;
@@ -1055,10 +1077,15 @@ bool BundleFactory::_expand_by_refs(HitBundle& bundle)
while(next_ref_scaff < ref_mRNAs.end())
{
assert(bundle.ref_id() != (*next_ref_scaff)->ref_id() || (*next_ref_scaff)->left() >= bundle.left());
+// if (*next_ref_scaff && (*next_ref_scaff)->annotated_gene_id() == "XLOC_009372")
+// {
+// int a = 5;
+// }
if (bundle.ref_id() == (*next_ref_scaff)->ref_id()
&& overlap_in_genome((*next_ref_scaff)->left(),(*next_ref_scaff)->right(),bundle.left(), bundle.right()))
{
- bundle.add_ref_scaffold(*next_ref_scaff++);
+ bundle.add_ref_scaffold(*next_ref_scaff);
+ next_ref_scaff++;
}
else
{
@@ -1125,22 +1152,26 @@ bool BundleFactory::next_bundle(HitBundle& bundle)
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_factory_lock);
#endif
+ bool got_bundle = false;
switch(_bundle_mode)
{
case HIT_DRIVEN:
_curr_bundle++;
- return next_bundle_hit_driven(bundle);
+ got_bundle = next_bundle_hit_driven(bundle);
+ bundle.id(_curr_bundle);
break;
case REF_DRIVEN:
_curr_bundle++;
- return next_bundle_ref_driven(bundle);
+ got_bundle = next_bundle_ref_driven(bundle);
+ bundle.id(_curr_bundle);
break;
case REF_GUIDED:
_curr_bundle++;
- return next_bundle_ref_guided(bundle);
+ got_bundle = next_bundle_ref_guided(bundle);
+ bundle.id(_curr_bundle);
break;
}
- return false;
+ return got_bundle;
}
@@ -1592,7 +1623,7 @@ bool BundleFactory::spans_bad_intron(const ReadHit& read)
return false;
}
-void inspect_map(BundleFactory& bundle_factory,
+void inspect_map(boost::shared_ptr<BundleFactory> bundle_factory,
BadIntronTable* bad_introns,
vector<LocusCount>& compatible_count_table,
vector<LocusCount>& total_count_table,
@@ -1602,7 +1633,7 @@ void inspect_map(BundleFactory& bundle_factory,
ProgressBar p_bar;
if (progress_bar)
- p_bar = ProgressBar("Inspecting reads and determining fragment length distribution.",bundle_factory.ref_table().size());
+ p_bar = ProgressBar("Inspecting reads and determining fragment length distribution.",bundle_factory->ref_table().size());
RefID last_chrom = 0;
long double map_mass = 0.0;
@@ -1624,13 +1655,13 @@ void inspect_map(BundleFactory& bundle_factory,
int max_1 = 0;
int max_2 = 0;
- shared_ptr<MultiReadTable> mrt(new MultiReadTable());
+ boost::shared_ptr<MultiReadTable> mrt(new MultiReadTable());
while(true)
{
HitBundle* bundle_ptr = new HitBundle();
- bool valid_bundle = bundle_factory.next_bundle(*bundle_ptr);
+ bool valid_bundle = bundle_factory->next_bundle(*bundle_ptr);
HitBundle& bundle = *bundle_ptr;
if (use_compat_mass) //only count hits that are compatible with ref transcripts
@@ -1661,15 +1692,25 @@ void inspect_map(BundleFactory& bundle_factory,
exit(1);
}
- const RefSequenceTable& rt = bundle_factory.ref_table();
+ const RefSequenceTable& rt = bundle_factory->ref_table();
const char* chrom = rt.get_name(bundle.ref_id());
char bundle_label_buf[2048];
if (chrom)
{
sprintf(bundle_label_buf, "%s:%d-%d", chrom, bundle.left(), bundle.right());
verbose_msg("Inspecting bundle %s with %lu reads\n", bundle_label_buf, bundle.hits().size());
- compatible_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.compatible_mass()), bundle.ref_scaffolds().size()));
- total_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.raw_mass()), bundle.ref_scaffolds().size()));
+
+ vector<string> gene_ids;
+ vector<string> gene_short_names;
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle.ref_scaffolds())
+ {
+ if (s->annotated_gene_id() != "")
+ gene_ids.push_back(s->annotated_gene_id());
+ if (s->annotated_gene_name() != "")
+ gene_short_names.push_back(s->annotated_gene_name());
+ }
+ compatible_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.compatible_mass()), bundle.ref_scaffolds().size(), gene_ids, gene_short_names));
+ total_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.raw_mass()), bundle.ref_scaffolds().size(), gene_ids, gene_short_names));
}
if (!valid_bundle)
@@ -1750,7 +1791,7 @@ void inspect_map(BundleFactory& bundle_factory,
// Annotation provided and single isoform gene
{
int start, end, mate_length;
- shared_ptr<Scaffold> scaff = bundle.ref_scaffolds()[0];
+ boost::shared_ptr<Scaffold> scaff = bundle.ref_scaffolds()[0];
if (scaff->map_frag(hits[i], start, end, mate_length))
{
if (mate_length >= min_len && mate_length <= max_len)
@@ -1935,7 +1976,7 @@ void inspect_map(BundleFactory& bundle_factory,
std_dev = sqrt(std_dev);
- shared_ptr<ReadGroupProperties> rg_props = bundle_factory.read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = bundle_factory->read_group_properties();
FLDSource source = DEFAULT;
if (empirical)
@@ -1947,7 +1988,7 @@ void inspect_map(BundleFactory& bundle_factory,
source = USER;
}
- shared_ptr<EmpDist const> fld(new EmpDist(frag_len_pdf, frag_len_cdf, frag_len_mode, mean, std_dev, min_len, max_len, source));
+ boost::shared_ptr<EmpDist const> fld(new EmpDist(frag_len_pdf, frag_len_cdf, frag_len_mode, mean, std_dev, min_len, max_len, source));
rg_props->multi_read_table(mrt);
rg_props->frag_len_dist(fld);
rg_props->normalized_map_mass(norm_map_mass);
@@ -1987,7 +2028,77 @@ void inspect_map(BundleFactory& bundle_factory,
fprintf(stderr, ">\t Default Std Dev: %d\n", def_frag_len_std_dev);
}
}
- bundle_factory.num_bundles(num_bundles);
- bundle_factory.reset();
+ bundle_factory->num_bundles(num_bundles);
+ bundle_factory->reset();
return;
}
+
+//////////////////////
+
+
+bool PrecomputedExpressionBundleFactory::next_bundle(HitBundle& bundle)
+{
+#if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_factory_lock);
+#endif
+ bool got_bundle = BundleFactory::next_bundle(bundle);
+ if (got_bundle)
+ {
+ RefSequenceTable& rt = ref_table();
+
+ char bundle_label_buf[2048];
+ sprintf(bundle_label_buf, "%s:%d-%d", rt.get_name(bundle.ref_id()), bundle.left(), bundle.right());
+
+ boost::shared_ptr<const AbundanceGroup> ab = _hit_fac->next_locus(bundle.id());
+ if (ab)
+ {
+ double compatible_mass = _hit_fac->get_compat_mass(bundle_label_buf);
+ double total_mass = _hit_fac->get_total_mass(bundle_label_buf);
+
+ bundle.finalize();
+ bundle.add_raw_mass(total_mass);
+ bundle.compatible_mass(compatible_mass);
+
+ //fprintf (stderr, "Reconstituting bundle %s (%d) with mass %lf\n", bundle_label_buf, bundle.id(), compatible_mass);
+ if (bundle.ref_scaffolds().size() != ab->abundances().size())
+ {
+ fprintf (stderr, "Error in file %s: reconstituted expression bundle %s (%d transcripts) does not match GTF (%d transcripts):\n", read_group_properties()->file_path().c_str(), bundle_label_buf, ab->abundances().size(), bundle.ref_scaffolds().size());
+ fprintf(stderr, "Reconstituted:\n");
+ for (size_t i = 0; i < ab->abundances().size(); ++i)
+ {
+ fprintf(stderr, "%s\n", ab->abundances()[i]->description().c_str());
+ }
+ fprintf(stderr, "GTF:\n");
+ for (size_t i = 0; i < bundle.ref_scaffolds().size(); ++i)
+ {
+ fprintf(stderr, "%s\n", bundle.ref_scaffolds()[i]->annotated_trans_id().c_str());
+ }
+ exit(1);
+ }
+ }
+ else
+ {
+ fprintf (stderr, "Error: no abundance info for locus %s\n", bundle_label_buf);
+ }
+
+ }
+ return got_bundle;
+}
+
+boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionBundleFactory::get_abundance_for_locus(int locus_id)
+{
+#if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_factory_lock);
+#endif
+ return _hit_fac->get_abundance_for_locus(locus_id);
+}
+
+void PrecomputedExpressionBundleFactory::clear_abundance_for_locus(int locus_id)
+{
+#if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_factory_lock);
+#endif
+ _hit_fac->clear_abundance_for_locus(locus_id);
+}
+
+
diff --git a/src/bundles.h b/src/bundles.h
index dc38b5d..c1b5d64 100644
--- a/src/bundles.h
+++ b/src/bundles.h
@@ -56,8 +56,8 @@ public:
~HitBundle()
{
- vector<shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
- BOOST_FOREACH(shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
+ vector<boost::shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
+ BOOST_FOREACH(boost::shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
{
// This bundle and the factory that actually owns the ref_mRNAs
// are the only objects that should have access to these scaffolds
@@ -108,12 +108,14 @@ public:
return _compatible_mass;
}
+ void compatible_mass(double c) { _compatible_mass = c; }
+
void clear_hits()
{
_hits.clear();
_non_redundant.clear();
- vector<shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
- BOOST_FOREACH(shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
+ vector<boost::shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
+ BOOST_FOREACH(boost::shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
{
if (ref_scaff.use_count() <= 3)
{
@@ -132,8 +134,9 @@ public:
RefID ref_id() const {return _ref_id; }
int id() const { return _id; }
+ void id(int i) { _id = i; }
- void add_ref_scaffold(shared_ptr<Scaffold> scaff)
+ void add_ref_scaffold(boost::shared_ptr<Scaffold> scaff)
{
if (scaff->left() < _leftmost)
_leftmost = scaff->left();
@@ -143,11 +146,11 @@ public:
_ref_id = scaff->ref_id();
}
- vector<shared_ptr<Scaffold> >& ref_scaffolds() { return _ref_scaffs; }
+ vector<boost::shared_ptr<Scaffold> >& ref_scaffolds() { return _ref_scaffs; }
// Adds a Bowtie hit to the open hits buffer. The Bundle will handle turning
// the Bowtie hit into a properly mated Cufflinks hit record
- bool add_open_hit(shared_ptr<ReadGroupProperties const> rg_props,
+ bool add_open_hit(boost::shared_ptr<ReadGroupProperties const> rg_props,
const ReadHit* bh,
bool expand_by = true);
@@ -182,7 +185,7 @@ private:
int _rightmost;
std::vector<MateHit> _hits;
std::vector<MateHit> _non_redundant;
- std::vector<shared_ptr<Scaffold> > _ref_scaffs; // user-supplied reference annotations overlapping the bundle
+ std::vector<boost::shared_ptr<Scaffold> > _ref_scaffs; // user-supplied reference annotations overlapping the bundle
bool _final;
int _id;
RefID _ref_id;
@@ -199,7 +202,8 @@ private:
void load_ref_rnas(FILE* ref_mRNA_file,
RefSequenceTable& rt,
- vector<shared_ptr<Scaffold> >& ref_mRNAs,
+ vector<boost::shared_ptr<Scaffold> >& ref_mRNAs,
+ boost::crc_32_type& gtf_crc_result,
bool loadSeqs=false,
bool loadFPKM=false);
@@ -207,15 +211,14 @@ class BundleFactory
{
public:
- BundleFactory(shared_ptr<HitFactory> fac, BundleMode bm)
+ BundleFactory(boost::shared_ptr<HitFactory> fac, BundleMode bm)
: _hit_fac(fac), _bundle_mode(bm), _prev_pos(0), _prev_ref_id(0), _curr_bundle(0), _zeroone(rng)
{
- _rg_props = shared_ptr<ReadGroupProperties>(new ReadGroupProperties(fac->read_group_properties()));
-
-
-
+ _rg_props = boost::shared_ptr<ReadGroupProperties>(new ReadGroupProperties(fac->read_group_properties()));
}
+ boost::shared_ptr<const HitFactory> hit_factory() const { return _hit_fac; }
+
bool bundles_remain()
{
#if ENABLE_THREADS
@@ -224,7 +227,7 @@ public:
return _curr_bundle < num_bundles();
}
- bool next_bundle(HitBundle& bundle_out);
+ virtual bool next_bundle(HitBundle& bundle_out);
bool next_bundle_hit_driven(HitBundle& bundle_out);
bool next_bundle_ref_driven(HitBundle& bundle_out);
bool next_bundle_ref_guided(HitBundle& bundle_out);
@@ -247,7 +250,7 @@ public:
next_ref_scaff = ref_mRNAs.begin();
next_mask_scaff = mask_gtf_recs.begin();
- BOOST_FOREACH(shared_ptr<Scaffold> ref_scaff, ref_mRNAs)
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, ref_mRNAs)
{
ref_scaff->clear_hits();
}
@@ -258,19 +261,19 @@ public:
// This function NEEDS to deep copy the ref_mRNAs, otherwise cuffdiff'd
// samples will clobber each other
- void set_ref_rnas(const vector<shared_ptr<Scaffold> >& mRNAs)
+ void set_ref_rnas(const vector<boost::shared_ptr<Scaffold> >& mRNAs)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_factory_lock);
#endif
ref_mRNAs.clear();
- for (vector<shared_ptr<Scaffold> >::const_iterator i = mRNAs.begin(); i < mRNAs.end(); ++i)
+ for (vector<boost::shared_ptr<Scaffold> >::const_iterator i = mRNAs.begin(); i < mRNAs.end(); ++i)
{
- ref_mRNAs.push_back(shared_ptr<Scaffold>(new Scaffold(**i)));
+ ref_mRNAs.push_back(boost::shared_ptr<Scaffold>(new Scaffold(**i)));
}
RefID last_id = 0;
- for (vector<shared_ptr<Scaffold> >::iterator i = ref_mRNAs.begin(); i < ref_mRNAs.end(); ++i)
+ for (vector<boost::shared_ptr<Scaffold> >::iterator i = ref_mRNAs.begin(); i < ref_mRNAs.end(); ++i)
{
if ((*i)->ref_id() != last_id)
{
@@ -282,14 +285,14 @@ public:
next_ref_scaff = ref_mRNAs.begin();
}
- void set_mask_rnas(const vector<shared_ptr<Scaffold> >& masks)
+ void set_mask_rnas(const vector<boost::shared_ptr<Scaffold> >& masks)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_factory_lock);
#endif
mask_gtf_recs = masks;
RefID last_id = 0;
- for (vector<shared_ptr<Scaffold> >::iterator i = mask_gtf_recs.begin(); i < mask_gtf_recs.end(); ++i)
+ for (vector<boost::shared_ptr<Scaffold> >::iterator i = mask_gtf_recs.begin(); i < mask_gtf_recs.end(); ++i)
{
if ((*i)->ref_id() != last_id)
{
@@ -309,7 +312,7 @@ public:
_bad_introns = bad_introns;
}
- void read_group_properties(shared_ptr<ReadGroupProperties> rg)
+ void read_group_properties(boost::shared_ptr<ReadGroupProperties> rg)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_factory_lock);
@@ -317,7 +320,7 @@ public:
_rg_props = rg;
}
- shared_ptr<ReadGroupProperties> read_group_properties()
+ boost::shared_ptr<ReadGroupProperties> read_group_properties()
{
return _rg_props;
}
@@ -329,21 +332,21 @@ private:
bool _expand_by_hits(HitBundle& bundle);
bool _expand_by_refs(HitBundle& bundle);
- shared_ptr<HitFactory> _hit_fac;
+ boost::shared_ptr<HitFactory> _hit_fac;
- vector<shared_ptr<Scaffold> > ref_mRNAs;
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
//FILE* ref_mRNA_file;
- vector<pair<RefID, vector<shared_ptr<Scaffold> >::iterator> > _ref_scaff_offsets;
- vector<shared_ptr<Scaffold> >::iterator next_ref_scaff;
+ vector<pair<RefID, vector<boost::shared_ptr<Scaffold> >::iterator> > _ref_scaff_offsets;
+ vector<boost::shared_ptr<Scaffold> >::iterator next_ref_scaff;
- vector<shared_ptr<Scaffold> > mask_gtf_recs;
+ vector<boost::shared_ptr<Scaffold> > mask_gtf_recs;
//FILE* mask_file;
- vector<pair<RefID, vector<shared_ptr<Scaffold> >::iterator> > _mask_scaff_offsets;
- vector<shared_ptr<Scaffold> >::iterator next_mask_scaff;
+ vector<pair<RefID, vector<boost::shared_ptr<Scaffold> >::iterator> > _mask_scaff_offsets;
+ vector<boost::shared_ptr<Scaffold> >::iterator next_mask_scaff;
BadIntronTable _bad_introns;
- shared_ptr<ReadGroupProperties> _rg_props;
+ boost::shared_ptr<ReadGroupProperties> _rg_props;
// Sets nva to point to the next valid alignment
// Returns the mass of any alignments that are seen, valid or not
@@ -367,10 +370,32 @@ private:
#endif
};
+class PrecomputedExpressionBundleFactory : public BundleFactory
+{
+public:
+ PrecomputedExpressionBundleFactory(boost::shared_ptr<PrecomputedExpressionHitFactory> fac)
+ : BundleFactory(fac, REF_DRIVEN), _hit_fac(fac)
+ {
+
+ }
+
+ bool next_bundle(HitBundle& bundle_out);
+
+ boost::shared_ptr<const AbundanceGroup> get_abundance_for_locus(int locus_id);
+ void clear_abundance_for_locus(int locus_id);
+
+private:
+
+ boost::shared_ptr<PrecomputedExpressionHitFactory> _hit_fac;
+#if ENABLE_THREADS
+ boost::mutex _factory_lock;
+#endif
+};
+
void identify_bad_splices(const HitBundle& bundle,
BadIntronTable& bad_splice_ops);
-void inspect_map(BundleFactory& bundle_factory,
+void inspect_map(boost::shared_ptr<BundleFactory> bundle_factory,
BadIntronTable* bad_introns,
vector<LocusCount>& compatible_count_table,
vector<LocusCount>& total_count_table,
diff --git a/src/clustering.cpp b/src/clustering.cpp
index 73d1558..c402ac3 100644
--- a/src/clustering.cpp
+++ b/src/clustering.cpp
@@ -12,7 +12,7 @@
void ConnectByExonOverlap::operator()(const AbundanceGroup& cluster,
AbundanceGraph& G)
{
- const vector<shared_ptr<Abundance> >& abundances = cluster.abundances();
+ const vector<boost::shared_ptr<Abundance> >& abundances = cluster.abundances();
for (size_t i = 0; i < abundances.size(); ++i)
{
add_vertex(G);
@@ -20,12 +20,12 @@ void ConnectByExonOverlap::operator()(const AbundanceGroup& cluster,
for (size_t i = 0; i < abundances.size(); ++i)
{
- shared_ptr<Scaffold> scaff_i = abundances[i]->transfrag();
+ boost::shared_ptr<Scaffold> scaff_i = abundances[i]->transfrag();
assert (scaff_i);
for (size_t j = i + 1; j < abundances.size(); ++j)
{
- shared_ptr<Scaffold> scaff_j = abundances[j]->transfrag();
+ boost::shared_ptr<Scaffold> scaff_j = abundances[j]->transfrag();
assert (scaff_j);
if (Scaffold::exons_overlap(*scaff_i, *scaff_j))
@@ -37,7 +37,7 @@ void ConnectByExonOverlap::operator()(const AbundanceGroup& cluster,
void ConnectByAnnotatedGeneId::operator()(const AbundanceGroup& cluster,
AbundanceGraph& G)
{
- const vector<shared_ptr<Abundance> >& abundances = cluster.abundances();
+ const vector<boost::shared_ptr<Abundance> >& abundances = cluster.abundances();
for (size_t i = 0; i < abundances.size(); ++i)
{
add_vertex(G);
@@ -60,7 +60,7 @@ void ConnectByAnnotatedGeneId::operator()(const AbundanceGroup& cluster,
void ConnectByAnnotatedTssId::operator()(const AbundanceGroup& cluster,
AbundanceGraph& G)
{
- const vector<shared_ptr<Abundance> >& abundances = cluster.abundances();
+ const vector<boost::shared_ptr<Abundance> >& abundances = cluster.abundances();
for (size_t i = 0; i < abundances.size(); ++i)
{
add_vertex(G);
@@ -83,7 +83,7 @@ void ConnectByAnnotatedTssId::operator()(const AbundanceGroup& cluster,
void ConnectByAnnotatedProteinId::operator()(const AbundanceGroup& cluster,
AbundanceGraph& G)
{
- const vector<shared_ptr<Abundance> >& abundances = cluster.abundances();
+ const vector<boost::shared_ptr<Abundance> >& abundances = cluster.abundances();
for (size_t i = 0; i < abundances.size(); ++i)
{
add_vertex(G);
@@ -106,7 +106,7 @@ void ConnectByAnnotatedProteinId::operator()(const AbundanceGroup& cluster,
void ConnectByStrand::operator()(const AbundanceGroup& cluster,
AbundanceGraph& G)
{
- const vector<shared_ptr<Abundance> >& abundances = cluster.abundances();
+ const vector<boost::shared_ptr<Abundance> >& abundances = cluster.abundances();
for (size_t i = 0; i < abundances.size(); ++i)
{
add_vertex(G);
@@ -114,12 +114,12 @@ void ConnectByStrand::operator()(const AbundanceGroup& cluster,
for (size_t i = 0; i < abundances.size(); ++i)
{
- shared_ptr<Scaffold> scaff_i = abundances[i]->transfrag();
+ boost::shared_ptr<Scaffold> scaff_i = abundances[i]->transfrag();
assert (scaff_i);
for (size_t j = i + 1; j < abundances.size(); ++j)
{
- shared_ptr<Scaffold> scaff_j = abundances[j]->transfrag();
+ boost::shared_ptr<Scaffold> scaff_j = abundances[j]->transfrag();
assert (scaff_j);
if (scaff_i->strand() == scaff_j->strand())
{
diff --git a/src/common.cpp b/src/common.cpp
index 7c94762..b82515f 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -55,6 +55,8 @@ uint32_t max_gene_length = 3500000;
std::string ref_gtf_filename = "";
std::string mask_gtf_filename = "";
std::string contrast_filename = "";
+bool use_sample_sheet = false;
+std::string norm_standards_filename = "";
std::string output_dir = "./";
std::string fasta_dir;
string default_library_type = "fr-unstranded";
@@ -156,6 +158,13 @@ string default_lib_norm_method = "geometric";
string default_cufflinks_lib_norm_method = "classic-fpkm";
LibNormalizationMethod lib_norm_method = LIB_NORM_NOT_SET;
+boost::shared_ptr<const std::map<std::string, LibNormStandards> > lib_norm_standards;
+
+// Output format table for Cuffnorm:
+map<string, OutputFormat> output_format_table;
+string default_output_format = "simple-table"; // note: the default is only for cuffnorm, Cuffdiff always uses the cuffdiff format
+OutputFormat output_format = OUTPUT_FMT_NOT_SET;
+
#if ENABLE_THREADS
boost::thread_specific_ptr<std::string> bundle_label;
@@ -352,20 +361,6 @@ void init_library_table()
//global_read_properties = &(library_type_table.find(default_library_type)->second);
}
-//string get_dispersion_method_str(DispersionMethod disp_meth)
-//{
-// switch (disp_meth)
-// {
-// case POOLED:
-// return "pooled";
-// case PER_CONDITION:
-// return "per-condition";
-// case BLIND:
-// return "blind";
-// }
-// return "";
-//}
-
void print_library_table()
{
fprintf (stderr, "\nSupported library types:\n");
@@ -389,6 +384,7 @@ void init_dispersion_method_table()
dispersion_method_table["pooled"] = POOLED;
dispersion_method_table["blind"] = BLIND;
dispersion_method_table["per-condition"] = PER_CONDITION;
+ dispersion_method_table["poisson"] = POISSON;
}
void print_dispersion_method_table()
@@ -415,7 +411,6 @@ void init_lib_norm_method_table()
lib_norm_method_table["geometric"] = GEOMETRIC;
lib_norm_method_table["classic-fpkm"] = CLASSIC_FPKM;
lib_norm_method_table["quartile"] = QUARTILE;
- lib_norm_method_table["poisson"] = QUARTILE;
//lib_norm_method_table["tmm"] = TMM;
//lib_norm_method_table["absolute"] = ABSOLUTE;
}
@@ -423,7 +418,6 @@ void init_lib_norm_method_table()
void init_cufflinks_lib_norm_method_table()
{
lib_norm_method_table["classic-fpkm"] = CLASSIC_FPKM;
- lib_norm_method_table["poisson"] = QUARTILE;
//lib_norm_method_table["quartile"] = QUARTILE;
//lib_norm_method_table["absolute"] = ABSOLUTE;
}
@@ -447,6 +441,30 @@ void print_lib_norm_method_table()
}
}
+void init_output_format_table()
+{
+ output_format_table["cuffdiff"] = CUFFDIFF_OUTPUT_FMT;
+ output_format_table["simple-table"] = SIMPLE_TABLE_OUTPUT_FMT;
+}
+
+void print_output_format_table()
+{
+ fprintf (stderr, "\nSupported output formats:\n");
+ for (map<string, OutputFormat>::const_iterator itr = output_format_table.begin();
+ itr != output_format_table.end();
+ ++itr)
+ {
+ if (itr->first == default_output_format)
+ {
+ fprintf(stderr, "\t%s (default)\n", itr->first.c_str());
+ }
+ else
+ {
+ fprintf(stderr, "\t%s\n", itr->first.c_str());
+ }
+ }
+}
+
// c_seq is complement, *NOT* REVERSE complement
diff --git a/src/common.h b/src/common.h
index 67f6989..55f5341 100644
--- a/src/common.h
+++ b/src/common.h
@@ -21,11 +21,28 @@
#include <boost/math/distributions/normal.hpp>
using boost::math::normal;
+#include <boost/archive/tmpdir.hpp>
+
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/archive/binary_oarchive.hpp>
+
+#include <boost/serialization/base_object.hpp>
+#include <boost/serialization/utility.hpp>
+#include <boost/serialization/list.hpp>
+#include <boost/serialization/map.hpp>
+#include <boost/serialization/set.hpp>
+#include <boost/serialization/vector.hpp>
+#include <boost/serialization/assume_abstract.hpp>
+#include <boost/serialization/shared_ptr.hpp>
+#include <boost/serialization/export.hpp>
+
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/shared_ptr.hpp>
+#include <boost/crc.hpp>
+
// Non-option globals
extern bool final_est_run;
extern bool allow_junk_filtering;
@@ -48,6 +65,8 @@ extern uint32_t max_gene_length;
extern std::string ref_gtf_filename;
extern std::string mask_gtf_filename;
extern std::string contrast_filename;
+extern std::string norm_standards_filename;
+extern bool use_sample_sheet;
extern std::string output_dir;
extern std::string fasta_dir;
extern std::string library_type;
@@ -116,6 +135,7 @@ extern double min_outlier_p;
extern std::string default_dispersion_method;
extern std::string default_lib_norm_method;
extern std::string default_cufflinks_lib_norm_method;
+extern std::string default_output_format;
// SECRET OPTIONS:
// These options are just for instrumentation and benchmarking code
@@ -252,6 +272,14 @@ enum LibNormalizationMethod
ABSOLUTE // Requires spike-in controls, not yet implemented
};
+enum OutputFormat
+{
+ OUTPUT_FMT_NOT_SET,
+ CUFFDIFF_OUTPUT_FMT,
+ SIMPLE_TABLE_OUTPUT_FMT
+};
+
+
class EmpDist
{
//Vectors only valid between min and max!
@@ -264,6 +292,23 @@ class EmpDist
int _max;
FLDSource _source;
+ EmpDist() {}
+
+ friend std::ostream & operator<<(std::ostream &os, const EmpDist &gp);
+ friend class boost::serialization::access;
+
+ template<class Archive>
+ void serialize(Archive & ar, const unsigned int /* file_version */){
+ ar & _pdf;
+ ar & _cdf;
+ ar & _mode;
+ ar & _mean;
+ ar & _std_dev;
+ ar & _min;
+ ar & _max;
+ ar & _source;
+ }
+
public:
EmpDist(std::vector<double>& pdf, std::vector<double>& cdf, int mode, double mean, double std_dev, int min, int max, FLDSource source)
: _pdf(pdf), _cdf(cdf), _mode(mode), _mean(mean), _std_dev(std_dev), _min(min), _max(max), _source(source) {}
@@ -328,11 +373,125 @@ class MleErrorModel;
struct LocusCount
{
- LocusCount(std::string ld, double c, int nt) :
- locus_desc(ld), count(c), num_transcripts(nt) {}
+ LocusCount(std::string ld, double c, int nt, const std::vector<std::string>& gids, const std::vector<std::string>& gnms) :
+ locus_desc(ld), count(c), num_transcripts(nt), gene_ids(gids), gene_short_names(gnms) {}
std::string locus_desc;
double count;
int num_transcripts;
+ std::vector<std::string> gene_ids;
+ std::vector<std::string> gene_short_names;
+
+private:
+
+ LocusCount() {} //needs an empty constructor for serialization
+
+ friend std::ostream & operator<<(std::ostream &os, const LocusCount &gp);
+ friend class boost::serialization::access;
+
+ template<class Archive>
+ void serialize(Archive & ar, const unsigned int /* file_version */){
+ ar & locus_desc;
+ ar & count;
+ ar & num_transcripts;
+ ar & gene_ids;
+ ar & gene_short_names;
+ }
+};
+
+// This class stores user-supplied options that affect quantification
+// We'll serialize these into abundance files (i.e. CXB files)
+// so we can ensure that they're consistent across all samples
+// provided to cuffnorm and cuffdiff.
+struct CheckedParameters
+{
+ CheckedParameters() :
+ frag_len_mean(0.0),
+ frag_len_std_dev(0.0),
+ corr_bias(0.0),
+ frag_bias_mode(VLMM),
+ corr_multireads(false),
+ max_mle_iterations(false),
+ min_mle_accuracy(0.0),
+ max_bundle_frags(0.0),
+ max_frags_multihits(0.0),
+ no_effective_length_correction(false),
+ no_length_correction(false),
+ ref_gtf_file_path(""),
+ ref_gtf_crc(0),
+ mask_gtf_file_path(""),
+ mask_gtf_crc(0)
+ {} //needs an empty constructor for serialization
+
+ double frag_len_mean;
+ double frag_len_std_dev;
+
+ // TODO: add CRCs for reference GTF, mask file
+ bool corr_bias;
+
+ BiasMode frag_bias_mode;
+ bool corr_multireads;
+
+ double max_mle_iterations;
+ double min_mle_accuracy;
+
+ double max_bundle_frags;
+ double max_frags_multihits;
+
+ bool no_effective_length_correction;
+ bool no_length_correction;
+
+ std::string ref_gtf_file_path;
+ boost::crc_32_type::value_type ref_gtf_crc;
+
+ std::string mask_gtf_file_path;
+ boost::crc_32_type::value_type mask_gtf_crc;
+
+ friend std::ostream & operator<<(std::ostream &os, const CheckedParameters &gp);
+ friend class boost::serialization::access;
+
+ template<class Archive>
+ void serialize(Archive & ar, const unsigned int /* file_version */){
+ ar & frag_len_mean;
+ ar & frag_len_std_dev;
+ ar & corr_bias;
+ ar & frag_bias_mode;
+ ar & corr_multireads;
+ ar & max_mle_iterations;
+ ar & min_mle_accuracy;
+ ar & max_bundle_frags;
+ ar & max_frags_multihits;
+ ar & no_effective_length_correction;
+ ar & no_length_correction;
+ ar & ref_gtf_file_path;
+ ar & ref_gtf_crc;
+ ar & mask_gtf_file_path;
+ ar & mask_gtf_crc;
+ }
+
+ bool operator!=(const CheckedParameters& rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator==(const CheckedParameters& rhs) const
+ {
+ return (frag_len_mean == rhs.frag_len_mean &&
+ frag_len_std_dev == rhs.frag_len_std_dev &&
+ corr_bias == rhs.corr_bias &&
+ frag_bias_mode == rhs.frag_bias_mode &&
+ corr_multireads == rhs.corr_multireads &&
+ max_mle_iterations == rhs.max_mle_iterations &&
+ min_mle_accuracy == rhs.min_mle_accuracy &&
+ max_bundle_frags == rhs.max_bundle_frags &&
+ max_frags_multihits == rhs.max_frags_multihits &&
+ no_effective_length_correction == rhs.no_effective_length_correction &&
+ no_length_correction == rhs.no_length_correction &&
+ ref_gtf_file_path == rhs.ref_gtf_file_path &&
+ ref_gtf_crc == rhs.ref_gtf_crc &&
+ mask_gtf_file_path == rhs.mask_gtf_file_path &&
+ mask_gtf_crc == rhs.mask_gtf_crc);
+
+ }
+
};
class ReadGroupProperties
@@ -360,7 +519,7 @@ public:
void normalized_map_mass(long double p) { _norm_map_mass = p; }
boost::shared_ptr<EmpDist const> frag_len_dist() const { return _frag_len_dist; }
- void frag_len_dist(boost::shared_ptr<EmpDist const> p) { _frag_len_dist = p; }
+ void frag_len_dist(boost::shared_ptr<EmpDist const> p) { _frag_len_dist = p; }
boost::shared_ptr<BiasLearner const> bias_learner() const { return _bias_learner; }
void bias_learner(boost::shared_ptr<BiasLearner const> bl) { _bias_learner = bl; }
@@ -432,8 +591,77 @@ public:
int replicate_num() const { return _replicate_num; }
void replicate_num(int rn) { _replicate_num = rn; }
+ void ref_gtf(const std::string& file_path, const boost::crc_32_type& gtf_crc )
+ {
+ _checked_params.ref_gtf_file_path = file_path;
+ _checked_params.ref_gtf_crc = gtf_crc();
+ }
+
+ void mask_gtf(const std::string& file_path, const boost::crc_32_type& gtf_crc )
+ {
+ _checked_params.mask_gtf_file_path = file_path;
+ _checked_params.mask_gtf_crc = gtf_crc();
+ }
+
+
+ const CheckedParameters& checked_parameters() const { return _checked_params; }
+ void checked_parameters(const CheckedParameters& rhs) { _checked_params = rhs; }
+
+ // NOTE: this only picks up user-supplied options, not GTF files!
+ void collect_checked_parameters() {
+
+ _checked_params.frag_len_mean = def_frag_len_mean;
+ _checked_params.frag_len_std_dev = def_frag_len_std_dev;
+
+ // TODO: add CRCs for reference GTF, mask file, norm standards file if using.
+ _checked_params.corr_bias = corr_bias;
+
+ _checked_params.frag_bias_mode = bias_mode;
+ _checked_params.corr_multireads = corr_multi;
+
+ _checked_params.max_mle_iterations = max_mle_iterations;
+ _checked_params.min_mle_accuracy = mle_accuracy;
+
+ _checked_params.max_bundle_frags = max_frags_per_bundle;
+ _checked_params.max_frags_multihits = max_frag_multihits;
+
+ _checked_params.no_effective_length_correction = no_effective_length_correction;
+ _checked_params.no_length_correction = no_length_correction;
+ }
+
+
private:
+ friend std::ostream & operator<<(std::ostream &os, const ReadGroupProperties &gp);
+ friend class boost::serialization::access;
+
+ template<class Archive>
+ void serialize(Archive & ar, const unsigned int /* file_version */){
+ ar & _strandedness;
+ ar & _std_mate_orient;
+ ar & _mate_strand_mapping;
+ ar & _platform;
+ ar & _total_map_mass;
+ ar & _norm_map_mass;
+ ar & _frag_len_dist;
+ // TODO: probably should serialize the bias parameters somehow.
+ //ar & _bias_learner;
+ //ar & _multi_read_table; // we should never need this, I think.
+ ar & _internal_scale_factor;
+ ar & _external_scale_factor;
+ //ar & _mass_dispersion_model;
+ ar & _common_scale_compatible_counts;
+ ar & _common_scale_total_counts;
+ ar & _raw_compatible_counts;
+ ar & _raw_total_counts;
+ //ar & _mle_error_model;
+ ar & _complete_fragments;
+ ar & _condition_name;
+ ar & _file_path;
+ ar & _replicate_num;
+ ar & _checked_params;
+ }
+
Strandedness _strandedness;
StandardMateOrientation _std_mate_orient;
MateStrandMapping _mate_strand_mapping;
@@ -459,8 +687,12 @@ private:
std::string _condition_name;
std::string _file_path;
int _replicate_num;
+
+ CheckedParameters _checked_params;
};
+BOOST_SERIALIZATION_SHARED_PTR(ReadGroupProperties)
+
extern std::map<std::string, ReadGroupProperties> library_type_table;
extern const ReadGroupProperties* global_read_properties;
@@ -471,6 +703,10 @@ extern DispersionMethod dispersion_method;
extern std::map<std::string, LibNormalizationMethod> lib_norm_method_table;
extern LibNormalizationMethod lib_norm_method;
+extern std::map<std::string, OutputFormat> output_format_table;
+extern OutputFormat output_format;
+
+
void print_library_table();
void init_library_table();
@@ -481,6 +717,16 @@ void print_lib_norm_method_table();
void init_lib_norm_method_table();
void init_cufflinks_lib_norm_method_table();
+void print_output_format_table();
+void init_output_format_table();
+
+
+// Mapped value type of the lib_norm_standards table (keyed by string).
+// The struct currently declares no members.
+struct LibNormStandards
+{
+
+};
+
+extern boost::shared_ptr<const std::map<std::string, LibNormStandards> > lib_norm_standards;
template<typename T>
std::string cat_strings(const T& container, const char* delimiter=",")
@@ -564,5 +810,7 @@ std::string cat_strings(const T& container, const char* delimiter=",")
#define OPT_DISPERSION_METHOD 315
#define OPT_LIB_NORM_METHOD 316
#define OPT_NO_SCV_CORRECTION 317
-
+#define OPT_NORM_STANDARDS_FILE 318
+#define OPT_USE_SAMPLE_SHEET 319
+#define OPT_OUTPUT_FORMAT 320
#endif
diff --git a/src/compress_gtf.cpp b/src/compress_gtf.cpp
index fe8e1c4..7660e33 100644
--- a/src/compress_gtf.cpp
+++ b/src/compress_gtf.cpp
@@ -119,7 +119,7 @@ int parse_options(int argc, char** argv)
void compress_genes(FILE* ftranscripts,
RefSequenceTable& rt,
- vector<shared_ptr<Scaffold> >& ref_mRNAs)
+ vector<boost::shared_ptr<Scaffold> >& ref_mRNAs)
{
adjacency_list <vecS, vecS, undirectedS> G;
@@ -130,10 +130,10 @@ void compress_genes(FILE* ftranscripts,
for (size_t i = 0; i < ref_mRNAs.size(); ++i)
{
- shared_ptr<Scaffold> scaff_i = ref_mRNAs[i];
+ boost::shared_ptr<Scaffold> scaff_i = ref_mRNAs[i];
for (size_t j = 0; j < ref_mRNAs.size(); ++j)
{
- shared_ptr<Scaffold> scaff_j = ref_mRNAs[j];
+ boost::shared_ptr<Scaffold> scaff_j = ref_mRNAs[j];
if (scaff_i->annotated_gene_id() == scaff_j->annotated_gene_id())
add_edge(i, j, G);
}
@@ -147,7 +147,7 @@ void compress_genes(FILE* ftranscripts,
//vector<vector<size_t> > cluster_indices(three_prime_ends.size());
- vector<vector<shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
+ vector<vector<boost::shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
for (size_t i = 0; i < ref_mRNAs.size(); ++i)
{
clusters[component[i]][i] = true;
@@ -156,10 +156,10 @@ void compress_genes(FILE* ftranscripts,
for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
{
- vector<shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
+ vector<boost::shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
vector<Scaffold> gene_scaffs;
string gene_id;
- BOOST_FOREACH (shared_ptr<Scaffold> s, gene)
+ BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
{
if (gene_id == "")
gene_id = s->annotated_gene_id();
@@ -175,7 +175,7 @@ void compress_genes(FILE* ftranscripts,
Scaffold smashed_gene;
if (!proj_intersection && !proj_union)
{
- BOOST_FOREACH (shared_ptr<Scaffold> s, gene)
+ BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
{
/*
*transfrag,
@@ -224,7 +224,7 @@ void compress_genes(FILE* ftranscripts,
int gmax = -1;
int gmin = numeric_limits<int>::max();
- BOOST_FOREACH (shared_ptr<Scaffold> s, gene)
+ BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
{
//iso_ops.push_back(s->augmented_ops());
//sort (iso_ops.back().begin(), iso_ops.back().end());
@@ -234,7 +234,7 @@ void compress_genes(FILE* ftranscripts,
gmax = s->right();
}
- BOOST_FOREACH (shared_ptr<Scaffold> s, gene)
+ BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
{
if (s->left() > gmin)
{
@@ -344,19 +344,20 @@ void driver(vector<FILE*> ref_gtf_files, FILE* gtf_out)
ReadTable it;
RefSequenceTable rt(true, false);
- vector<vector<shared_ptr<Scaffold> > > ref_mRNA_table;
+ vector<vector<boost::shared_ptr<Scaffold> > > ref_mRNA_table;
vector<pair<string, vector<double> > > sample_count_table;
BOOST_FOREACH (FILE* ref_gtf, ref_gtf_files)
{
- vector<shared_ptr<Scaffold> > ref_mRNAs;
- ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, false, true);
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
+ boost::crc_32_type gtf_crc_result;
+ ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, gtf_crc_result, false, true);
ref_mRNA_table.push_back(ref_mRNAs);
}
for (size_t j = 0; j < ref_mRNA_table.size(); ++j)
{
- vector<shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
if (!raw_fpkm)
compress_genes(gtf_out, rt, ref_mRNAs);
diff --git a/src/cuffcompare.cpp b/src/cuffcompare.cpp
index 444d901..0cdfc9a 100644
--- a/src/cuffcompare.cpp
+++ b/src/cuffcompare.cpp
@@ -53,13 +53,18 @@ Options:\n\
-p the name prefix to use for consensus transcripts in the \n\
<outprefix>.combined.gtf file (default: 'TCONS')\n\
-C include the \"contained\" transcripts in the .combined.gtf file\n\
--G generic GFF input file(s) (do not assume Cufflinks GTF)\n\
+-F do not discard intron-redundant transfrags if they share the 5' end\n\
+ (if they differ only at the 3' end))\n\
+-G generic GFF input file(s): do not assume Cufflinks GTF, do not\n\
+ discard any intron-redundant transfrags)\n\
-T do not generate .tmap and .refmap files for each input file\n\
-V verbose processing mode (showing all GFF parsing warnings)\n\
"
bool debug=false;
bool perContigStats=false; // -S to enable stats for every single contig
-bool generic_GFF=false; //-G, don't assume Cufflinks GTF as input
+//bool generic_GFF=false;
+//true if -G: won't discard intron-redundant transfrags
+
bool showContained=false; // -C
bool reduceRefs=false; //-R
bool reduceQrys=false; //-Q
@@ -179,7 +184,7 @@ int main(int argc, char * const argv[]) {
HeapProfilerStart("./cuffcompare_dbg.hprof");
#endif
- GArgs args(argc, argv, "XDTMNVGSCKQRLhp:e:d:s:i:n:r:o:");
+ GArgs args(argc, argv, "XDTMNVFGSCKQRLhp:e:d:s:i:n:r:o:");
int e;
if ((e=args.isError())>0) {
show_usage();
@@ -266,15 +271,25 @@ int main(int argc, char * const argv[]) {
if (f_ref==NULL) GError("Error opening reference gff: %s\n",s.chars());
haveRefs=true;
if (gtf_tracking_verbose) GMessage("Loading reference transcripts..\n");
- read_mRNAs(f_ref, ref_data, &ref_data, true, -1, s.chars(), (multiexonrefs_only || multiexon_only));
+ read_mRNAs(f_ref, ref_data, &ref_data, 1, -1, s.chars(), (multiexonrefs_only || multiexon_only));
haveRefs=(ref_data.Count()>0);
reduceRefs=(args.getOpt('R')!=NULL);
reduceQrys=(args.getOpt('Q')!=NULL);
if (gtf_tracking_verbose) GMessage("..reference annotation loaded\n");
}
- bool discard_redundant=true; //discard redundant input transfrags
- generic_GFF=args.getOpt('G');
- if (generic_GFF) discard_redundant=false; //generic GTF, don't try to discard "redundant" transcripts
+ int discard_redundant=1; //discard intron-redundant input transfrags
+ //generic_GFF=args.getOpt('G');
+ if (args.getOpt('G')) discard_redundant=0; //generic GTF, don't try to discard "redundant" transcripts
+ if (args.getOpt('F')) {
+ if (discard_redundant==0) {
+ show_usage();
+ GMessage("Error: options -F and -G are mutually exclusive!\n");
+ exit(1);
+ }
+ else {
+ discard_redundant=2; // don't discard "redundant" transcripts if they start with the same 5' intron
+ }
+ }
//if a full pathname is given
//the other common output files will still be created in the current directory:
// .loci, .tracking, .stats
@@ -1273,17 +1288,18 @@ void processLoci(GSeqData& seqdata, GSeqData* refdata, int qfidx) {
//adjust stats for a list of unoverlapped (completely missed) ref loci
void collectRLocData(GSuperLocus& stats, GLocus& loc) {
-stats.total_rmrnas+=loc.mrnas.Count();
-stats.total_rexons+=loc.uexons.Count();
-stats.total_rintrons+=loc.introns.Count();
-stats.total_rmexons+=loc.mexons.Count();
-stats.total_richains+=loc.ichains;
-stats.m_exons+=loc.uexons.Count();
-stats.m_introns+=loc.introns.Count();
-stats.total_rloci++;
-for (int e=0;e<loc.mexons.Count();e++) {
- stats.rbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
- }
+ stats.total_rmrnas+=loc.mrnas.Count();
+ stats.total_rexons+=loc.uexons.Count();
+ stats.total_rintrons+=loc.introns.Count();
+ stats.total_rmexons+=loc.mexons.Count();
+ stats.total_richains+=loc.ichains;
+ stats.m_exons+=loc.uexons.Count();
+ stats.m_introns+=loc.introns.Count();
+ stats.total_rloci++;
+ stats.m_loci++; //missed ref loci
+ for (int e=0;e<loc.mexons.Count();e++) {
+ stats.rbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
+ }
}
void collectRData(GSuperLocus& stats, GList<GLocus>& loci) {
@@ -1299,6 +1315,7 @@ void collectQLocData(GSuperLocus& stats, GLocus& loc) {
stats.total_qintrons+=loc.introns.Count();
stats.total_qichains+=loc.ichains;
stats.total_qloci++;
+ stats.w_loci++; //add to the count of novel/wrong loci
if (loc.ichains>0 && loc.mrnas.Count()>1)
stats.total_qloci_alt++;
stats.w_exons+=loc.uexons.Count();
@@ -1319,7 +1336,6 @@ void collectQData(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci)
void collectQNOvl(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
for (int l=0;l<loci.Count();l++) {
if (loci[l]->cmpovl.Count()==0) {//locus with no ref loci overlaps
- stats.w_loci++; //novel/wrong loci
nloci.Add(loci[l]);
collectQLocData(stats,*loci[l]);
}
@@ -1328,7 +1344,7 @@ void collectQNOvl(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci)
void collectQU(GSuperLocus& stats, GList<GLocus>& nloci) {
for (int l=0;l<nloci.Count();l++) {
- stats.w_loci++; //novel/wrong loci
+ //stats.w_loci++; //novel/wrong loci
collectQLocData(stats, *nloci[l]);
}
}
@@ -1343,7 +1359,6 @@ void printLocus(FILE* f, GLocus& loc, const char* gseqname) {
void collectRNOvl(GSuperLocus& stats, GList<GLocus>& loci) { //, const char* gseqname) {
for (int l=0;l<loci.Count();l++) {
if (loci[l]->cmpovl.Count()==0) {
- stats.m_loci++; //missed ref loci
//if (f_mloci!=NULL)
// printLocus(f_mloci,*loci[l], gseqname);
collectRLocData(stats,*loci[l]);
@@ -1422,7 +1437,7 @@ void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
fprintf(fout, "# Reference mRNAs : %7d in %7d loci (%d multi-exon)\n",
ps->total_rmrnas, ps->total_rloci, ps->total_richains);
if (ps->baseTP+ps->baseFP==0 || ps->baseTP+ps->baseFN==0) return;
- fprintf(fout, "# Corresponding super-loci: %7d\n",ps->total_superloci);
+ fprintf(fout, "# Super-loci w/ reference transcripts: %7d\n",ps->total_superloci);
/*if (seqdata!=NULL) {
fprintf(fout, " ( %d/%d on forward/reverse strand)\n",
diff --git a/src/cuffdiff.cpp b/src/cuffdiff.cpp
index d7a65b5..40d4b46 100644
--- a/src/cuffdiff.cpp
+++ b/src/cuffdiff.cpp
@@ -39,6 +39,7 @@
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/io.hpp>
+#include <boost/algorithm/string.hpp>
#include "differential.h"
@@ -76,6 +77,8 @@ static struct option long_options[] = {
{"seed", required_argument, 0, OPT_RANDOM_SEED},
{"mask-file", required_argument, 0, 'M'},
{"contrast-file", required_argument, 0, 'C'},
+{"norm-standards-file", required_argument, 0, OPT_NORM_STANDARDS_FILE},
+{"use-sample-sheet", no_argument, 0, OPT_USE_SAMPLE_SHEET},
{"output-dir", required_argument, 0, 'o'},
{"verbose", no_argument, 0, 'v'},
{"quiet", no_argument, 0, 'q'},
@@ -138,7 +141,8 @@ void print_usage()
fprintf(stderr, " -L/--labels comma-separated list of condition labels\n");
fprintf(stderr, " --FDR False discovery rate used in testing [ default: 0.05 ]\n");
fprintf(stderr, " -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]\n");
- //fprintf(stderr, " -C/--contrast-file Perform the constrasts specified in this file [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
+ fprintf(stderr, " -C/--contrast-file Perform the constrasts specified in this file [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
+ //fprintf(stderr, " --norm-standards-file Housekeeping/spike genes to normalize libraries [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
fprintf(stderr, " -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]\n");
fprintf(stderr, " -u/--multi-read-correct use 'rescue method' for multi-reads [ default: FALSE ]\n");
#if ENABLE_THREADS
@@ -259,7 +263,17 @@ int parse_options(int argc, char** argv)
contrast_filename = optarg;
break;
}
- case 'v':
+ case OPT_NORM_STANDARDS_FILE:
+ {
+ norm_standards_filename = optarg;
+ break;
+ }
+ case OPT_USE_SAMPLE_SHEET:
+ {
+ use_sample_sheet = true;
+ break;
+ }
+ case 'v':
{
if (cuff_quiet)
{
@@ -793,27 +807,17 @@ void print_read_group_tracking(FILE* fout,
for (size_t i = 0; i < fpkms.size(); ++i)
{
- for (CountPerReplicateTable::const_iterator itr = fpkms[i].count_per_rep.begin();
- itr != fpkms[i].count_per_rep.end();
- ++itr)
+ for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
+ ++j)
{
- FPKMPerReplicateTable::const_iterator f_itr = fpkms[i].fpkm_per_rep.find(itr->first);
- StatusPerReplicateTable::const_iterator s_itr = fpkms[i].status_per_rep.find(itr->first);
+ double FPKM = fpkms[i].tracking_info_per_rep[j].fpkm;
+ double internal_count = fpkms[i].tracking_info_per_rep[j].count;
+ double external_count = internal_count / fpkms[i].tracking_info_per_rep[j].rg_props->external_scale_factor();
+ double raw_count = internal_count * fpkms[i].tracking_info_per_rep[j].rg_props->internal_scale_factor();
+ const string& condition_name = fpkms[i].tracking_info_per_rep[j].rg_props->condition_name();
+ AbundanceStatus status = fpkms[i].tracking_info_per_rep[j].status;
-
- if (f_itr == fpkms[i].fpkm_per_rep.end())
- {
- fprintf(stderr, "Error: missing per-replicate FPKM data\n");
- }
-
- double FPKM = f_itr->second;
- double internal_count = itr->second;
- double external_count = internal_count / itr->first->external_scale_factor();
- double raw_count = internal_count * itr->first->internal_scale_factor();
- const string& condition_name = itr->first->condition_name();
- AbundanceStatus status = s_itr->second;
-
- int rep_num = itr->first->replicate_num();
+ int rep_num = fpkms[i].tracking_info_per_rep[j].rg_props->replicate_num();
const char* status_str = "OK";
@@ -854,12 +858,12 @@ void print_read_group_tracking(FILE* fout,
}
void print_read_group_info(FILE* fout,
- const vector<shared_ptr<ReadGroupProperties> >& all_read_groups)
+ const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
{
fprintf(fout, "file\tcondition\treplicate_num\ttotal_mass\tnorm_mass\tinternal_scale\texternal_scale\n");
for (size_t i = 0; i < all_read_groups.size(); ++i)
{
- shared_ptr<ReadGroupProperties const> rg_props = all_read_groups[i];
+ boost::shared_ptr<ReadGroupProperties const> rg_props = all_read_groups[i];
fprintf(fout, "%s\t%s\t%d\t%Lg\t%Lg\t%lg\t%lg\n",
rg_props->file_path().c_str(),
rg_props->condition_name().c_str(),
@@ -886,37 +890,6 @@ bool p_value_lt(const SampleDifference* lhs, const SampleDifference* rhs)
return lhs->p_value < rhs->p_value;
}
-//// Benjamani-Hochberg procedure
-//int fdr_significance(double fdr,
-// vector<SampleDifference*>& tests)
-//{
-// sort(tests.begin(), tests.end(), p_value_lt);
-// vector<SampleDifference*> passing;
-//
-// for (int k = 0; k < (int)tests.size(); ++k)
-// {
-// if (tests[k]->test_status == OK)
-// {
-// passing.push_back(tests[k]);
-// }
-// else
-// {
-// tests[k]->significant = false;
-// }
-// }
-// int significant = 0;
-// for (int k = 0; k < (int)passing.size(); ++k)
-// {
-// double r = (double)passing.size() / ((double) k + 1);
-// double corrected_p = passing[k]->p_value * r;
-// passing[k]->corrected_p = corrected_p;
-// passing[k]->significant = (corrected_p <= fdr);
-// significant += passing[k]->significant;
-// }
-//
-// return passing.size();
-//}
-
// Benjamani-Hochberg procedure
int fdr_significance(double fdr,
vector<SampleDifference*>& tests)
@@ -999,27 +972,27 @@ void inspect_map_worker(ReplicatedBundleFactory& fac,
#endif
}
-void learn_bias_worker(shared_ptr<BundleFactory> fac)
+void learn_bias_worker(boost::shared_ptr<BundleFactory> fac)
{
#if ENABLE_THREADS
boost::this_thread::at_thread_exit(decr_pool_count);
#endif
- shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
BiasLearner* bl = new BiasLearner(rg_props->frag_len_dist());
learn_bias(*fac, *bl, false);
- rg_props->bias_learner(shared_ptr<BiasLearner>(bl));
+ rg_props->bias_learner(boost::shared_ptr<BiasLearner>(bl));
}
-shared_ptr<TestLauncher> test_launcher;
+boost::shared_ptr<TestLauncher> test_launcher;
bool quantitate_next_locus(const RefSequenceTable& rt,
- vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
- shared_ptr<TestLauncher> launcher)
+ vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
+ boost::shared_ptr<TestLauncher> launcher)
{
for (size_t i = 0; i < bundle_factories.size(); ++i)
{
- shared_ptr<SampleAbundances> s_ab = shared_ptr<SampleAbundances>(new SampleAbundances);
+ boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances);
#if ENABLE_THREADS
while(1)
@@ -1039,35 +1012,54 @@ bool quantitate_next_locus(const RefSequenceTable& rt,
locus_curr_threads++;
locus_thread_pool_lock.unlock();
- thread quantitate(sample_worker,
+ boost::shared_ptr<HitBundle> pBundle = boost::shared_ptr<HitBundle>(new HitBundle());
+ bool non_empty = bundle_factories[i]->next_bundle(*pBundle);
+
+ if (pBundle->compatible_mass() > 0)
+ {
+ thread quantitate(sample_worker,
+ non_empty,
+ pBundle,
+ boost::ref(rt),
+ boost::ref(*(bundle_factories[i])),
+ s_ab,
+ i,
+ launcher,
+ true);
+ }
+ else
+ {
+ sample_worker(non_empty,
+ pBundle,
boost::ref(rt),
boost::ref(*(bundle_factories[i])),
s_ab,
i,
- launcher);
+ launcher,
+ true);
+ locus_thread_pool_lock.lock();
+ locus_curr_threads--;
+ locus_thread_pool_lock.unlock();
+ }
#else
- sample_worker(boost::ref(rt),
+ HitBundle bundle;
+ bool non_empty = sample_factory.next_bundle(bundle);
+
+ sample_worker(non_emtpy,
+ pBundle,
+ boost::ref(rt),
boost::ref(*(bundle_factories[i])),
s_ab,
i,
- launcher);
+ launcher,
+ true);
#endif
}
return true;
}
-void fit_mle_error()
-{
-
-}
-
-void normalize_as_pool(vector<shared_ptr<ReadGroupProperties> >& all_read_groups)
-{
-
-}
-
void parse_contrast_file(FILE* contrast_file,
- const vector<shared_ptr<ReplicatedBundleFactory> >& factories,
+ const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories,
vector<pair<size_t, size_t > >& contrasts)
{
@@ -1095,34 +1087,41 @@ void parse_contrast_file(FILE* contrast_file,
char* nl = strchr(pBuf, '\n');
if (nl)
*nl = 0;
- non_blank_lines_read++;
- vector<string> columns;
- tokenize(pBuf, "\t", columns);
- if (non_blank_lines_read == 1)
- continue;
+ string pBufstr = pBuf;
+ string trimmed = boost::trim_copy(pBufstr);
- if (columns.size() < 2)
+ if (trimmed.length() > 0 && trimmed[0] != '#')
{
- if (columns.size() > 0)
- fprintf(stderr, "Malformed record in contrast file: \n > %s\n", pBuf);
- else
+ non_blank_lines_read++;
+ vector<string> columns;
+ tokenize(trimmed, "\t", columns);
+
+ if (non_blank_lines_read == 1)
continue;
- }
-
- string factor_1 = columns[0];
- string factor_2 = columns[1];
-
- if (columns.size() >= 3)
- {
- string contrast_name = columns[2];
- contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
- }
- else
- {
- char contrast_name[1024];
- sprintf(contrast_name, "contrast_%lu", contrast_table.size());
- contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
+
+ if (columns.size() < 2)
+ {
+ if (columns.size() > 0)
+ fprintf(stderr, "Malformed record in contrast file: \n > %s\n", pBuf);
+ else
+ continue;
+ }
+
+ string factor_1 = columns[0];
+ string factor_2 = columns[1];
+
+ if (columns.size() >= 3)
+ {
+ string contrast_name = columns[2];
+ contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
+ }
+ else
+ {
+ char contrast_name[1024];
+ sprintf(contrast_name, "contrast_%lu", contrast_table.size());
+ contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
+ }
}
}
}
@@ -1151,7 +1150,66 @@ void parse_contrast_file(FILE* contrast_file,
}
}
-void init_default_contrasts(const vector<shared_ptr<ReplicatedBundleFactory> >& factories,
+// Parses a tab-delimited sample sheet and groups alignment files by sample.
+//
+// Lines that are empty after trimming, or whose first non-blank character is
+// '#', are ignored. The first remaining line is treated as a header and
+// skipped. Every later record supplies the alignment file in its first column
+// and the sample group in its second column. Records with fewer than two
+// columns are reported on stderr and skipped.
+//
+//   sample_sheet_file       open stream to read from
+//   sample_labels           cleared, then filled with one entry per group
+//   sam_hit_filename_lists  appended to (not cleared) with one comma-separated
+//                           file list per group, in the same order as the labels
+//
+// Groups are emitted in std::map key order, not in sample-sheet order.
+void parse_sample_sheet_file(FILE* sample_sheet_file,
+                             vector<string>& sample_labels,
+                             vector<string>& sam_hit_filename_lists)
+{
+
+    char pBuf[10 * 1024];
+    size_t non_blank_lines_read = 0;
+
+    sample_labels.clear();
+
+    map<string, vector<string> > sample_groups;
+
+    while (fgets(pBuf, 10*1024, sample_sheet_file))
+    {
+        if (strlen(pBuf) == 0)
+            continue;
+
+        // Drop the line terminator so diagnostics print the bare record.
+        char* nl = strchr(pBuf, '\n');
+        if (nl)
+            *nl = 0;
+
+        string pBufstr = pBuf;
+        string trimmed = boost::trim_copy(pBufstr);
+
+        // Skip blank lines and '#' comments without counting them.
+        if (trimmed.length() == 0 || trimmed[0] == '#')
+            continue;
+
+        non_blank_lines_read++;
+        vector<string> columns;
+        tokenize(trimmed, "\t", columns);
+
+        // The first counted line is the header row.
+        if (non_blank_lines_read == 1)
+            continue;
+
+        if (columns.size() < 2)
+        {
+            if (columns.size() > 0)
+                fprintf(stderr, "Malformed record in sample sheet: \n  >  %s\n", pBuf);
+            // Without both columns the record is unusable; reading
+            // columns[1] below would be out of bounds, so skip it.
+            continue;
+        }
+
+        const string& sam_file = columns[0];
+        const string& sample_group = columns[1];
+
+        // operator[] creates the group's file list on first use.
+        sample_groups[sample_group].push_back(sam_file);
+    }
+
+    for (map<string, vector<string> >::iterator itr = sample_groups.begin();
+         itr != sample_groups.end(); ++itr)
+    {
+        sample_labels.push_back(itr->first);
+        string sam_list = boost::join(itr->second, ",");
+        sam_hit_filename_lists.push_back(sam_list);
+    }
+}
+
+
+void init_default_contrasts(const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories,
bool samples_are_time_series,
vector<pair<size_t, size_t > >& contrasts)
{
@@ -1172,7 +1230,49 @@ void init_default_contrasts(const vector<shared_ptr<ReplicatedBundleFactory> >&
}
}
-void print_variability_models(FILE* var_model_out, const vector<shared_ptr<ReplicatedBundleFactory> >& factories)
+// Loads the normalization-standards table from a tab-delimited file.
+//
+// Lines that are empty after trimming, or that begin with '#', are ignored.
+// The first remaining line is skipped as a header. For every later record,
+// the first column is inserted as a key mapped to a default LibNormStandards.
+// The finished table is published through lib_norm_standards.
+void parse_norm_standards_file(FILE* norm_standards_file)
+{
+    char line_buf[10 * 1024];
+    size_t records_seen = 0;
+
+    boost::shared_ptr<map<string, LibNormStandards> > standards(new map<string, LibNormStandards>);
+
+    while (fgets(line_buf, 10*1024, norm_standards_file))
+    {
+        if (line_buf[0] == '\0')
+            continue;
+
+        // Cut the record at its newline, if one was read.
+        char* eol = strchr(line_buf, '\n');
+        if (eol)
+            *eol = 0;
+
+        string record = boost::trim_copy(string(line_buf));
+        if (record.empty() || record[0] == '#')
+            continue;
+
+        ++records_seen;
+        vector<string> fields;
+        tokenize(record, "\t", fields);
+
+        // Header row, or nothing usable on the line.
+        if (records_seen == 1 || fields.empty())
+            continue;
+
+        standards->insert(make_pair(fields[0], LibNormStandards()));
+    }
+    lib_norm_standards = standards;
+}
+
+
+void print_variability_models(FILE* var_model_out, const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories)
{
fprintf(var_model_out, "condition\tlocus\tcompatible_count_mean\tcompatible_count_var\ttotal_count_mean\ttotal_count_var\tfitted_var\n");
@@ -1180,7 +1280,7 @@ void print_variability_models(FILE* var_model_out, const vector<shared_ptr<Repli
for (size_t i = 0; i < factories.size(); ++i)
{
string factor_name = factories[i]->condition_name();
- shared_ptr<ReadGroupProperties> rg = factories[i]->factories()[0]->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg = factories[i]->factories()[0]->read_group_properties();
boost::shared_ptr<MassDispersionModel const> model = rg->mass_dispersion_model();
// const vector<double>& means = model->scaled_compatible_mass_means();
// const vector<double>& raw_vars = model->scaled_compatible_variances();
@@ -1221,7 +1321,7 @@ struct DispModelAverageContext
double weight;
};
-void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factories)
+void fit_dispersions(vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories)
{
if (dispersion_method == PER_CONDITION)
{
@@ -1235,7 +1335,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
size_t num_samples = 0;
for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
{
- const vector<shared_ptr<BundleFactory> >& factories = bundle_factories[cond_idx]->factories();
+ const vector<boost::shared_ptr<BundleFactory> >& factories = bundle_factories[cond_idx]->factories();
for (size_t fac_idx = 0; fac_idx < factories.size(); ++fac_idx)
{
num_samples++;
@@ -1248,12 +1348,12 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
size_t curr_fac = 0;
for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
{
- vector<shared_ptr<BundleFactory> > factories = bundle_factories[cond_idx]->factories();
+ vector<boost::shared_ptr<BundleFactory> > factories = bundle_factories[cond_idx]->factories();
for (size_t fac_idx = 0; fac_idx < factories.size(); ++fac_idx)
{
- shared_ptr<BundleFactory> fac = factories[fac_idx];
+ boost::shared_ptr<BundleFactory> fac = factories[fac_idx];
- shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
const vector<LocusCount>& compatible_count_table = rg_props->common_scale_compatible_counts();
const vector<LocusCount>& total_count_table = rg_props->common_scale_total_counts();
@@ -1265,7 +1365,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
if (i >= sample_compatible_count_table.size())
{
- LocusCountList locus_count(c.locus_desc, num_samples, c.num_transcripts);
+ LocusCountList locus_count(c.locus_desc, num_samples, c.num_transcripts, c.gene_ids, c.gene_short_names);
sample_compatible_count_table.push_back(locus_count);
sample_compatible_count_table.back().counts[0] = common_scale_compatible_count;
sample_total_count_table.push_back(locus_count);
@@ -1289,7 +1389,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
curr_fac += factories.size();
}
- shared_ptr<MassDispersionModel> disperser = fit_dispersion_model("blind", scale_factors, sample_compatible_count_table);
+ boost::shared_ptr<MassDispersionModel> disperser = fit_dispersion_model("blind", scale_factors, sample_compatible_count_table);
vector<pair<double, double> > compatible_means_and_vars;
calculate_count_means_and_vars(sample_compatible_count_table,
@@ -1328,18 +1428,18 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
}
// now need to replace them with the average
- shared_ptr<MassDispersionModel> pooled_model;
+ boost::shared_ptr<MassDispersionModel> pooled_model;
// Let's compute the pooled average of the dispersion models
if (dispersion_method != BLIND)
{
- vector<shared_ptr<MassDispersionModel const> > disp_models;
+ vector<boost::shared_ptr<MassDispersionModel const> > disp_models;
double total_replicates = 0.0;
vector<double> disp_model_weight;
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
{
total_replicates += fac->num_replicates();
}
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
{
if (fac->num_replicates() > 1)
{
@@ -1350,7 +1450,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
double max_mass = 0.0;
- BOOST_FOREACH(shared_ptr<MassDispersionModel const> disp, disp_models)
+ BOOST_FOREACH(boost::shared_ptr<MassDispersionModel const> disp, disp_models)
{
if (disp->scaled_compatible_mass_means().empty() == false && max_mass < disp->scaled_compatible_mass_means().back())
{
@@ -1361,7 +1461,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
map<std::string, vector<DispModelAverageContext> > disp_info_by_locus;
for (size_t disp_idx = 0; disp_idx < disp_models.size(); ++disp_idx)
{
- shared_ptr<MassDispersionModel const> disp = disp_models[disp_idx];
+ boost::shared_ptr<MassDispersionModel const> disp = disp_models[disp_idx];
const std::map<std::string, std::pair<double, double> >& total_mv_by_locus = disp->total_mv_by_locus();
const std::map<std::string, std::pair<double, double> >& compatible_mv_by_locus = disp->compatible_mv_by_locus();
@@ -1433,7 +1533,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
double var_est = 0.0;
for(size_t i = 0; i < disp_models.size(); ++i)
{
- shared_ptr<MassDispersionModel const> disp = disp_models[i];
+ boost::shared_ptr<MassDispersionModel const> disp = disp_models[i];
double weight = disp_model_weight[i];
var_est += disp->scale_mass_variance(frag_idx) * weight;
}
@@ -1441,7 +1541,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
est_fitted_var.push_back(var_est);
}
- pooled_model = shared_ptr<MassDispersionModel>(new MassDispersionModel("pooled", compatible_mass, compatible_variances, est_fitted_var));
+ pooled_model = boost::shared_ptr<MassDispersionModel>(new MassDispersionModel("pooled", compatible_mass, compatible_variances, est_fitted_var));
for (map<std::string, DispModelAverageContext>::iterator itr = pooled_info_by_locus.begin();
itr != pooled_info_by_locus.end();
@@ -1458,7 +1558,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
if (dispersion_method == POOLED)
{
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
{
fac->mass_dispersion_model(pooled_model);
}
@@ -1468,7 +1568,7 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
}
else if (dispersion_method == POISSON)
{
- shared_ptr<MassDispersionModel> disperser = shared_ptr<MassDispersionModel>(new PoissonDispersionModel(""));
+ boost::shared_ptr<MassDispersionModel> disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(""));
for (size_t i = 0; i < bundle_factories.size(); ++i)
{
bundle_factories[i]->mass_dispersion_model(disperser);
@@ -1481,54 +1581,62 @@ void fit_dispersions(vector<shared_ptr<ReplicatedBundleFactory> >& bundle_factor
}
}
-void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
+void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
{
ReadTable it;
RefSequenceTable rt(true, false);
- vector<shared_ptr<Scaffold> > ref_mRNAs;
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
- vector<shared_ptr<ReplicatedBundleFactory> > bundle_factories;
- vector<shared_ptr<ReadGroupProperties> > all_read_groups;
- vector<shared_ptr<HitFactory> > all_hit_factories;
+ vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
+ vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
{
vector<string> sam_hit_filenames;
tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
- vector<shared_ptr<BundleFactory> > replicate_factories;
+ vector<boost::shared_ptr<BundleFactory> > replicate_factories;
string condition_name = sample_labels[i];
for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
{
- shared_ptr<HitFactory> hs;
+ boost::shared_ptr<HitFactory> hs;
+ boost::shared_ptr<BundleFactory> hf;
try
{
- hs = shared_ptr<HitFactory>(new BAMHitFactory(sam_hit_filenames[j], it, rt));
+ hs = boost::shared_ptr<HitFactory>(new PrecomputedExpressionHitFactory(sam_hit_filenames[j], it, rt));
+ hf = boost::shared_ptr<BundleFactory>(new PrecomputedExpressionBundleFactory(static_pointer_cast<PrecomputedExpressionHitFactory>(hs)));
}
- catch (std::runtime_error& e)
+
+ catch(boost::archive::archive_exception & e)
{
try
{
- fprintf(stderr, "File %s doesn't appear to be a valid BAM file, trying SAM...\n",
- sam_hit_filenames[j].c_str());
- hs = shared_ptr<HitFactory>(new SAMHitFactory(sam_hit_filenames[j], it, rt));
+ hs = boost::shared_ptr<HitFactory>(new BAMHitFactory(sam_hit_filenames[j], it, rt));
}
- catch (std::runtime_error& e)
+ catch (std::runtime_error& e)
{
- fprintf(stderr, "Error: cannot open alignment file %s for reading\n",
- sam_hit_filenames[j].c_str());
- exit(1);
+ try
+ {
+// fprintf(stderr, "File %s doesn't appear to be a valid BAM file, trying SAM...\n",
+// sam_hit_filenames[j].c_str());
+ hs = boost::shared_ptr<HitFactory>(new SAMHitFactory(sam_hit_filenames[j], it, rt));
+ }
+ catch (std::runtime_error& e)
+ {
+ fprintf(stderr, "Error: cannot open file %s for reading. Unrecognized file type\n",
+ sam_hit_filenames[j].c_str());
+ exit(1);
+ }
}
+ hf = boost::shared_ptr<BundleFactory>(new BundleFactory(hs, REF_DRIVEN));
}
- all_hit_factories.push_back(hs);
- shared_ptr<BundleFactory> hf(new BundleFactory(hs, REF_DRIVEN));
- shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
+ boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
if (global_read_properties)
{
@@ -1539,6 +1647,7 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
*rg_props = hs->read_group_properties();
}
+ rg_props->checked_parameters(hs->read_group_properties().checked_parameters());
rg_props->condition_name(condition_name);
rg_props->replicate_num(j);
rg_props->file_path(sam_hit_filenames[j]);
@@ -1551,20 +1660,22 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
//replicate_factories.back()->set_ref_rnas(ref_mRNAs);
}
- bundle_factories.push_back(shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
+ bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
}
- ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, corr_bias, false);
+ boost::crc_32_type ref_gtf_crc_result;
+ ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
if (ref_mRNAs.empty())
return;
- vector<shared_ptr<Scaffold> > mask_rnas;
+ vector<boost::shared_ptr<Scaffold> > mask_rnas;
if (mask_gtf)
{
- ::load_ref_rnas(mask_gtf, rt, mask_rnas, false, false);
+ boost::crc_32_type mask_gtf_crc_result;
+ ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
}
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
{
fac->set_ref_rnas(ref_mRNAs);
if (mask_gtf)
@@ -1581,6 +1692,12 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
init_default_contrasts(bundle_factories, samples_are_time_series, contrasts);
}
+ if (norm_standards_file != NULL)
+ {
+ parse_norm_standards_file(norm_standards_file);
+ }
+
+ validate_cross_sample_parameters(all_read_groups);
#if ENABLE_THREADS
locus_num_threads = num_threads;
@@ -1625,7 +1742,7 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
int tmp_max_frag_len = 0;
ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
{
#if ENABLE_THREADS
while(1)
@@ -1673,207 +1790,12 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
normalize_counts(all_read_groups);
fit_dispersions(bundle_factories);
-
- /*
- if (pool_all_samples)
- {
- ProgressBar("Modeling dispersion on all samples as a pool", 0);
- normalize_as_pool(all_read_groups);
- }
- else if ((use_quartile_norm || use_geometric_norm))
- {
- ProgressBar("Modeling dispersion by condition", 0);
- vector<LocusCountList> sample_count_table;
-
- //vector<shared_ptr<ReplicatedBundleFactory> > bundle_factories;
-
- for (size_t fac_idx = 0; fac_idx < bundle_factories.size(); ++fac_idx)
- {
- shared_ptr<ReplicatedBundleFactory> rep_fac = bundle_factories[fac_idx];
- vector<shared_ptr<BundleFactory> > replicates = rep_fac->factories();
- vector<double> count_table;
- for (size_t j = 0; j < replicates.size(); ++j)
- {
- shared_ptr<ReadGroupProperties> rg = replicates[j]->read_group_properties();
- const vector<LocusCount>& rep_count_table = rg->common_scale_compatible_counts();
- if (count_table.empty())
- count_table = vector<double>(rep_count_table.size(), 0);
-
- for (size_t i = 0; i < rep_count_table.size(); ++i)
- {
- const LocusCount& c = rep_count_table[i];
- double count = c.count;
- count_table[i] += (count / replicates.size());;
- }
-
- }
-
- for (size_t i = 0; i < count_table.size(); ++i)
- {
-
- const LocusCount& c = replicates.front()->read_group_properties()->common_scale_compatible_counts()[i];
- double count = count_table[i];
-
- if (i >= sample_count_table.size())
- {
- LocusCountList locus_count(c.locus_desc, bundle_factories.size(), c.num_transcripts);
- sample_count_table.push_back(locus_count);
- sample_count_table.back().counts[0] = count;
- }
- else
- {
- if (sample_count_table[i].locus_desc != c.locus_desc)
- {
- fprintf (stderr, "Error: bundle boundaries don't match across replicates!\n");
- exit(1);
- }
- sample_count_table[i].counts[fac_idx] = count;
- }
-
- }
- }
-
- vector<double> scale_factors(bundle_factories.size(), 0.0);
-
- calc_scaling_factors(sample_count_table, scale_factors);
-
- for (size_t j = 0; j < scale_factors.size(); ++j)
- {
- shared_ptr<ReplicatedBundleFactory> rep_fac = bundle_factories[j];
- vector<shared_ptr<BundleFactory> > replicates = rep_fac->factories();
-
- for (size_t i = 0; i < replicates.size(); ++i)
- {
- shared_ptr<ReadGroupProperties> rg = replicates[i]->read_group_properties();
- rg->external_scale_factor(scale_factors[j]);
- }
-
- double total = 0.0;
-
- for (size_t i = 0; i < sample_count_table.size(); ++i)
- {
- total += sample_count_table[i].counts[j];
- }
- //double sf = scale_factors[j];
- //fprintf(stderr, "SF: %lg, Total: %lg\n", sf, total);
- }
-
- if (use_quartile_norm)
- {
- vector<double> upper_quartiles(bundle_factories.size(), 0);
- vector<double> total_common_masses(bundle_factories.size(), 0);
-
- for (size_t fac_idx = 0; fac_idx < bundle_factories.size(); ++fac_idx)
- {
- //shared_ptr<ReadGroupProperties> rg = bundle_factories[fac_idx];
- //double scaled_mass = scale_factors[fac_idx] * rg->total_map_mass();
- vector<double> common_scaled_counts;
- double total_common = 0.0;
-
- for (size_t j = 0; j < sample_count_table.size(); ++j)
- {
- total_common += sample_count_table[j].counts[fac_idx];
- common_scaled_counts.push_back(sample_count_table[j].counts[fac_idx]);
- }
-
- sort(common_scaled_counts.begin(), common_scaled_counts.end());
- if (common_scaled_counts.empty())
- continue;
-
- int upper_quart_index = common_scaled_counts.size() * 0.75;
- double upper_quart_count = common_scaled_counts[upper_quart_index];
- upper_quartiles[fac_idx] = upper_quart_count;
- total_common_masses[fac_idx] = total_common;
- }
-
- long double total_mass = accumulate(total_common_masses.begin(), total_common_masses.end(), 0.0);
- long double total_norm_mass = accumulate(upper_quartiles.begin(), upper_quartiles.end(), 0.0);
-
- for (size_t fac_idx = 0; fac_idx < bundle_factories.size(); ++fac_idx)
- {
- if (total_mass > 0)
- {
- //double scaling_factor = total_mass / total_norm_mass;
- double external_scaling_factor = upper_quartiles[fac_idx] / (total_norm_mass / upper_quartiles.size());
- BOOST_FOREACH(shared_ptr<BundleFactory> bf, bundle_factories[fac_idx]->factories())
- {
- //double scaled_mass = scaling_factor * upper_quartiles[fac_idx];
- bf->read_group_properties()->normalized_map_mass(total_mass / total_common_masses.size());
- //bf->read_group_properties()->external_scale_factor(1.0);
- bf->read_group_properties()->external_scale_factor(external_scaling_factor);
- }
- }
- }
- }
- else
- {
- transform_counts_to_common_scale(scale_factors, sample_count_table);
-
- double avg_total_common_scaled_count = 0.0;
-
- for (size_t fac_idx = 0; fac_idx < bundle_factories.size(); ++fac_idx)
- {
- //shared_ptr<ReadGroupProperties> rg = bundle_factories[fac_idx];
- //double scaled_mass = scale_factors[fac_idx] * rg->total_map_mass();
- double total_common = 0.0;
- for (size_t j = 0; j < sample_count_table.size(); ++j)
- {
- total_common += sample_count_table[j].counts[fac_idx];
- }
-
- avg_total_common_scaled_count += (1.0/bundle_factories.size()) * total_common;
-
- //rg->normalized_map_mass(scale_factors[fac_idx])
- }
-
- for (size_t fac_idx = 0; fac_idx < bundle_factories.size(); ++fac_idx)
- {
- BOOST_FOREACH(shared_ptr<BundleFactory> bf, bundle_factories[fac_idx]->factories())
- {
- bf->read_group_properties()->normalized_map_mass(avg_total_common_scaled_count);
- }
- }
- }
-
-
-
-// BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
-// {
-// // for now, "borrow" the dispersion model for the condition with the most replicates
-// size_t borrowed_disp_model_idx = most_reps_idx;
-// if (fac->num_replicates() == 1)
-// {
-// fac->mass_dispersion_model(bundle_factories[borrowed_disp_model_idx]->mass_dispersion_model());
-// double borrowed_internal_size_factor = scale_factors[borrowed_disp_model_idx];
-// double borrowed_external_size_factor = scale_factors[borrowed_disp_model_idx];
-// double borrowed_norm_map_mass = bundle_factories[borrowed_disp_model_idx]->factories().front()->read_group_properties()->normalized_map_mass();
-// BOOST_FOREACH(shared_ptr<BundleFactory> bf, fac->factories())
-// {
-// // we need to adjust the scaling factors so that the FPKMs aren't skewed
-// // and the variance function from the dispersion model is correct.
-// //bf->read_group_properties()->normalized_map_mass(avg_total_common_scaled_count);
-// bf->read_group_properties()->internal_scale_factor(bf->read_group_properties()->external_scale_factor()/borrowed_internal_size_factor);
-// bf->read_group_properties()->normalized_map_mass(borrowed_norm_map_mass);
-// bf->read_group_properties()->external_scale_factor(borrowed_external_size_factor);
-// }
-// }
-// }
- }
- else if (use_raw_mapped_norm)
- {
- // no need to do anything beyond what's already being done during
- // per-condition map inspection. Counts are common-scale-transformed
- // on a per condition basis. External scale factors are set to 1.0
- // by default
- }
-
-*/
print_variability_models(outfiles.var_model_out, bundle_factories);
for (size_t i = 0; i < all_read_groups.size(); ++i)
{
- shared_ptr<ReadGroupProperties> rg = all_read_groups[i];
+ boost::shared_ptr<ReadGroupProperties> rg = all_read_groups[i];
fprintf(stderr, "> Map Properties:\n");
fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", rg->normalized_map_mass());
@@ -1900,7 +1822,7 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
long double total_norm_mass = 0.0;
long double total_mass = 0.0;
- BOOST_FOREACH (shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
+ BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
{
total_norm_mass += rg_props->normalized_map_mass();
total_mass += rg_props->total_map_mass();
@@ -1925,16 +1847,16 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
Tracking tracking;
- test_launcher = shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, NULL, &tracking, &p_bar));
+ test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, NULL, &tracking, &p_bar));
if (model_mle_error || corr_bias || corr_multi) // Only run initial estimation if correcting bias or multi-reads
{
while (1)
{
- shared_ptr<vector<shared_ptr<SampleAbundances> > > abundances(new vector<shared_ptr<SampleAbundances> >());
+ boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
quantitate_next_locus(rt, bundle_factories, test_launcher);
bool more_loci_remain = false;
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
{
if (rep_fac->bundles_remain())
{
@@ -1966,7 +1888,7 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
}
}
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
{
rep_fac->reset();
}
@@ -1977,9 +1899,9 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
{
bias_run = true;
p_bar = ProgressBar("Learning bias parameters.", 0);
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
{
- BOOST_FOREACH (shared_ptr<BundleFactory> fac, rep_fac->factories())
+ BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, rep_fac->factories())
{
#if ENABLE_THREADS
while(1)
@@ -2020,7 +1942,7 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
boost::this_thread::sleep(boost::posix_time::milliseconds(5));
}
#endif
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
{
rep_fac->reset();
}
@@ -2028,7 +1950,7 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
}
fprintf(outfiles.bias_out, "condition_name\treplicate_num\tparam\tpos_i\tpos_j\tvalue\n");
- BOOST_FOREACH (shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
+ BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
{
if (rg_props->bias_learner())
rg_props->bias_learner()->output(outfiles.bias_out, rg_props->condition_name(), rg_props->replicate_num());
@@ -2036,7 +1958,7 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
// Allow the multiread tables to do their thing...
- BOOST_FOREACH (shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
+ BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
{
rg_props->multi_read_table()->valid_mass(true);
}
@@ -2069,14 +1991,14 @@ void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, vector<string>&
final_est_run = true;
p_bar = ProgressBar("Testing for differential expression and regulation in locus.", num_bundles);
- test_launcher = shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, &tests, &tracking, &p_bar));
+ test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, &tests, &tracking, &p_bar));
while (true)
{
- //shared_ptr<vector<shared_ptr<SampleAbundances> > > abundances(new vector<shared_ptr<SampleAbundances> >());
+ //boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
quantitate_next_locus(rt, bundle_factories, test_launcher);
bool more_loci_remain = false;
- BOOST_FOREACH (shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
{
if (rep_fac->bundles_remain())
{
@@ -2386,15 +2308,51 @@ int main(int argc, char** argv)
if (!no_update_check)
check_version(PACKAGE_VERSION);
-
string ref_gtf_filename = argv[optind++];
-
- vector<string> sam_hit_filenames;
- while(optind < argc)
+ vector<string> sam_hit_filenames;
+
+ if (use_sample_sheet)
+ {
+ if (optind < argc)
+ {
+
+ string sample_sheet_filename = argv[optind++];
+ FILE* sample_sheet_file = NULL;
+ if (sample_sheet_filename != "")
+ {
+ sample_sheet_file = fopen(sample_sheet_filename.c_str(), "r");
+ if (!sample_sheet_file)
+ {
+ fprintf(stderr, "Error: cannot open sample sheet file %s for reading\n",
+ sample_sheet_filename.c_str());
+ exit(1);
+ }
+ }
+ parse_sample_sheet_file(sample_sheet_file, sample_labels, sam_hit_filenames);
+ }
+ else
+ {
+ fprintf(stderr, "Error: option --use-sample-sheet requires a single sample sheet filename instead of a list of SAM/BAM files\n");
+ }
+ }
+ else
{
- string sam_hits_file_name = argv[optind++];
- sam_hit_filenames.push_back(sam_hits_file_name);
+ while(optind < argc)
+ {
+ string sam_hits_file_name = argv[optind++];
+ sam_hit_filenames.push_back(sam_hits_file_name);
+ }
+
+ if (sample_labels.size() == 0)
+ {
+ for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
+ {
+ char buf[256];
+ sprintf(buf, "q%lu", i);
+ sample_labels.push_back(buf);
+ }
+ }
}
while (sam_hit_filenames.size() < 2)
@@ -2403,15 +2361,6 @@ int main(int argc, char** argv)
exit(1);
}
- if (sample_labels.size() == 0)
- {
- for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
- {
- char buf[256];
- sprintf(buf, "q%lu", i);
- sample_labels.push_back(buf);
- }
- }
if (sam_hit_filenames.size() != sample_labels.size())
{
@@ -2462,8 +2411,20 @@ int main(int argc, char** argv)
exit(1);
}
}
+
+ FILE* norm_standards_file = NULL;
+ if (norm_standards_filename != "")
+ {
+ norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
+ if (!norm_standards_file)
+ {
+ fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
+ norm_standards_filename.c_str());
+ exit(1);
+ }
+ }
-
+
// Note: we don't want the assembly filters interfering with calculations
// here
@@ -2746,7 +2707,7 @@ int main(int argc, char** argv)
outfiles.var_model_out = var_model_out;
- driver(ref_gtf, mask_gtf, contrast_file, sam_hit_filenames, outfiles);
+ driver(ref_gtf, mask_gtf, contrast_file, norm_standards_file, sam_hit_filenames, outfiles);
#if 0
if (emit_count_tables)
diff --git a/src/cufflinks.cpp b/src/cufflinks.cpp
index 6793e4e..e8df03a 100644
--- a/src/cufflinks.cpp
+++ b/src/cufflinks.cpp
@@ -515,7 +515,7 @@ int parse_options(int argc, char** argv)
void combine_strand_assemblies(vector<Scaffold>& lhs,
vector<Scaffold>& rhs,
vector<Scaffold>& scaffolds,
- vector<shared_ptr<Scaffold> >* ref_scaffs)
+ vector<boost::shared_ptr<Scaffold> >* ref_scaffs)
{
// first check for strand support
for (size_t l = 0; l < lhs.size(); ++l)
@@ -537,7 +537,7 @@ void combine_strand_assemblies(vector<Scaffold>& lhs,
{
for(size_t l = 0; l < lhs.size(); ++l)
{
- BOOST_FOREACH(shared_ptr<Scaffold> ref_scaff, *ref_scaffs)
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, *ref_scaffs)
{
// if we're past all the overlaps, just stop
if (ref_scaff->left() >= lhs[l].right() + overhang_3)
@@ -570,7 +570,7 @@ void combine_strand_assemblies(vector<Scaffold>& lhs,
}
for(size_t r = 0; r < rhs.size(); ++r)
{
- BOOST_FOREACH(shared_ptr<Scaffold> ref_scaff, *ref_scaffs)
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, *ref_scaffs)
{
if (ref_scaff->left() >= rhs[r].right() + overhang_3)
{
@@ -689,8 +689,8 @@ CuffStrand guess_strand_for_interval(const vector<uint8_t>& strand_guess,
bool scaffolds_for_bundle(const HitBundle& bundle,
- vector<shared_ptr<Scaffold> >& scaffolds,
- vector<shared_ptr<Scaffold> >* ref_scaffs = NULL,
+ vector<boost::shared_ptr<Scaffold> >& scaffolds,
+ vector<boost::shared_ptr<Scaffold> >* ref_scaffs = NULL,
BundleStats* stats = NULL)
{
if (bundle.hits().size() >= max_frags_per_bundle)
@@ -731,7 +731,7 @@ bool scaffolds_for_bundle(const HitBundle& bundle,
if (ref_guided && enable_faux_reads && !hits.empty())
{
vector<Scaffold> pseudohits;
- BOOST_FOREACH(shared_ptr<Scaffold const> ref_scaff, *ref_scaffs)
+ BOOST_FOREACH(boost::shared_ptr<Scaffold const> ref_scaff, *ref_scaffs)
{
ref_scaff->tile_with_scaffs(pseudohits, tile_len, tile_off);
}
@@ -910,7 +910,7 @@ bool scaffolds_for_bundle(const HitBundle& bundle,
{
BOOST_FOREACH(Scaffold& scaff, tmp_scaffs)
{
- scaffolds.push_back(shared_ptr<Scaffold>(new Scaffold(scaff)));
+ scaffolds.push_back(boost::shared_ptr<Scaffold>(new Scaffold(scaff)));
}
}
sort(scaffolds.begin(), scaffolds.end(), scaff_lt_sp);
@@ -970,7 +970,7 @@ void quantitate_transcript_cluster(AbundanceGroup& transfrag_cluster,
}
else
{
- BOOST_FOREACH(shared_ptr<Abundance> ab, transfrag_cluster.abundances())
+ BOOST_FOREACH(boost::shared_ptr<Abundance> ab, transfrag_cluster.abundances())
{
ab->status(NUMERIC_HI_DATA);
}
@@ -978,7 +978,7 @@ void quantitate_transcript_cluster(AbundanceGroup& transfrag_cluster,
}
else
{
- vector<shared_ptr<Abundance> >& abundances = transfrag_cluster.abundances();
+ vector<boost::shared_ptr<Abundance> >& abundances = transfrag_cluster.abundances();
int N = abundances.size();
double total_fpkm = 0.0;
@@ -997,14 +997,14 @@ void quantitate_transcript_cluster(AbundanceGroup& transfrag_cluster,
gammas[j] /= total_fpkm;
}
- vector<shared_ptr<Abundance> > filtered_transcripts = abundances;
+ vector<boost::shared_ptr<Abundance> > filtered_transcripts = abundances;
filter_junk_isoforms(filtered_transcripts, gammas, abundances, 0);
vector<bool> to_keep (abundances.size(), false);
for(size_t i = 0; i < abundances.size(); ++i)
{
- shared_ptr<Abundance> ab_i = abundances[i];
+ boost::shared_ptr<Abundance> ab_i = abundances[i];
bool found = false;
- BOOST_FOREACH (shared_ptr<Abundance> ab_j, filtered_transcripts)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> ab_j, filtered_transcripts)
{
if (ab_i == ab_j)
{
@@ -1041,7 +1041,7 @@ void quantitate_transcript_cluster(AbundanceGroup& transfrag_cluster,
BOOST_FOREACH(const AbundanceGroup& gene, transfrags_by_gene)
{
- const vector<shared_ptr<Abundance> >& iso_abundances = gene.abundances();
+ const vector<boost::shared_ptr<Abundance> >& iso_abundances = gene.abundances();
vector<Isoform> isoforms;
int gene_id = -1;
@@ -1050,7 +1050,7 @@ void quantitate_transcript_cluster(AbundanceGroup& transfrag_cluster,
string ref_gene_id = "";
double major_isoform_FPKM = 0;
- BOOST_FOREACH (shared_ptr<Abundance> iso_ab, iso_abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> iso_ab, iso_abundances)
{
if (iso_ab->transfrag()->is_ref())
{
@@ -1067,14 +1067,14 @@ void quantitate_transcript_cluster(AbundanceGroup& transfrag_cluster,
major_isoform_FPKM = max(iso_ab->FPKM(), major_isoform_FPKM);
}
- BOOST_FOREACH (shared_ptr<Abundance> iso_ab, iso_abundances)
+ BOOST_FOREACH (boost::shared_ptr<Abundance> iso_ab, iso_abundances)
{
// Calculate transcript depth of coverage and FMI from FPKM
double FPKM = iso_ab->FPKM();
double density_score = major_isoform_FPKM ? (FPKM / major_isoform_FPKM) : 0;
double density_per_bp = FPKM;
- shared_ptr<Scaffold> transfrag = iso_ab->transfrag();
+ boost::shared_ptr<Scaffold> transfrag = iso_ab->transfrag();
assert(transfrag);
double s_len = transfrag->length();
@@ -1117,17 +1117,17 @@ void quantitate_transcript_cluster(AbundanceGroup& transfrag_cluster,
}
-void quantitate_transcript_clusters(vector<shared_ptr<Scaffold> >& scaffolds,
- shared_ptr<ReadGroupProperties> rg_props,
+void quantitate_transcript_clusters(vector<boost::shared_ptr<Scaffold> >& scaffolds,
+ boost::shared_ptr<ReadGroupProperties> rg_props,
vector<Gene>& genes,
bool bundle_too_large)
{
- //vector<shared_ptr<Scaffold> > partials;
- //vector<shared_ptr<Scaffold> > completes;
+ //vector<boost::shared_ptr<Scaffold> > partials;
+ //vector<boost::shared_ptr<Scaffold> > completes;
long double total_map_mass = rg_props->normalized_map_mass();
- vector<shared_ptr<Scaffold> > split_partials;
+ vector<boost::shared_ptr<Scaffold> > split_partials;
// Cleave the partials at their unknowns to minimize FPKM dilation on
// the low end of the expression profile.
for (size_t i = 0; i < scaffolds.size(); ++i)
@@ -1136,24 +1136,24 @@ void quantitate_transcript_clusters(vector<shared_ptr<Scaffold> >& scaffolds,
scaffolds[i]->get_complete_subscaffolds(c);
BOOST_FOREACH (Scaffold& s, c)
{
- split_partials.push_back(shared_ptr<Scaffold>(new Scaffold(s)));
+ split_partials.push_back(boost::shared_ptr<Scaffold>(new Scaffold(s)));
}
}
scaffolds = split_partials;
- vector<shared_ptr<Abundance> > abundances;
- BOOST_FOREACH(shared_ptr<Scaffold> s, scaffolds)
+ vector<boost::shared_ptr<Abundance> > abundances;
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> s, scaffolds)
{
TranscriptAbundance* pT = new TranscriptAbundance;
pT->transfrag(s);
- shared_ptr<Abundance> ab(pT);
+ boost::shared_ptr<Abundance> ab(pT);
abundances.push_back(ab);
}
AbundanceGroup transfrags = AbundanceGroup(abundances);
- set<shared_ptr<ReadGroupProperties const> > read_groups;
+ set<boost::shared_ptr<ReadGroupProperties const> > read_groups;
read_groups.insert(rg_props);
transfrags.init_rg_props(read_groups);
@@ -1172,8 +1172,8 @@ void quantitate_transcript_clusters(vector<shared_ptr<Scaffold> >& scaffolds,
void assemble_bundle(const RefSequenceTable& rt,
HitBundle* bundle_ptr,
- shared_ptr<ReadGroupProperties> rg_props,
- shared_ptr<BiasLearner> bl_ptr,
+ boost::shared_ptr<ReadGroupProperties> rg_props,
+ boost::shared_ptr<BiasLearner> bl_ptr,
FILE* ftranscripts,
FILE* fgene_abundances,
FILE* ftrans_abundances,
@@ -1193,7 +1193,7 @@ void assemble_bundle(const RefSequenceTable& rt,
#if ENABLE_THREADS
bundle_label.reset(new string(bundle_label_buf));
#else
- bundle_label = shared_ptr<string>(new string(bundle_label_buf));
+ bundle_label = boost::shared_ptr<string>(new string(bundle_label_buf));
#endif
verbose_msg( "%s\tProcessing new bundle with %d alignments\n",
@@ -1204,7 +1204,7 @@ void assemble_bundle(const RefSequenceTable& rt,
boost::this_thread::at_thread_exit(decr_pool_count);
#endif
- vector<shared_ptr<Scaffold> > scaffolds;
+ vector<boost::shared_ptr<Scaffold> > scaffolds;
bool successfully_assembled = true;
@@ -1434,7 +1434,7 @@ void assemble_bundle(const RefSequenceTable& rt,
delete bundle_ptr;
}
-bool assemble_hits(BundleFactory& bundle_factory, shared_ptr<BiasLearner> bl_ptr)
+bool assemble_hits(BundleFactory& bundle_factory, boost::shared_ptr<BiasLearner> bl_ptr)
{
//srand(time(0));
@@ -1587,11 +1587,11 @@ void driver(const string& hit_file_name, FILE* ref_gtf, FILE* mask_gtf)
ReadTable it;
RefSequenceTable rt(true, false);
- shared_ptr<HitFactory> hit_factory;
+ boost::shared_ptr<HitFactory> hit_factory;
try
{
- hit_factory = shared_ptr<BAMHitFactory>(new BAMHitFactory(hit_file_name, it, rt));
+ hit_factory = boost::shared_ptr<BAMHitFactory>(new BAMHitFactory(hit_file_name, it, rt));
}
catch (std::runtime_error& e)
{
@@ -1600,7 +1600,7 @@ void driver(const string& hit_file_name, FILE* ref_gtf, FILE* mask_gtf)
try
{
- hit_factory = shared_ptr<SAMHitFactory>(new SAMHitFactory(hit_file_name, it, rt));
+ hit_factory = boost::shared_ptr<SAMHitFactory>(new SAMHitFactory(hit_file_name, it, rt));
}
catch (std::runtime_error& e)
{
@@ -1610,24 +1610,26 @@ void driver(const string& hit_file_name, FILE* ref_gtf, FILE* mask_gtf)
}
}
- BundleFactory& bundle_factory = *(new BundleFactory(hit_factory, bundle_mode));
- shared_ptr<ReadGroupProperties> rg_props =bundle_factory.read_group_properties();
+ boost::shared_ptr<BundleFactory> bundle_factory = boost::shared_ptr<BundleFactory>(new BundleFactory(hit_factory, bundle_mode));
+ boost::shared_ptr<ReadGroupProperties> rg_props = bundle_factory->read_group_properties();
BadIntronTable bad_introns;
rt.print_rec_ordering();
- vector<shared_ptr<Scaffold> > ref_mRNAs;
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
if (ref_gtf)
{
- ::load_ref_rnas(ref_gtf, bundle_factory.ref_table(), ref_mRNAs, corr_bias && bundle_mode == REF_DRIVEN, false);
- bundle_factory.set_ref_rnas(ref_mRNAs);
+ boost::crc_32_type ref_gtf_crc_result;
+ ::load_ref_rnas(ref_gtf, bundle_factory->ref_table(), ref_mRNAs, ref_gtf_crc_result, corr_bias && bundle_mode == REF_DRIVEN, false);
+ bundle_factory->set_ref_rnas(ref_mRNAs);
}
rt.print_rec_ordering();
- vector<shared_ptr<Scaffold> > mask_rnas;
+ vector<boost::shared_ptr<Scaffold> > mask_rnas;
if (mask_gtf)
{
- ::load_ref_rnas(mask_gtf, bundle_factory.ref_table(), mask_rnas, false, false);
- bundle_factory.set_mask_rnas(mask_rnas);
+ boost::crc_32_type mask_gtf_crc_result;
+ ::load_ref_rnas(mask_gtf, bundle_factory->ref_table(), mask_rnas, mask_gtf_crc_result, false, false);
+ bundle_factory->set_mask_rnas(mask_rnas);
}
vector<LocusCount> compatible_count_table;
@@ -1641,7 +1643,7 @@ void driver(const string& hit_file_name, FILE* ref_gtf, FILE* mask_gtf)
rg_props->raw_compatible_counts(compatible_count_table);
rg_props->raw_total_counts(total_count_table);
- vector<shared_ptr<ReadGroupProperties> > read_groups;
+ vector<boost::shared_ptr<ReadGroupProperties> > read_groups;
read_groups.push_back(rg_props);
normalize_counts(read_groups);
@@ -1650,18 +1652,18 @@ void driver(const string& hit_file_name, FILE* ref_gtf, FILE* mask_gtf)
verbose_msg("%d ReadHits still live\n", num_deleted);
verbose_msg("Found %lu reference contigs\n", rt.size());
- BOOST_FOREACH(shared_ptr<Scaffold> ref_scaff, ref_mRNAs)
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, ref_mRNAs)
{
ref_scaff->clear_hits();
}
//fprintf(stderr, "ReadHit delete count is %d\n", num_deleted);
- shared_ptr<BiasLearner> bl_ptr(new BiasLearner(rg_props->frag_len_dist()));
- bundle_factory.read_group_properties(rg_props);
+ boost::shared_ptr<BiasLearner> bl_ptr(new BiasLearner(rg_props->frag_len_dist()));
+ bundle_factory->read_group_properties(rg_props);
//if (ref_gtf) -- why? bad introns are bad
- bundle_factory.bad_intron_table(bad_introns);
+ bundle_factory->bad_intron_table(bad_introns);
max_frag_len = rg_props->frag_len_dist()->max();
min_frag_len = rg_props->frag_len_dist()->min();
@@ -1669,20 +1671,19 @@ void driver(const string& hit_file_name, FILE* ref_gtf, FILE* mask_gtf)
if (corr_bias || corr_multi) final_est_run = false;
- assemble_hits(bundle_factory, bl_ptr);
+ assemble_hits(*bundle_factory, bl_ptr);
if (final_est_run)
{
- delete &bundle_factory;
//delete bl_ptr;
ref_mRNAs.clear();
return;
}
hit_factory->reset();
- delete &bundle_factory;
+
BundleFactory bundle_factory2(hit_factory, REF_DRIVEN);
- rg_props->bias_learner(shared_ptr<BiasLearner const>(bl_ptr));
+ rg_props->bias_learner(boost::shared_ptr<BiasLearner const>(bl_ptr));
rg_props->multi_read_table()->valid_mass(true);
bundle_factory2.read_group_properties(rg_props);
@@ -1690,13 +1691,15 @@ void driver(const string& hit_file_name, FILE* ref_gtf, FILE* mask_gtf)
{
ref_gtf = fopen(string(output_dir + "/transcripts.gtf").c_str(), "r");
ref_mRNAs.clear();
- ::load_ref_rnas(ref_gtf, bundle_factory2.ref_table(), ref_mRNAs, corr_bias, true);
+ boost::crc_32_type ref_gtf_crc_result;
+ ::load_ref_rnas(ref_gtf, bundle_factory2.ref_table(), ref_mRNAs, ref_gtf_crc_result, corr_bias, true);
}
bundle_factory2.set_ref_rnas(ref_mRNAs);
if (mask_gtf)
{
mask_rnas.clear();
- ::load_ref_rnas(mask_gtf, bundle_factory2.ref_table(), mask_rnas, false, false);
+ boost::crc_32_type mask_gtf_crc_result;
+ ::load_ref_rnas(mask_gtf, bundle_factory2.ref_table(), mask_rnas, mask_gtf_crc_result, false, false);
bundle_factory2.set_mask_rnas(mask_rnas);
}
bundle_factory2.reset();
diff --git a/src/cuffnorm.cpp b/src/cuffnorm.cpp
new file mode 100644
index 0000000..73f7e75
--- /dev/null
+++ b/src/cuffnorm.cpp
@@ -0,0 +1,1892 @@
+/*
+ * cuffnorm.cpp
+ * cufflinks
+ *
+ * Created by Cole Trapnell on 10/21/09.
+ * Copyright 2009 Cole Trapnell. All rights reserved.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#else
+#define PACKAGE_VERSION "INTERNAL"
+#define SVN_REVISION "XXX"
+#endif
+
+
+#include <stdlib.h>
+#include <getopt.h>
+#include <string>
+#include <numeric>
+#include <cfloat>
+#include <iostream>
+
+#include "common.h"
+#include "hits.h"
+#include "bundles.h"
+#include "abundances.h"
+#include "tokenize.h"
+#include "biascorrection.h"
+#include "update_check.h"
+
+#include <boost/thread.hpp>
+#include <boost/version.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/graph_traits.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "differential.h"
+
+extern "C" {
+#include "locfit/local.h"
+}
+
+// Need at least this many reads in a locus to do any testing on it
+
+vector<string> sample_labels;
+
+double FDR = 0.05;
+bool samples_are_time_series = false;
+using namespace std;
+using namespace boost;
+
+// We leave out the short codes for options that don't take an argument
+#if ENABLE_THREADS
+const char *short_options = "m:p:s:c:I:j:L:M:o:b:TNqvuF:C:";
+#else
+const char *short_options = "m:s:c:I:j:L:M:o:b:TNqvuF:C:";
+#endif
+
+static struct option long_options[] = {
+{"labels", required_argument, 0, 'L'},
+{"seed", required_argument, 0, OPT_RANDOM_SEED},
+{"norm-standards-file", required_argument, 0, OPT_NORM_STANDARDS_FILE},
+{"use-sample-sheet", no_argument, 0, OPT_USE_SAMPLE_SHEET},
+{"output-dir", required_argument, 0, 'o'},
+{"verbose", no_argument, 0, 'v'},
+{"quiet", no_argument, 0, 'q'},
+#if ENABLE_THREADS
+{"num-threads", required_argument, 0, 'p'},
+#endif
+{"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
+{"no-update-check", no_argument, 0, OPT_NO_UPDATE_CHECK},
+{"compatible-hits-norm", no_argument, 0, OPT_USE_COMPAT_MASS},
+{"total-hits-norm", no_argument, 0, OPT_USE_TOTAL_MASS},
+
+// Some options for testing different stats policies
+{"max-bundle-frags", required_argument, 0, OPT_MAX_FRAGS_PER_BUNDLE},
+{"library-norm-method", required_argument, 0, OPT_LIB_NORM_METHOD},
+{"output-format", required_argument, 0, OPT_OUTPUT_FORMAT},
+{0, 0, 0, 0} // terminator
+};
+
+void print_usage()
+{
+ fprintf(stderr, "cuffnorm v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
+ fprintf(stderr, "-----------------------------\n");
+
+ //NOTE: SPACES ONLY, bozo
+ fprintf(stderr, "Usage: cuffnorm [options] <transcripts.gtf> <sample1_expr.cxb> <sample2_expr.cxb> [... sampleN_expr.cxb]\n");
+ fprintf(stderr, " Supply replicate CXB files as comma separated lists for each condition: sample1_rep1.cxb,sample1_rep2.cxb,...sample1_repM.cxb\n");
+ fprintf(stderr, "General Options:\n");
+ fprintf(stderr, " -o/--output-dir write all output files to this directory [ default: ./ ]\n");
+ fprintf(stderr, " -L/--labels comma-separated list of condition labels\n");
+ fprintf(stderr, " --norm-standards-file Housekeeping/spike genes to normalize libraries [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
+#if ENABLE_THREADS
+ fprintf(stderr, " -p/--num-threads number of threads used during quantification [ default: 1 ]\n");
+#endif
+ fprintf(stderr, " --library-type Library prep used for input reads [ default: below ]\n");
+ fprintf(stderr, " --library-norm-method Method used to normalize library sizes [ default: below ]\n");
+ fprintf(stderr, " --output-format Format for output tables [ default: below ]\n");
+
+ fprintf(stderr, "\nAdvanced Options:\n");
+ fprintf(stderr, " --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]\n");
+ fprintf(stderr, " --total-hits-norm count all hits for normalization [ default: FALSE ]\n");
+ fprintf(stderr, " -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]\n");
+ fprintf(stderr, " -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]\n");
+ fprintf(stderr, " --seed value of random number generator seed [ default: 0 ]\n");
+ fprintf(stderr, " --no-update-check do not contact server to check for update availability[ default: FALSE ]\n");
+ print_library_table();
+ print_lib_norm_method_table();
+ print_output_format_table();
+}
+
+int parse_options(int argc, char** argv)
+{
+ int option_index = 0;
+ int next_option;
+ string sample_label_list;
+ string dispersion_method_str;
+ string lib_norm_method_str;
+ string output_format_str;
+
+ do {
+ next_option = getopt_long_only(argc, argv, short_options, long_options, &option_index);
+ if (next_option == -1) /* Done with options. */
+ break;
+ switch (next_option) {
+ case 0:
+ /* If this option set a flag, do nothing else now. */
+ if (long_options[option_index].flag != 0)
+ break;
+ break;
+
+ case 'p':
+ num_threads = (uint32_t)parseInt(1, "-p/--num-threads arg must be at least 1", print_usage);
+ break;
+ case 'F':
+ min_isoform_fraction = parseFloat(0, 1.0, "-F/--min-isoform-fraction must be between 0 and 1.0", print_usage);
+ break;
+ case 'L':
+ sample_label_list = optarg;
+ break;
+
+
+ case OPT_NORM_STANDARDS_FILE:
+ {
+ norm_standards_filename = optarg;
+ break;
+ }
+ case OPT_USE_SAMPLE_SHEET:
+ {
+ use_sample_sheet = true;
+ break;
+ }
+ case 'v':
+ {
+ if (cuff_quiet)
+ {
+ fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting verbose only.\n");
+ }
+ cuff_quiet = false;
+ cuff_verbose = true;
+ break;
+ }
+ case 'q':
+ {
+ if (cuff_verbose)
+ {
+ fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting quiet only.\n");
+ }
+ cuff_verbose = false;
+ cuff_quiet = true;
+ break;
+ }
+ case 'o':
+ {
+ output_dir = optarg;
+ break;
+ }
+
+
+ case OPT_LIBRARY_TYPE:
+ {
+ library_type = optarg;
+ break;
+ }
+
+ case OPT_NO_UPDATE_CHECK:
+ {
+ no_update_check = true;
+ break;
+ }
+ case OPT_RANDOM_SEED:
+ {
+ random_seed = parseInt(0, "--seed must be at least 0", print_usage);
+ break;
+ }
+ case OPT_USE_COMPAT_MASS:
+ {
+ use_compat_mass = true;
+ break;
+ }
+ case OPT_USE_TOTAL_MASS:
+ {
+ use_total_mass = true;
+ break;
+ }
+ case OPT_LIB_NORM_METHOD:
+ {
+ lib_norm_method_str = optarg;
+ break;
+ }
+ case OPT_OUTPUT_FORMAT:
+ {
+ output_format_str = optarg;
+ break;
+ }
+ default:
+ print_usage();
+ return 1;
+ }
+ } while(next_option != -1);
+
+ if (library_type != "")
+ {
+ map<string, ReadGroupProperties>::iterator lib_itr =
+ library_type_table.find(library_type);
+ if (lib_itr == library_type_table.end())
+ {
+ fprintf(stderr, "Error: Library type %s not supported\n", library_type.c_str());
+ exit(1);
+ }
+ else
+ {
+ if (library_type == "transfrags")
+ {
+ allow_junk_filtering = false;
+ }
+ global_read_properties = &lib_itr->second;
+ }
+ }
+ else
+ {
+
+ }
+
+ // Set the count dispersion method to use
+ if (dispersion_method_str == "")
+ {
+ dispersion_method_str = default_dispersion_method;
+ }
+
+ map<string, DispersionMethod>::iterator disp_itr =
+ dispersion_method_table.find(dispersion_method_str);
+ if (disp_itr == dispersion_method_table.end())
+ {
+ fprintf(stderr, "Error: Dispersion method %s not supported\n", dispersion_method_str.c_str());
+ exit(1);
+ }
+ else
+ {
+ dispersion_method = disp_itr->second;
+ }
+
+ // Set the library size normalization method to use
+ if (lib_norm_method_str == "")
+ {
+ lib_norm_method_str = default_lib_norm_method;
+ }
+
+ map<string, LibNormalizationMethod>::iterator lib_norm_itr =
+ lib_norm_method_table.find(lib_norm_method_str);
+ if (lib_norm_itr == lib_norm_method_table.end())
+ {
+ fprintf(stderr, "Error: Dispersion method %s not supported\n", lib_norm_method_str.c_str());
+ exit(1);
+ }
+ else
+ {
+ lib_norm_method = lib_norm_itr->second;
+ }
+
+ // Set the output table format to use
+ if (output_format_str == "")
+ {
+ output_format_str = default_output_format;
+ }
+
+ map<string, OutputFormat>::iterator output_itr =
+ output_format_table.find(output_format_str);
+ if (output_itr == output_format_table.end())
+ {
+ fprintf(stderr, "Error: Output format %s not supported\n", output_format_str.c_str());
+ exit(1);
+ }
+ else
+ {
+ output_format = output_itr->second;
+ }
+
+ if (use_total_mass && use_compat_mass)
+ {
+ fprintf (stderr, "Error: please supply only one of --compatibile-hits-norm and --total-hits-norm\n");
+ exit(1);
+ }
+
+ tokenize(sample_label_list, ",", sample_labels);
+
+ allow_junk_filtering = false;
+
+ return 0;
+}
+
+void print_FPKM_tracking(FILE* fout,
+ const FPKMTrackingTable& tracking)
+{
+ fprintf(fout,"tracking_id\tclass_code\tnearest_ref_id\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage");
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (first_itr != tracking.end())
+ {
+ const FPKMTracking& track = first_itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ fprintf(fout, "\t%s_FPKM\t%s_conf_lo\t%s_conf_hi\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
+ }
+ }
+ fprintf(fout, "\n");
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ const FPKMTracking& track = itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+
+ AbundanceStatus status = NUMERIC_OK;
+ BOOST_FOREACH (const FPKMContext& c, fpkms)
+ {
+ if (c.status == NUMERIC_FAIL)
+ status = NUMERIC_FAIL;
+ }
+
+ string all_gene_ids = cat_strings(track.gene_ids);
+ if (all_gene_ids == "")
+ all_gene_ids = "-";
+
+ string all_gene_names = cat_strings(track.gene_names);
+ if (all_gene_names == "")
+ all_gene_names = "-";
+
+ string all_tss_ids = cat_strings(track.tss_ids);
+ if (all_tss_ids == "")
+ all_tss_ids = "-";
+
+ char length_buff[33] = "-";
+ if (track.length)
+ sprintf(length_buff, "%d", track.length);
+
+ fprintf(fout, "%s\t%c\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
+ description.c_str(),
+ track.classcode ? track.classcode : '-',
+ track.ref_match.c_str(),
+ all_gene_ids.c_str(),
+ all_gene_names.c_str(),
+ all_tss_ids.c_str(),
+ track.locus_tag.c_str(),
+ length_buff,
+ "-");
+
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ double fpkm = fpkms[i].FPKM;
+ //double std_dev = sqrt(fpkms[i].FPKM_variance);
+ double fpkm_conf_hi = fpkms[i].FPKM_conf_hi;
+ double fpkm_conf_lo = fpkms[i].FPKM_conf_lo;
+ const char* status_str = "OK";
+
+ if (fpkms[i].status == NUMERIC_OK)
+ {
+ status_str = "OK";
+ }
+ else if (fpkms[i].status == NUMERIC_FAIL)
+ {
+ status_str = "FAIL";
+ }
+ else if (fpkms[i].status == NUMERIC_LOW_DATA)
+ {
+ status_str = "LOWDATA";
+ }
+ else if (fpkms[i].status == NUMERIC_HI_DATA)
+ {
+ status_str = "HIDATA";
+ }
+ else
+ {
+ assert(false);
+ }
+
+ fprintf(fout, "\t%lg\t%lg\t%lg\t%s", fpkm, fpkm_conf_lo, fpkm_conf_hi, status_str);
+ }
+
+ fprintf(fout, "\n");
+ }
+}
+
+void print_count_tracking(FILE* fout,
+ const FPKMTrackingTable& tracking)
+{
+ fprintf(fout,"tracking_id");
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (first_itr != tracking.end())
+ {
+ const FPKMTracking& track = first_itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ fprintf(fout, "\t%s_count\t%s_count_variance\t%s_count_uncertainty_var\t%s_count_dispersion_var\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
+ }
+ }
+ fprintf(fout, "\n");
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ const FPKMTracking& track = itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+
+ AbundanceStatus status = NUMERIC_OK;
+ BOOST_FOREACH (const FPKMContext& c, fpkms)
+ {
+ if (c.status == NUMERIC_FAIL)
+ status = NUMERIC_FAIL;
+ }
+
+ fprintf(fout, "%s",
+ description.c_str());
+
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ const char* status_str = "OK";
+
+ if (fpkms[i].status == NUMERIC_OK)
+ {
+ status_str = "OK";
+ }
+ else if (fpkms[i].status == NUMERIC_FAIL)
+ {
+ status_str = "FAIL";
+ }
+ else if (fpkms[i].status == NUMERIC_LOW_DATA)
+ {
+ status_str = "LOWDATA";
+ }
+ else if (fpkms[i].status == NUMERIC_HI_DATA)
+ {
+ status_str = "HIDATA";
+ }
+ else
+ {
+ assert(false);
+ }
+
+ double external_counts = fpkms[i].count_mean;
+ double external_count_var = fpkms[i].count_var;
+ double uncertainty_var = fpkms[i].count_uncertainty_var;
+ double dispersion_var = fpkms[i].count_dispersion_var;
+ fprintf(fout, "\t%lg\t%lg\t%lg\t%lg\t%s", external_counts, external_count_var, uncertainty_var, dispersion_var, status_str);
+ }
+
+ fprintf(fout, "\n");
+ }
+}
+
+void print_FPKM_simple_table(FILE* fout,
+ const FPKMTrackingTable& tracking)
+{
+ fprintf(fout,"tracking_id");
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (first_itr != tracking.end())
+ {
+ const FPKMTracking& track = first_itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
+ ++j)
+ {
+ fprintf(fout, "\t%s_%d", sample_labels[i].c_str(), j);
+ }
+ }
+ }
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ fprintf(fout, "\n%s\t", description.c_str());
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (itr != tracking.end())
+ {
+ const FPKMTracking& track = itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
+ ++j)
+ {
+ double FPKM = fpkms[i].tracking_info_per_rep[j].fpkm;
+ fprintf(fout, "\t%lg", FPKM);
+ }
+ }
+ }
+ }
+}
+
+void print_count_simple_table(FILE* fout,
+ const FPKMTrackingTable& tracking)
+{
+ fprintf(fout,"tracking_id");
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (first_itr != tracking.end())
+ {
+ const FPKMTracking& track = first_itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
+ ++j)
+ {
+ fprintf(fout, "\t%s_%d", sample_labels[i].c_str(), j);
+ }
+ }
+ }
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ fprintf(fout, "\n%s\t", description.c_str());
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (itr != tracking.end())
+ {
+ const FPKMTracking& track = itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
+ ++j)
+ {
+ double count = fpkms[i].tracking_info_per_rep[j].count;
+ fprintf(fout, "\t%lg", count);
+ }
+ }
+ }
+ }
+}
+
+
+
+void print_read_group_tracking(FILE* fout,
+ const FPKMTrackingTable& tracking)
+{
+ fprintf(fout,"tracking_id\tcondition\treplicate\traw_frags\tinternal_scaled_frags\texternal_scaled_frags\tFPKM\teffective_length\tstatus");
+
+ fprintf(fout, "\n");
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ const FPKMTracking& track = itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
+ ++j)
+ {
+ double FPKM = fpkms[i].tracking_info_per_rep[j].fpkm;
+ double internal_count = fpkms[i].tracking_info_per_rep[j].count;
+ double external_count = internal_count / fpkms[i].tracking_info_per_rep[j].rg_props->external_scale_factor();
+ double raw_count = internal_count * fpkms[i].tracking_info_per_rep[j].rg_props->internal_scale_factor();
+ const string& condition_name = fpkms[i].tracking_info_per_rep[j].rg_props->condition_name();
+ AbundanceStatus status = fpkms[i].tracking_info_per_rep[j].status;
+
+ int rep_num = fpkms[i].tracking_info_per_rep[j].rg_props->replicate_num();
+
+ const char* status_str = "OK";
+
+ if (status == NUMERIC_OK)
+ {
+ status_str = "OK";
+ }
+ else if (status == NUMERIC_FAIL)
+ {
+ status_str = "FAIL";
+ }
+ else if (status == NUMERIC_LOW_DATA)
+ {
+ status_str = "LOWDATA";
+ }
+ else if (status == NUMERIC_HI_DATA)
+ {
+ status_str = "HIDATA";
+ }
+ else
+ {
+ assert(false);
+ }
+
+ fprintf(fout, "%s\t%s\t%d\t%lg\t%lg\t%lg\t%lg\t%s\t%s\n",
+ description.c_str(),
+ condition_name.c_str(),
+ rep_num,
+ raw_count,
+ internal_count,
+ external_count,
+ FPKM,
+ "-",
+ status_str);
+ }
+ }
+ }
+}
+
+void print_read_group_cuffdiff_info(FILE* fout,
+ const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
+{
+ fprintf(fout, "file\tcondition\treplicate_num\ttotal_mass\tnorm_mass\tinternal_scale\texternal_scale\n");
+ for (size_t i = 0; i < all_read_groups.size(); ++i)
+ {
+ boost::shared_ptr<ReadGroupProperties const> rg_props = all_read_groups[i];
+ fprintf(fout, "%s\t%s\t%d\t%Lg\t%Lg\t%lg\t%lg\n",
+ rg_props->file_path().c_str(),
+ rg_props->condition_name().c_str(),
+ rg_props->replicate_num(),
+ rg_props->total_map_mass(),
+ rg_props->normalized_map_mass(),
+ rg_props->internal_scale_factor(),
+ rg_props->external_scale_factor());
+
+ }
+}
+
+void print_read_group_simple_table_info(FILE* fout,
+ const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
+{
+ //fprintf(fout, "file\tcondition\treplicate_num\ttotal_mass\tnorm_mass\tinternal_scale\texternal_scale\n");
+ fprintf(fout, "sample_id\tfile\ttotal_mass\tinternal_scale\texternal_scale\n");
+ for (size_t i = 0; i < all_read_groups.size(); ++i)
+ {
+ boost::shared_ptr<ReadGroupProperties const> rg_props = all_read_groups[i];
+ fprintf(fout, "%s_%d\t%s\t%Lg\t%lg\t%lg\n",
+ rg_props->condition_name().c_str(),
+ rg_props->replicate_num(),
+ rg_props->file_path().c_str(),
+ rg_props->total_map_mass(),
+ rg_props->internal_scale_factor(),
+ rg_props->external_scale_factor());
+
+ }
+}
+
+void print_feature_attr_simple_table(FILE* fout,
+ const FPKMTrackingTable& tracking)
+{
+ fprintf(fout,"tracking_id\tclass_code\tnearest_ref_id\tgene_id\tgene_short_name\ttss_id\tlocus\tlength");
+
+ fprintf(fout, "\n");
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ const string& locus_tag = itr->second.locus_tag;
+ const string& ref_match = itr->second.ref_match;
+ int length = itr->second.length;
+ char length_buff[33] = "-";
+ if (length)
+ sprintf(length_buff, "%d", length);
+ const set<string>& gene_names = itr->second.gene_names;
+ const set<string>& gene_ids = itr->second.gene_ids;
+ const set<string>& tss_ids = itr->second.tss_ids;
+ char class_code = itr->second.classcode ? itr->second.classcode : '-';
+ string all_gene_names = cat_strings(gene_names);
+ if (all_gene_names == "")
+ all_gene_names = "-";
+ string all_gene_ids = cat_strings(gene_ids);
+ if (all_gene_ids == "")
+ all_gene_ids = "-";
+ string all_tss_ids = cat_strings(tss_ids);
+ if (all_tss_ids == "")
+ all_tss_ids = "-";
+ fprintf(fout, "%s\t%c\t%s\t%s\t%s\t%s\t%s\t%s\n",
+ description.c_str(),
+ class_code,
+ ref_match.c_str(),
+ all_gene_ids.c_str(),
+ all_gene_names.c_str(),
+ all_tss_ids.c_str(),
+ locus_tag.c_str(),
+ length_buff);
+ }
+}
+
+void print_run_info(FILE* fout)
+{
+ fprintf(fout, "param\tvalue\n");
+ fprintf(fout, "cmd_line\t%s\n", cmd_str.c_str());
+ fprintf(fout, "version\t%s\n", PACKAGE_VERSION);
+ fprintf(fout, "SVN_revision\t%s\n",SVN_REVISION);
+ fprintf(fout, "boost_version\t%d\n", BOOST_VERSION);
+}
+
+#if ENABLE_THREADS
+boost::mutex inspect_lock;
+#endif
+
+void inspect_map_worker(ReplicatedBundleFactory& fac,
+ int& tmp_min_frag_len,
+ int& tmp_max_frag_len)
+{
+#if ENABLE_THREADS
+ boost::this_thread::at_thread_exit(decr_pool_count);
+#endif
+
+ int min_f = std::numeric_limits<int>::max();
+ int max_f = 0;
+
+ fac.inspect_replicate_maps(min_f, max_f);
+
+#if ENABLE_THREADS
+ inspect_lock.lock();
+#endif
+ tmp_min_frag_len = min(min_f, tmp_min_frag_len);
+ tmp_max_frag_len = max(max_f, tmp_max_frag_len);
+#if ENABLE_THREADS
+ inspect_lock.unlock();
+#endif
+}
+
+boost::shared_ptr<TestLauncher> test_launcher;
+
+bool quantitate_next_locus(const RefSequenceTable& rt,
+ vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
+ boost::shared_ptr<TestLauncher> launcher)
+{
+ for (size_t i = 0; i < bundle_factories.size(); ++i)
+ {
+ boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances);
+
+#if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads < locus_num_threads)
+ {
+ break;
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+
+ }
+
+ locus_curr_threads++;
+ locus_thread_pool_lock.unlock();
+
+ boost::shared_ptr<HitBundle> pBundle = boost::shared_ptr<HitBundle>(new HitBundle());
+ bool non_empty = bundle_factories[i]->next_bundle(*pBundle);
+
+ if (pBundle->compatible_mass() > 0)
+ {
+ thread quantitate(sample_worker,
+ non_empty,
+ pBundle,
+ boost::ref(rt),
+ boost::ref(*(bundle_factories[i])),
+ s_ab,
+ i,
+ launcher,
+ false);
+ }
+ else
+ {
+ sample_worker(non_empty,
+ pBundle,
+ boost::ref(rt),
+ boost::ref(*(bundle_factories[i])),
+ s_ab,
+ i,
+ launcher,
+ false);
+ locus_thread_pool_lock.lock();
+ locus_curr_threads--;
+ locus_thread_pool_lock.unlock();
+ }
+#else
+ HitBundle bundle;
+ bool non_empty = sample_factory.next_bundle(bundle);
+
+ sample_worker(non_emtpy,
+ pBundle,
+ boost::ref(rt),
+ boost::ref(*(bundle_factories[i])),
+ s_ab,
+ i,
+ launcher,
+ false);
+#endif
+ }
+ return true;
+}
+
+void parse_sample_sheet_file(FILE* sample_sheet_file,
+ vector<string>& sample_labels,
+ vector<string>& sam_hit_filename_lists)
+{
+
+ char pBuf[10 * 1024];
+ size_t non_blank_lines_read = 0;
+
+ sample_labels.clear();
+
+ map<string, vector<string> > sample_groups;
+
+ while (fgets(pBuf, 10*1024, sample_sheet_file))
+ {
+ if (strlen(pBuf) > 0)
+ {
+ char* nl = strchr(pBuf, '\n');
+ if (nl)
+ *nl = 0;
+
+ string pBufstr = pBuf;
+ string trimmed = boost::trim_copy(pBufstr);
+
+ if (trimmed.length() > 0 && trimmed[0] != '#')
+ {
+ non_blank_lines_read++;
+ vector<string> columns;
+ tokenize(trimmed, "\t", columns);
+
+ if (non_blank_lines_read == 1)
+ continue;
+
+ if (columns.size() < 2)
+ {
+ if (columns.size() > 0)
+ fprintf(stderr, "Malformed record in sample sheet: \n > %s\n", pBuf);
+ else
+ continue;
+ }
+
+ string sam_file = columns[0];
+ string sample_group = columns[1];
+
+ pair<map<string, vector<string> >::iterator, bool> inserted = sample_groups.insert(make_pair(sample_group, vector<string>()));
+ inserted.first->second.push_back(sam_file);
+ }
+ }
+ }
+
+ for (map<string, vector<string> >::iterator itr = sample_groups.begin();
+ itr != sample_groups.end(); ++itr)
+ {
+ sample_labels.push_back(itr->first);
+ string sam_list = boost::join(itr->second, ",");
+ sam_hit_filename_lists.push_back(sam_list);
+ }
+}
+
+void parse_norm_standards_file(FILE* norm_standards_file)
+{
+ char pBuf[10 * 1024];
+ size_t non_blank_lines_read = 0;
+
+ boost::shared_ptr<map<string, LibNormStandards> > norm_standards(new map<string, LibNormStandards>);
+
+ while (fgets(pBuf, 10*1024, norm_standards_file))
+ {
+ if (strlen(pBuf) > 0)
+ {
+ char* nl = strchr(pBuf, '\n');
+ if (nl)
+ *nl = 0;
+
+ string pBufstr = pBuf;
+ string trimmed = boost::trim_copy(pBufstr);
+
+ if (trimmed.length() > 0 && trimmed[0] != '#')
+ {
+ non_blank_lines_read++;
+ vector<string> columns;
+ tokenize(trimmed, "\t", columns);
+
+ if (non_blank_lines_read == 1)
+ continue;
+
+ if (columns.size() < 1) //
+ {
+ continue;
+ }
+
+ string gene_id = columns[0];
+ LibNormStandards L;
+ norm_standards->insert(make_pair(gene_id, L));
+ }
+ }
+ }
+ lib_norm_standards = norm_standards;
+}
+
+
+void print_variability_models(FILE* var_model_out, const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories)
+{
+
+ fprintf(var_model_out, "condition\tlocus\tcompatible_count_mean\tcompatible_count_var\ttotal_count_mean\ttotal_count_var\tfitted_var\n");
+
+ for (size_t i = 0; i < factories.size(); ++i)
+ {
+ string factor_name = factories[i]->condition_name();
+ boost::shared_ptr<ReadGroupProperties> rg = factories[i]->factories()[0]->read_group_properties();
+ boost::shared_ptr<MassDispersionModel const> model = rg->mass_dispersion_model();
+// const vector<double>& means = model->scaled_compatible_mass_means();
+// const vector<double>& raw_vars = model->scaled_compatible_variances();
+
+ const vector<LocusCount>& common_scale_compatible_counts = rg->common_scale_compatible_counts();
+ for (size_t j = 0; j < common_scale_compatible_counts.size(); ++j)
+ {
+ string locus_desc = common_scale_compatible_counts[j].locus_desc;
+ pair<double, double> compat_mean_and_var = model->get_compatible_mean_and_var(locus_desc);
+ pair<double, double> total_mean_and_var = model->get_total_mean_and_var(locus_desc);
+// double total_compat_count = 0;
+// if (itr != locus_to_total_count_table.end())
+// total_compat_count = itr->second.count;
+
+
+ fprintf(var_model_out, "%s\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\n",
+ factor_name.c_str(),
+ locus_desc.c_str(),
+ compat_mean_and_var.first,
+ compat_mean_and_var.second,
+ total_mean_and_var.first,
+ total_mean_and_var.second,
+ model->scale_mass_variance(compat_mean_and_var.first));
+ }
+ }
+ fclose(var_model_out);
+
+}
+
+void write_output_cuffdiff_format(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups,
+ const Tracking& tracking,
+ Outfiles& outfiles)
+{
+ // FPKM tracking
+
+ FILE* fiso_fpkm_tracking = outfiles.isoform_fpkm_tracking_out;
+ fprintf(stderr, "Writing isoform-level FPKM tracking\n");
+ print_FPKM_tracking(fiso_fpkm_tracking,tracking.isoform_fpkm_tracking);
+
+ FILE* ftss_fpkm_tracking = outfiles.tss_group_fpkm_tracking_out;
+ fprintf(stderr, "Writing TSS group-level FPKM tracking\n");
+ print_FPKM_tracking(ftss_fpkm_tracking,tracking.tss_group_fpkm_tracking);
+
+ FILE* fgene_fpkm_tracking = outfiles.gene_fpkm_tracking_out;
+ fprintf(stderr, "Writing gene-level FPKM tracking\n");
+ print_FPKM_tracking(fgene_fpkm_tracking,tracking.gene_fpkm_tracking);
+
+ FILE* fcds_fpkm_tracking = outfiles.cds_fpkm_tracking_out;
+ fprintf(stderr, "Writing CDS-level FPKM tracking\n");
+ print_FPKM_tracking(fcds_fpkm_tracking,tracking.cds_fpkm_tracking);
+
+ // Count tracking
+
+ FILE* fiso_count_tracking = outfiles.isoform_count_tracking_out;
+ fprintf(stderr, "Writing isoform-level count tracking\n");
+ print_count_tracking(fiso_count_tracking,tracking.isoform_fpkm_tracking);
+
+ FILE* ftss_count_tracking = outfiles.tss_group_count_tracking_out;
+ fprintf(stderr, "Writing TSS group-level count tracking\n");
+ print_count_tracking(ftss_count_tracking,tracking.tss_group_fpkm_tracking);
+
+ FILE* fgene_count_tracking = outfiles.gene_count_tracking_out;
+ fprintf(stderr, "Writing gene-level count tracking\n");
+ print_count_tracking(fgene_count_tracking,tracking.gene_fpkm_tracking);
+
+ FILE* fcds_count_tracking = outfiles.cds_count_tracking_out;
+ fprintf(stderr, "Writing CDS-level count tracking\n");
+ print_count_tracking(fcds_count_tracking,tracking.cds_fpkm_tracking);
+
+ // Read group tracking
+
+ FILE* fiso_rep_tracking = outfiles.isoform_rep_tracking_out;
+ fprintf(stderr, "Writing isoform-level read group tracking\n");
+ print_read_group_tracking(fiso_rep_tracking,tracking.isoform_fpkm_tracking);
+
+ FILE* ftss_rep_tracking = outfiles.tss_group_rep_tracking_out;
+ fprintf(stderr, "Writing TSS group-level read group tracking\n");
+ print_read_group_tracking(ftss_rep_tracking,tracking.tss_group_fpkm_tracking);
+
+ FILE* fgene_rep_tracking = outfiles.gene_rep_tracking_out;
+ fprintf(stderr, "Writing gene-level read group tracking\n");
+ print_read_group_tracking(fgene_rep_tracking,tracking.gene_fpkm_tracking);
+
+ FILE* fcds_rep_tracking = outfiles.cds_rep_tracking_out;
+ fprintf(stderr, "Writing CDS-level read group tracking\n");
+ print_read_group_tracking(fcds_rep_tracking,tracking.cds_fpkm_tracking);
+
+ FILE* fread_group_info = outfiles.read_group_info_out;
+ fprintf(stderr, "Writing read group info\n");
+ print_read_group_cuffdiff_info(fread_group_info,all_read_groups);
+
+ FILE* frun_info = outfiles.run_info_out;
+ fprintf(stderr, "Writing run info\n");
+ print_run_info(frun_info);
+
+}
+
+void write_output_simple_table_format(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups,
+ const Tracking& tracking,
+ Outfiles& outfiles)
+{
+ // FPKM tracking
+
+ FILE* fiso_fpkm_tracking = outfiles.isoform_fpkm_tracking_out;
+ fprintf(stderr, "Writing isoform-level FPKM tracking\n");
+ print_FPKM_simple_table(fiso_fpkm_tracking,tracking.isoform_fpkm_tracking);
+
+ FILE* ftss_fpkm_tracking = outfiles.tss_group_fpkm_tracking_out;
+ fprintf(stderr, "Writing TSS group-level FPKM tracking\n");
+ print_FPKM_simple_table(ftss_fpkm_tracking,tracking.tss_group_fpkm_tracking);
+
+ FILE* fgene_fpkm_tracking = outfiles.gene_fpkm_tracking_out;
+ fprintf(stderr, "Writing gene-level FPKM tracking\n");
+ print_FPKM_simple_table(fgene_fpkm_tracking,tracking.gene_fpkm_tracking);
+
+ FILE* fcds_fpkm_tracking = outfiles.cds_fpkm_tracking_out;
+ fprintf(stderr, "Writing CDS-level FPKM tracking\n");
+ print_FPKM_simple_table(fcds_fpkm_tracking,tracking.cds_fpkm_tracking);
+
+ // Count tracking
+
+ FILE* fiso_count_tracking = outfiles.isoform_count_tracking_out;
+ fprintf(stderr, "Writing isoform-level count tracking\n");
+ print_count_simple_table(fiso_count_tracking,tracking.isoform_fpkm_tracking);
+
+ FILE* ftss_count_tracking = outfiles.tss_group_count_tracking_out;
+ fprintf(stderr, "Writing TSS group-level count tracking\n");
+ print_count_simple_table(ftss_count_tracking,tracking.tss_group_fpkm_tracking);
+
+ FILE* fgene_count_tracking = outfiles.gene_count_tracking_out;
+ fprintf(stderr, "Writing gene-level count tracking\n");
+ print_count_simple_table(fgene_count_tracking,tracking.gene_fpkm_tracking);
+
+ FILE* fcds_count_tracking = outfiles.cds_count_tracking_out;
+ fprintf(stderr, "Writing CDS-level count tracking\n");
+ print_count_simple_table(fcds_count_tracking,tracking.cds_fpkm_tracking);
+
+ FILE* fiso_attr = outfiles.isoform_attr_out;
+ fprintf(stderr, "Writing isoform-level attributes\n");
+ print_feature_attr_simple_table(fiso_attr,tracking.isoform_fpkm_tracking);
+
+ FILE* ftss_attr = outfiles.tss_group_attr_out;
+ fprintf(stderr, "Writing TSS group-level attributes\n");
+ print_feature_attr_simple_table(ftss_attr,tracking.tss_group_fpkm_tracking);
+
+ FILE* fgene_attr = outfiles.gene_attr_out;
+ fprintf(stderr, "Writing gene-level attributes\n");
+ print_feature_attr_simple_table(fgene_attr,tracking.gene_fpkm_tracking);
+
+ FILE* fcds_attr = outfiles.cds_attr_out;
+ fprintf(stderr, "Writing CDS-level attributes\n");
+ print_feature_attr_simple_table(fcds_attr,tracking.cds_fpkm_tracking);
+
+ FILE* fread_group_info = outfiles.read_group_info_out;
+ fprintf(stderr, "Writing read group info\n");
+ print_read_group_simple_table_info(fread_group_info,all_read_groups);
+
+ FILE* frun_info = outfiles.run_info_out;
+ fprintf(stderr, "Writing run info\n");
+ print_run_info(frun_info);
+
+}
+
+void write_output(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups,
+ const Tracking& tracking,
+ Outfiles& outfiles)
+{
+ // Emit every result table in the layout named by output_format; unknown layouts are fatal.
+ const bool want_cuffdiff = (output_format == CUFFDIFF_OUTPUT_FMT);
+ const bool want_simple_table = (output_format == SIMPLE_TABLE_OUTPUT_FMT);
+ if (!want_cuffdiff && !want_simple_table)
+ {
+ fprintf(stderr, "Error: unrecognized output format!\n");
+ exit(1);
+ }
+ if (want_cuffdiff)
+ write_output_cuffdiff_format(all_read_groups, tracking, outfiles);
+ else
+ write_output_simple_table_format(all_read_groups, tracking, outfiles);
+}
+
+
+void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
+{
+ // Build one ReplicatedBundleFactory per entry of sam_hit_filename_lists, load the reference (and optional mask) transcripts, process every locus, then call write_output(). contrast_file is not read in this body.
+ ReadTable it;
+ RefSequenceTable rt(true, false);
+
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
+
+ vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
+ vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
+ // Entry i is a comma-separated list of replicate files, labelled sample_labels[i].
+ for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
+ {
+ vector<string> sam_hit_filenames;
+ tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
+
+ vector<boost::shared_ptr<BundleFactory> > replicate_factories;
+
+ string condition_name = sample_labels[i];
+
+ for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
+ {
+ boost::shared_ptr<HitFactory> hs;
+ boost::shared_ptr<BundleFactory> hf;
+ try
+ {
+ hs = boost::shared_ptr<HitFactory>(new PrecomputedExpressionHitFactory(sam_hit_filenames[j], it, rt));
+ hf = boost::shared_ptr<BundleFactory>(new PrecomputedExpressionBundleFactory(static_pointer_cast<PrecomputedExpressionHitFactory>(hs)));
+ }
+ // If the precomputed-expression readers throw archive_exception, fall back to BAM and then SAM.
+ catch(boost::archive::archive_exception & e)
+ {
+ try
+ {
+ hs = boost::shared_ptr<HitFactory>(new BAMHitFactory(sam_hit_filenames[j], it, rt));
+ }
+ catch (std::runtime_error& e)
+ {
+ try
+ {
+// fprintf(stderr, "File %s doesn't appear to be a valid BAM file, trying SAM...\n",
+// sam_hit_filenames[j].c_str());
+ hs = boost::shared_ptr<HitFactory>(new SAMHitFactory(sam_hit_filenames[j], it, rt));
+ }
+ catch (std::runtime_error& e)
+ {
+ fprintf(stderr, "Error: cannot open file %s for reading. Unrecognized file type\n",
+ sam_hit_filenames[j].c_str());
+ exit(1);
+ }
+ }
+ hf = boost::shared_ptr<BundleFactory>(new BundleFactory(hs, REF_DRIVEN));
+ }
+
+ // Seed the read-group properties from global_read_properties when it is set, otherwise from the hit factory.
+ boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
+
+ if (global_read_properties)
+ {
+ *rg_props = *global_read_properties;
+ }
+ else
+ {
+ *rg_props = hs->read_group_properties();
+ }
+ // checked_parameters is always taken from this file's hit factory, whichever branch ran above.
+ rg_props->checked_parameters(hs->read_group_properties().checked_parameters());
+ rg_props->condition_name(condition_name);
+ rg_props->replicate_num(j);
+ rg_props->file_path(sam_hit_filenames[j]);
+
+ all_read_groups.push_back(rg_props);
+
+ hf->read_group_properties(rg_props);
+
+ replicate_factories.push_back(hf);
+ //replicate_factories.back()->set_ref_rnas(ref_mRNAs);
+ }
+
+ bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
+ }
+ // Load the reference transcripts; if none were loaded, return without calling write_output().
+ boost::crc_32_type ref_gtf_crc_result;
+ ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
+ if (ref_mRNAs.empty())
+ return;
+
+ vector<boost::shared_ptr<Scaffold> > mask_rnas;
+ if (mask_gtf)
+ {
+ boost::crc_32_type mask_gtf_crc_result;
+ ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
+ }
+ // Give every condition's factory the same reference (and, when supplied, mask) transcripts.
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ {
+ fac->set_ref_rnas(ref_mRNAs);
+ if (mask_gtf)
+ fac->set_mask_rnas(mask_rnas);
+ }
+
+ if (norm_standards_file != NULL)
+ {
+ parse_norm_standards_file(norm_standards_file);
+ }
+
+ validate_cross_sample_parameters(all_read_groups);
+ // contrasts is never populated in this function; both TestLauncher instances below receive it empty.
+ vector<pair<size_t, size_t > > contrasts;
+
+#if ENABLE_THREADS
+ locus_num_threads = num_threads;
+#endif
+ // Fragment-length bounds handed by reference to every inspect_map_worker call.
+ int tmp_min_frag_len = numeric_limits<int>::max();
+ int tmp_max_frag_len = 0;
+
+ ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ {
+#if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads < locus_num_threads)
+ {
+ break;
+ }
+ // Pool is full: drop the lock and poll again after a short sleep.
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+ }
+ // The break above leaves the pool lock held: claim a slot, then release it.
+ locus_curr_threads++;
+ locus_thread_pool_lock.unlock();
+ // NOTE(review): no join() on this thread object; completion is tracked via locus_curr_threads, presumably decremented inside inspect_map_worker - confirm.
+ thread inspect(inspect_map_worker,
+ boost::ref(*fac),
+ boost::ref(tmp_min_frag_len),
+ boost::ref(tmp_max_frag_len));
+#else
+ inspect_map_worker(boost::ref(*fac),
+ boost::ref(tmp_min_frag_len),
+ boost::ref(tmp_max_frag_len));
+#endif
+ }
+
+ // wait for the workers to finish up before reporting everything.
+#if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads == 0)
+ {
+ locus_thread_pool_lock.unlock();
+ break;
+ }
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+ }
+#endif
+
+ normalize_counts(all_read_groups);
+ // NOTE: the two totals below are accumulated but not read again in this function.
+ long double total_norm_mass = 0.0;
+ long double total_mass = 0.0;
+ BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
+ {
+ total_norm_mass += rg_props->normalized_map_mass();
+ total_mass += rg_props->total_map_mass();
+ }
+
+ min_frag_len = tmp_min_frag_len;
+ max_frag_len = tmp_max_frag_len;
+
+ final_est_run = false;
+
+ double num_bundles = (double)bundle_factories[0]->num_bundles();
+
+ p_bar = ProgressBar("Calculating preliminary abundance estimates", num_bundles);
+
+ Tracking tracking;
+ // First launcher (NULL tests): only clear_tracking_data() is called on it before it is replaced further down.
+ test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, NULL, &tracking, &p_bar));
+
+ // Allow the multiread tables to do their thing...
+ BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
+ {
+ rg_props->multi_read_table()->valid_mass(true);
+ }
+
+ test_launcher->clear_tracking_data();
+
+ Tests tests;
+
+ int N = (int)sam_hit_filename_lists.size();
+ // Give each test table N rows; the loop below then sizes row i to i SampleDiffs entries (row 0 stays empty).
+ tests.isoform_de_tests = vector<vector<SampleDiffs> >(N);
+ tests.tss_group_de_tests = vector<vector<SampleDiffs> >(N);
+ tests.gene_de_tests = vector<vector<SampleDiffs> >(N);
+ tests.cds_de_tests = vector<vector<SampleDiffs> >(N);
+ tests.diff_splicing_tests = vector<vector<SampleDiffs> >(N);
+ tests.diff_promoter_tests = vector<vector<SampleDiffs> >(N);
+ tests.diff_cds_tests = vector<vector<SampleDiffs> >(N);
+
+ for (int i = 1; i < N; ++i)
+ {
+ tests.isoform_de_tests[i] = vector<SampleDiffs>(i);
+ tests.tss_group_de_tests[i] = vector<SampleDiffs>(i);
+ tests.gene_de_tests[i] = vector<SampleDiffs>(i);
+ tests.cds_de_tests[i] = vector<SampleDiffs>(i);
+ tests.diff_splicing_tests[i] = vector<SampleDiffs>(i);
+ tests.diff_promoter_tests[i] = vector<SampleDiffs>(i);
+ tests.diff_cds_tests[i] = vector<SampleDiffs>(i);
+ }
+
+ final_est_run = true;
+ p_bar = ProgressBar("Normalizing expression levels for locus", num_bundles);
+
+ test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, &tests, &tracking, &p_bar));
+ // Process one locus per iteration until no factory reports remaining bundles.
+ while (true)
+ {
+ //boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
+ quantitate_next_locus(rt, bundle_factories, test_launcher);
+ bool more_loci_remain = false;
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ {
+ if (rep_fac->bundles_remain())
+ {
+ more_loci_remain = true;
+ break;
+ }
+ }
+ if (!more_loci_remain)
+ {
+ // wait for the workers to finish up before doing the cross-sample testing.
+#if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads == 0)
+ {
+ locus_thread_pool_lock.unlock();
+ break;
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+
+ }
+#endif
+ break;
+ }
+ }
+
+ p_bar.complete();
+
+ write_output(all_read_groups, tracking, outfiles);
+}
+
+void open_outfiles_for_writing_cuffdiff_format(Outfiles& outfiles)
+{
+ // Opens (mode "w") every cuffdiff-layout output file under output_dir and stores the handles in outfiles; any fopen failure prints an error and exits with status 1.
+ static const int filename_buf_size = 2048;
+ // Paths are built with snprintf so an over-long output_dir is truncated instead of overrunning the fixed-size buffers.
+ char isoform_fpkm_tracking_name[filename_buf_size];
+ snprintf(isoform_fpkm_tracking_name, sizeof(isoform_fpkm_tracking_name), "%s/isoforms.fpkm_tracking", output_dir.c_str());
+ FILE* isoform_fpkm_out = fopen(isoform_fpkm_tracking_name, "w");
+ if (!isoform_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open isoform-level FPKM tracking file %s for writing\n",
+ isoform_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.isoform_fpkm_tracking_out = isoform_fpkm_out;
+
+ char tss_group_fpkm_tracking_name[filename_buf_size];
+ snprintf(tss_group_fpkm_tracking_name, sizeof(tss_group_fpkm_tracking_name), "%s/tss_groups.fpkm_tracking", output_dir.c_str());
+ FILE* tss_group_fpkm_out = fopen(tss_group_fpkm_tracking_name, "w");
+ if (!tss_group_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open TSS group-level FPKM tracking file %s for writing\n",
+ tss_group_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.tss_group_fpkm_tracking_out = tss_group_fpkm_out;
+
+ char cds_fpkm_tracking_name[filename_buf_size];
+ snprintf(cds_fpkm_tracking_name, sizeof(cds_fpkm_tracking_name), "%s/cds.fpkm_tracking", output_dir.c_str());
+ FILE* cds_fpkm_out = fopen(cds_fpkm_tracking_name, "w");
+ if (!cds_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open CDS level FPKM tracking file %s for writing\n",
+ cds_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.cds_fpkm_tracking_out = cds_fpkm_out;
+
+ char gene_fpkm_tracking_name[filename_buf_size];
+ snprintf(gene_fpkm_tracking_name, sizeof(gene_fpkm_tracking_name), "%s/genes.fpkm_tracking", output_dir.c_str());
+ FILE* gene_fpkm_out = fopen(gene_fpkm_tracking_name, "w");
+ if (!gene_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open gene-level FPKM tracking file %s for writing\n",
+ gene_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.gene_fpkm_tracking_out = gene_fpkm_out;
+
+ char isoform_count_tracking_name[filename_buf_size];
+ snprintf(isoform_count_tracking_name, sizeof(isoform_count_tracking_name), "%s/isoforms.count_tracking", output_dir.c_str());
+ FILE* isoform_count_out = fopen(isoform_count_tracking_name, "w");
+ if (!isoform_count_out)
+ {
+ fprintf(stderr, "Error: cannot open isoform-level count tracking file %s for writing\n",
+ isoform_count_tracking_name);
+ exit(1);
+ }
+ outfiles.isoform_count_tracking_out = isoform_count_out;
+
+ char tss_group_count_tracking_name[filename_buf_size];
+ snprintf(tss_group_count_tracking_name, sizeof(tss_group_count_tracking_name), "%s/tss_groups.count_tracking", output_dir.c_str());
+ FILE* tss_group_count_out = fopen(tss_group_count_tracking_name, "w");
+ if (!tss_group_count_out)
+ {
+ fprintf(stderr, "Error: cannot open TSS group-level count tracking file %s for writing\n",
+ tss_group_count_tracking_name);
+ exit(1);
+ }
+ outfiles.tss_group_count_tracking_out = tss_group_count_out;
+
+ char cds_count_tracking_name[filename_buf_size];
+ snprintf(cds_count_tracking_name, sizeof(cds_count_tracking_name), "%s/cds.count_tracking", output_dir.c_str());
+ FILE* cds_count_out = fopen(cds_count_tracking_name, "w");
+ if (!cds_count_out)
+ {
+ fprintf(stderr, "Error: cannot open CDS level count tracking file %s for writing\n",
+ cds_count_tracking_name);
+ exit(1);
+ }
+ outfiles.cds_count_tracking_out = cds_count_out;
+
+ char gene_count_tracking_name[filename_buf_size];
+ snprintf(gene_count_tracking_name, sizeof(gene_count_tracking_name), "%s/genes.count_tracking", output_dir.c_str());
+ FILE* gene_count_out = fopen(gene_count_tracking_name, "w");
+ if (!gene_count_out)
+ {
+ fprintf(stderr, "Error: cannot open gene-level count tracking file %s for writing\n",
+ gene_count_tracking_name);
+ exit(1);
+ }
+ outfiles.gene_count_tracking_out = gene_count_out;
+
+ char isoform_rep_tracking_name[filename_buf_size];
+ snprintf(isoform_rep_tracking_name, sizeof(isoform_rep_tracking_name), "%s/isoforms.read_group_tracking", output_dir.c_str());
+ FILE* isoform_rep_out = fopen(isoform_rep_tracking_name, "w");
+ if (!isoform_rep_out)
+ {
+ fprintf(stderr, "Error: cannot open isoform-level read group tracking file %s for writing\n",
+ isoform_rep_tracking_name);
+ exit(1);
+ }
+ outfiles.isoform_rep_tracking_out = isoform_rep_out;
+
+ char tss_group_rep_tracking_name[filename_buf_size];
+ snprintf(tss_group_rep_tracking_name, sizeof(tss_group_rep_tracking_name), "%s/tss_groups.read_group_tracking", output_dir.c_str());
+ FILE* tss_group_rep_out = fopen(tss_group_rep_tracking_name, "w");
+ if (!tss_group_rep_out)
+ {
+ fprintf(stderr, "Error: cannot open TSS group-level read group tracking file %s for writing\n",
+ tss_group_rep_tracking_name);
+ exit(1);
+ }
+ outfiles.tss_group_rep_tracking_out = tss_group_rep_out;
+
+ char cds_rep_tracking_name[filename_buf_size];
+ snprintf(cds_rep_tracking_name, sizeof(cds_rep_tracking_name), "%s/cds.read_group_tracking", output_dir.c_str());
+ FILE* cds_rep_out = fopen(cds_rep_tracking_name, "w");
+ if (!cds_rep_out)
+ {
+ fprintf(stderr, "Error: cannot open CDS level read group tracking file %s for writing\n",
+ cds_rep_tracking_name);
+ exit(1);
+ }
+ outfiles.cds_rep_tracking_out = cds_rep_out;
+
+ char gene_rep_tracking_name[filename_buf_size];
+ snprintf(gene_rep_tracking_name, sizeof(gene_rep_tracking_name), "%s/genes.read_group_tracking", output_dir.c_str());
+ FILE* gene_rep_out = fopen(gene_rep_tracking_name, "w");
+ if (!gene_rep_out)
+ {
+ fprintf(stderr, "Error: cannot open gene-level read group tracking file %s for writing\n",
+ gene_rep_tracking_name);
+ exit(1);
+ }
+ outfiles.gene_rep_tracking_out = gene_rep_out;
+
+ char read_group_info_name[filename_buf_size];
+ snprintf(read_group_info_name, sizeof(read_group_info_name), "%s/read_groups.info", output_dir.c_str());
+ FILE* read_group_out = fopen(read_group_info_name, "w");
+ if (!read_group_out)
+ {
+ fprintf(stderr, "Error: cannot open read group info file %s for writing\n",
+ read_group_info_name);
+ exit(1);
+ }
+ outfiles.read_group_info_out = read_group_out;
+
+ char run_info_name[filename_buf_size];
+ snprintf(run_info_name, sizeof(run_info_name), "%s/run.info", output_dir.c_str());
+ FILE* run_info_out = fopen(run_info_name, "w");
+ if (!run_info_out)
+ {
+ fprintf(stderr, "Error: cannot open run info file %s for writing\n",
+ run_info_name);
+ exit(1);
+ }
+ outfiles.run_info_out = run_info_out;
+
+}
+
+void open_outfiles_for_writing_simple_table_format(Outfiles& outfiles)
+{
+ static const int filename_buf_size = 2048;
+ // Opens (mode "w") every simple-table output file under output_dir and stores the handles in outfiles; fopen failure exits with status 1. snprintf bounds each path, so an over-long output_dir is truncated rather than overflowing.
+ char isoform_fpkm_tracking_name[filename_buf_size];
+ snprintf(isoform_fpkm_tracking_name, sizeof(isoform_fpkm_tracking_name), "%s/isoforms.fpkm_table", output_dir.c_str());
+ FILE* isoform_fpkm_out = fopen(isoform_fpkm_tracking_name, "w");
+ if (!isoform_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open isoform-level FPKM table %s for writing\n",
+ isoform_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.isoform_fpkm_tracking_out = isoform_fpkm_out;
+
+ char tss_group_fpkm_tracking_name[filename_buf_size];
+ snprintf(tss_group_fpkm_tracking_name, sizeof(tss_group_fpkm_tracking_name), "%s/tss_groups.fpkm_table", output_dir.c_str());
+ FILE* tss_group_fpkm_out = fopen(tss_group_fpkm_tracking_name, "w");
+ if (!tss_group_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open TSS group-level FPKM table %s for writing\n",
+ tss_group_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.tss_group_fpkm_tracking_out = tss_group_fpkm_out;
+
+ char cds_fpkm_tracking_name[filename_buf_size];
+ snprintf(cds_fpkm_tracking_name, sizeof(cds_fpkm_tracking_name), "%s/cds.fpkm_table", output_dir.c_str());
+ FILE* cds_fpkm_out = fopen(cds_fpkm_tracking_name, "w");
+ if (!cds_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open CDS level FPKM table %s for writing\n",
+ cds_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.cds_fpkm_tracking_out = cds_fpkm_out;
+
+ char gene_fpkm_tracking_name[filename_buf_size];
+ snprintf(gene_fpkm_tracking_name, sizeof(gene_fpkm_tracking_name), "%s/genes.fpkm_table", output_dir.c_str());
+ FILE* gene_fpkm_out = fopen(gene_fpkm_tracking_name, "w");
+ if (!gene_fpkm_out)
+ {
+ fprintf(stderr, "Error: cannot open gene-level FPKM table %s for writing\n",
+ gene_fpkm_tracking_name);
+ exit(1);
+ }
+ outfiles.gene_fpkm_tracking_out = gene_fpkm_out;
+
+ char isoform_count_tracking_name[filename_buf_size];
+ snprintf(isoform_count_tracking_name, sizeof(isoform_count_tracking_name), "%s/isoforms.count_table", output_dir.c_str());
+ FILE* isoform_count_out = fopen(isoform_count_tracking_name, "w");
+ if (!isoform_count_out)
+ {
+ fprintf(stderr, "Error: cannot open isoform-level count table %s for writing\n",
+ isoform_count_tracking_name);
+ exit(1);
+ }
+ outfiles.isoform_count_tracking_out = isoform_count_out;
+
+ char tss_group_count_tracking_name[filename_buf_size];
+ snprintf(tss_group_count_tracking_name, sizeof(tss_group_count_tracking_name), "%s/tss_groups.count_table", output_dir.c_str());
+ FILE* tss_group_count_out = fopen(tss_group_count_tracking_name, "w");
+ if (!tss_group_count_out)
+ {
+ fprintf(stderr, "Error: cannot open TSS group-level count table %s for writing\n",
+ tss_group_count_tracking_name);
+ exit(1);
+ }
+ outfiles.tss_group_count_tracking_out = tss_group_count_out;
+
+ char cds_count_tracking_name[filename_buf_size];
+ snprintf(cds_count_tracking_name, sizeof(cds_count_tracking_name), "%s/cds.count_table", output_dir.c_str());
+ FILE* cds_count_out = fopen(cds_count_tracking_name, "w");
+ if (!cds_count_out)
+ {
+ fprintf(stderr, "Error: cannot open CDS level count table %s for writing\n",
+ cds_count_tracking_name);
+ exit(1);
+ }
+ outfiles.cds_count_tracking_out = cds_count_out;
+
+ char gene_count_tracking_name[filename_buf_size];
+ snprintf(gene_count_tracking_name, sizeof(gene_count_tracking_name), "%s/genes.count_table", output_dir.c_str());
+ FILE* gene_count_out = fopen(gene_count_tracking_name, "w");
+ if (!gene_count_out)
+ {
+ fprintf(stderr, "Error: cannot open gene-level count table %s for writing\n",
+ gene_count_tracking_name);
+ exit(1);
+ }
+ outfiles.gene_count_tracking_out = gene_count_out;
+
+ char isoform_attr_name[filename_buf_size];
+ snprintf(isoform_attr_name, sizeof(isoform_attr_name), "%s/isoforms.attr_table", output_dir.c_str());
+ FILE* isoform_attr_out = fopen(isoform_attr_name, "w");
+ if (!isoform_attr_out)
+ {
+ fprintf(stderr, "Error: cannot open isoform-level attribute table %s for writing\n",
+ isoform_attr_name);
+ exit(1);
+ }
+ outfiles.isoform_attr_out = isoform_attr_out;
+
+ char tss_group_attr_name[filename_buf_size];
+ snprintf(tss_group_attr_name, sizeof(tss_group_attr_name), "%s/tss_groups.attr_table", output_dir.c_str());
+ FILE* tss_group_attr_out = fopen(tss_group_attr_name, "w");
+ if (!tss_group_attr_out)
+ {
+ fprintf(stderr, "Error: cannot open TSS group-level attribute table %s for writing\n",
+ tss_group_attr_name);
+ exit(1);
+ }
+ outfiles.tss_group_attr_out = tss_group_attr_out;
+
+ char cds_attr_name[filename_buf_size];
+ snprintf(cds_attr_name, sizeof(cds_attr_name), "%s/cds.attr_table", output_dir.c_str());
+ FILE* cds_attr_out = fopen(cds_attr_name, "w");
+ if (!cds_attr_out)
+ {
+ fprintf(stderr, "Error: cannot open CDS level attribute table %s for writing\n",
+ cds_attr_name);
+ exit(1);
+ }
+ outfiles.cds_attr_out = cds_attr_out;
+
+ char gene_attr_name[filename_buf_size];
+ snprintf(gene_attr_name, sizeof(gene_attr_name), "%s/genes.attr_table", output_dir.c_str());
+ FILE* gene_attr_out = fopen(gene_attr_name, "w");
+ if (!gene_attr_out)
+ {
+ fprintf(stderr, "Error: cannot open gene-level attribute table %s for writing\n",
+ gene_attr_name);
+ exit(1);
+ }
+ outfiles.gene_attr_out = gene_attr_out;
+
+ char read_group_info_name[filename_buf_size];
+ snprintf(read_group_info_name, sizeof(read_group_info_name), "%s/samples.table", output_dir.c_str());
+ FILE* read_group_out = fopen(read_group_info_name, "w");
+ if (!read_group_out)
+ {
+ fprintf(stderr, "Error: cannot open read group info file %s for writing\n",
+ read_group_info_name);
+ exit(1);
+ }
+ outfiles.read_group_info_out = read_group_out;
+
+ char run_info_name[filename_buf_size];
+ snprintf(run_info_name, sizeof(run_info_name), "%s/run.info", output_dir.c_str());
+ FILE* run_info_out = fopen(run_info_name, "w");
+ if (!run_info_out)
+ {
+ fprintf(stderr, "Error: cannot open run info file %s for writing\n",
+ run_info_name);
+ exit(1);
+ }
+ outfiles.run_info_out = run_info_out;
+
+}
+
+void open_outfiles_for_writing(Outfiles& outfiles)
+{
+ // Open every output stream for the layout named by the global output_format.
+ const bool cuffdiff_layout = (output_format == CUFFDIFF_OUTPUT_FMT);
+ const bool simple_table_layout = (output_format == SIMPLE_TABLE_OUTPUT_FMT);
+ if (!cuffdiff_layout && !simple_table_layout)
+ {
+ // No opener exists for any other value: report and abort.
+ fprintf(stderr, "Error: unrecognized output format!\n");
+ exit(1);
+ }
+ if (cuffdiff_layout)
+ open_outfiles_for_writing_cuffdiff_format(outfiles);
+ else
+ open_outfiles_for_writing_simple_table_format(outfiles);
+}
+
+int main(int argc, char** argv)
+{
+ // Entry point: parse the command line, open the input files and output tables, then hand everything to driver().
+ // Returns parse_options()'s value if non-zero, 1 when no positional argument is given, 0 otherwise; fatal errors exit(1).
+ for (int i = 0; i < argc; ++i)
+ cmd_str += string(argv[i]) + " ";
+
+ init_library_table();
+ init_dispersion_method_table();
+ init_lib_norm_method_table();
+ init_output_format_table();
+
+ min_isoform_fraction = 1e-5;
+
+ int parse_ret = parse_options(argc,argv);
+ if (parse_ret)
+ return parse_ret;
+ // Neither mass option was selected: fall back to use_compat_mass.
+ if (!use_total_mass && !use_compat_mass)
+ {
+ use_total_mass = false;
+ use_compat_mass = true;
+ }
+
+ if(optind >= argc)
+ {
+ print_usage();
+ return 1;
+ }
+
+ if (!no_update_check)
+ check_version(PACKAGE_VERSION);
+ // The first positional argument is the reference GTF; the remaining ones name the inputs.
+ string ref_gtf_filename = argv[optind++];
+ vector<string> sam_hit_filenames;
+
+ if (use_sample_sheet)
+ {
+ if (optind < argc)
+ {
+ string sample_sheet_filename = argv[optind++];
+ FILE* sample_sheet_file = NULL;
+ if (sample_sheet_filename != "")
+ {
+ sample_sheet_file = fopen(sample_sheet_filename.c_str(), "r");
+ if (!sample_sheet_file)
+ {
+ fprintf(stderr, "Error: cannot open sample sheet file %s for reading\n",
+ sample_sheet_filename.c_str());
+ exit(1);
+ }
+ }
+ parse_sample_sheet_file(sample_sheet_file, sample_labels, sam_hit_filenames);
+ }
+ else
+ {
+ fprintf(stderr, "Error: option --use-sample-sheet requires a single sample sheet filename instead of a list of SAM/BAM files\n");
+ exit(1);
+ }
+ }
+ else
+ {
+ while(optind < argc)
+ {
+ string sam_hits_file_name = argv[optind++];
+ sam_hit_filenames.push_back(sam_hits_file_name);
+ }
+ // No labels were supplied: generate q1, q2, ... (one per input list).
+ if (sample_labels.size() == 0)
+ {
+ for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
+ {
+ char buf[256];
+ snprintf(buf, sizeof(buf), "q%lu", (unsigned long)i);
+ sample_labels.push_back(buf);
+ }
+ }
+ }
+ // A single check is enough here: the body always exits.
+ if (sam_hit_filenames.size() < 2)
+ {
+ fprintf(stderr, "Error: cuffdiff requires at least 2 SAM files\n");
+ exit(1);
+ }
+ // NOTE(review): the message above names cuffdiff - confirm that is the intended program name for this tool.
+
+ if (sam_hit_filenames.size() != sample_labels.size())
+ {
+ fprintf(stderr, "Error: number of labels must match number of conditions\n");
+ exit(1);
+ }
+
+ if (random_seed == -1)
+ random_seed = time(NULL);
+
+ // seed the random number generator - we'll need it for the importance
+ // sampling during MAP estimation of the gammas
+ srand48(random_seed);
+
+ FILE* ref_gtf = NULL;
+ if (ref_gtf_filename != "")
+ {
+ ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
+ if (!ref_gtf)
+ {
+ fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
+ ref_gtf_filename.c_str());
+ exit(1);
+ }
+ }
+
+ FILE* mask_gtf = NULL;
+ if (mask_gtf_filename != "")
+ {
+ mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
+ if (!mask_gtf)
+ {
+ fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
+ mask_gtf_filename.c_str());
+ exit(1);
+ }
+ }
+
+
+ FILE* contrast_file = NULL;
+ if (contrast_filename != "")
+ {
+ contrast_file = fopen(contrast_filename.c_str(), "r");
+ if (!contrast_file)
+ {
+ fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
+ contrast_filename.c_str());
+ exit(1);
+ }
+ }
+
+ FILE* norm_standards_file = NULL;
+ if (norm_standards_filename != "")
+ {
+ norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
+ if (!norm_standards_file)
+ {
+ fprintf(stderr, "Error: cannot open norm standards file %s for reading\n",
+ norm_standards_filename.c_str());
+ exit(1);
+ }
+ }
+
+
+ // Note: we don't want the assembly filters interfering with calculations
+ // here
+
+ pre_mrna_fraction = 0.0;
+ olap_radius = 0;
+
+ Outfiles outfiles;
+ // Create the output directory if needed; an already-existing directory (EEXIST) is not an error.
+ if (output_dir != "")
+ {
+ int retcode = mkpath(output_dir.c_str(), 0777);
+ if (retcode == -1)
+ {
+ if (errno != EEXIST)
+ {
+ fprintf (stderr,
+ "Error: cannot create directory %s\n",
+ output_dir.c_str());
+ exit(1);
+ }
+ }
+ }
+
+ open_outfiles_for_writing(outfiles);
+
+ driver(ref_gtf, mask_gtf, contrast_file, norm_standards_file, sam_hit_filenames, outfiles);
+
+#if 0
+ if (emit_count_tables)
+ {
+ dump_locus_variance_info(output_dir + string("/locus_var.txt"));
+ }
+#endif
+
+ return 0;
+}
+
diff --git a/src/cuffquant.cpp b/src/cuffquant.cpp
new file mode 100644
index 0000000..7a52a87
--- /dev/null
+++ b/src/cuffquant.cpp
@@ -0,0 +1,1623 @@
+/*
+ * cuffquant.cpp
+ * cufflinks
+ *
+ * Created by Cole Trapnell on 10/21/09.
+ * Copyright 2009 Cole Trapnell. All rights reserved.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#else
+#define PACKAGE_VERSION "INTERNAL"
+#define SVN_REVISION "XXX"
+#endif
+
+
+#include <stdlib.h>
+#include <getopt.h>
+#include <string>
+#include <numeric>
+#include <cfloat>
+#include <iostream>
+#include <fstream>
+
+#include "common.h"
+#include "hits.h"
+#include "bundles.h"
+#include "abundances.h"
+#include "tokenize.h"
+#include "biascorrection.h"
+#include "update_check.h"
+
+#include <boost/thread.hpp>
+#include <boost/version.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/graph_traits.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "replicates.h"
+#include "tracking.h"
+
+// Need at least this many reads in a locus to do any testing on it
+
+vector<string> sample_labels;
+
+using namespace std;
+using namespace boost;
+
+// Short-option string handed to getopt_long_only(); a ':' after a letter means that option takes an argument.
+#if ENABLE_THREADS
+const char *short_options = "m:p:s:c:I:j:L:M:o:b:TNqvuF:C:";
+#else
+const char *short_options = "m:s:c:I:j:L:M:o:b:TNqvuF:C:";
+#endif
+// (The two variants above differ only in "p:", which is present when ENABLE_THREADS is set.)
+// Long-option table handed to getopt_long_only(); each row is {name, argument requirement, flag pointer, returned value}.
+// Every flag pointer here is 0, so a match yields the value in the last column.
+static struct option long_options[] = {
+{"frag-len-mean", required_argument, 0, 'm'},
+{"frag-len-std-dev", required_argument, 0, 's'},
+{"seed", required_argument, 0, OPT_RANDOM_SEED},
+{"mask-file", required_argument, 0, 'M'},
+{"output-dir", required_argument, 0, 'o'},
+{"verbose", no_argument, 0, 'v'},
+{"quiet", no_argument, 0, 'q'},
+{"frag-bias-correct", required_argument, 0, 'b'},
+{"multi-read-correct", no_argument, 0, 'u'},
+#if ENABLE_THREADS
+{"num-threads", required_argument, 0, 'p'},
+#endif
+{"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
+{"seed", required_argument, 0, OPT_RANDOM_SEED}, // duplicate of the "seed" row above
+{"no-collapse-cond-prob", no_argument, 0, OPT_COLLAPSE_COND_PROB},
+{"max-mle-iterations", required_argument, 0, OPT_MLE_MAX_ITER},
+{"min-mle-accuracy", required_argument, 0, OPT_MLE_MIN_ACC},
+{"bias-mode", required_argument, 0, OPT_BIAS_MODE},
+{"no-update-check", no_argument, 0, OPT_NO_UPDATE_CHECK},
+
+// Some options for testing different stats policies
+{"max-bundle-frags", required_argument, 0, OPT_MAX_FRAGS_PER_BUNDLE},
+{"max-frag-multihits", required_argument, 0, OPT_FRAG_MAX_MULTIHITS},
+{"no-effective-length-correction", no_argument, 0, OPT_NO_EFFECTIVE_LENGTH_CORRECTION},
+{"no-length-correction", no_argument, 0, OPT_NO_LENGTH_CORRECTION},
+{0, 0, 0, 0} // terminator
+};
+
+void print_usage()
+{
+ // Print the cuffquant banner, the usage synopsis, the option summary and the library-type table; the text here goes to stderr.
+ fprintf(stderr, "cuffquant v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
+ fprintf(stderr, "-----------------------------\n");
+ //NOTE: SPACES ONLY, bozo
+ fprintf(stderr, "Usage: cuffquant [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]\n");
+ fprintf(stderr, " Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam\n");
+ fprintf(stderr, "General Options:\n");
+ fprintf(stderr, " -o/--output-dir write all output files to this directory [ default: ./ ]\n");
+ fprintf(stderr, " -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]\n");
+ //fprintf(stderr, " --norm-standards-file Housekeeping/spike genes to normalize libraries [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
+ fprintf(stderr, " -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]\n");
+ fprintf(stderr, " -u/--multi-read-correct use 'rescue method' for multi-reads [ default: FALSE ]\n");
+#if ENABLE_THREADS
+ fprintf(stderr, " -p/--num-threads number of threads used during quantification [ default: 1 ]\n");
+#endif
+ fprintf(stderr, " --library-type Library prep used for input reads [ default: below ]\n");
+
+ fprintf(stderr, "\nAdvanced Options:\n");
+ fprintf(stderr, " -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]\n");
+ fprintf(stderr, " -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]\n");
+ fprintf(stderr, " -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]\n");
+ fprintf(stderr, " --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]\n");
+ fprintf(stderr, " -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]\n");
+ fprintf(stderr, " -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]\n");
+ fprintf(stderr, " --seed value of random number generator seed [ default: 0 ]\n");
+ fprintf(stderr, " --no-update-check do not contact server to check for update availability[ default: FALSE ]\n");
+ fprintf(stderr, " --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]\n");
+ fprintf(stderr, " --max-frag-multihits Maximum number of alignments allowed per fragment [ default: unlim ]\n");
+ fprintf(stderr, " --no-effective-length-correction No effective length correction [ default: FALSE ]\n");
+ fprintf(stderr, " --no-length-correction No length correction [ default: FALSE ]\n");
+ // NOTE(review): the four long options listed below have no row in this file's long_options table - confirm they are still accepted.
+ fprintf(stderr, "\nDebugging use only:\n");
+ fprintf(stderr, " --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]\n");
+ fprintf(stderr, " --no-read-pairs Break all read pairs [ default: FALSE ]\n");
+ fprintf(stderr, " --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]\n");
+ fprintf(stderr, " --no-scv-correction Disable SCV correction [ default: FALSE ]\n");
+ print_library_table();
+}
+
+ // Parse the command line into the program's global option variables.
+ // Returns 0 on success, or 1 after printing usage for an unhandled option code;
+ // unsupported bias/library/dispersion/normalization names and conflicting mass options call exit(1).
+ int parse_options(int argc, char** argv)
+ {
+ int option_index = 0;
+ int next_option;
+ string sample_label_list;
+ string dispersion_method_str; // never assigned in this function, so the default is always used below
+ string lib_norm_method_str;
+ do {
+ next_option = getopt_long_only(argc, argv, short_options, long_options, &option_index);
+ if (next_option == -1) /* Done with options. */
+ break;
+ switch (next_option) {
+ case 0: /* Flag-style long option (or any other 0-valued code): nothing more to do. */
+ break;
+
+ case 'm':
+ user_provided_fld = true;
+ def_frag_len_mean = (uint32_t)parseInt(0, "-m/--frag-len-mean arg must be at least 0", print_usage);
+ break;
+ case 's':
+ user_provided_fld = true;
+ def_frag_len_std_dev = (uint32_t)parseInt(0, "-s/--frag-len-std-dev arg must be at least 0", print_usage);
+ break;
+ case 'p':
+ num_threads = (uint32_t)parseInt(1, "-p/--num-threads arg must be at least 1", print_usage);
+ break;
+ case 'L':
+ sample_label_list = optarg;
+ break;
+ case OPT_NUM_IMP_SAMPLES:
+ num_importance_samples = parseInt(1, "--num-importance-samples must be at least 1", print_usage);
+ break;
+ case OPT_MLE_MAX_ITER:
+ max_mle_iterations = parseInt(1, "--max-mle-iterations must be at least 1", print_usage);
+ break;
+ case OPT_BIAS_MODE:
+ if (!strcmp(optarg, "site"))
+ bias_mode = SITE;
+ else if (!strcmp(optarg, "pos"))
+ bias_mode = POS;
+ else if (!strcmp(optarg, "pos_vlmm"))
+ bias_mode = POS_VLMM;
+ else if (!strcmp(optarg, "vlmm"))
+ bias_mode = VLMM;
+ else if (!strcmp(optarg, "pos_site"))
+ bias_mode = POS_SITE;
+ else
+ {
+ fprintf(stderr, "Unknown bias mode.\n");
+ exit(1);
+ }
+ break;
+ case 'M':
+ {
+ mask_gtf_filename = optarg;
+ break;
+ }
+ case OPT_NORM_STANDARDS_FILE:
+ {
+ norm_standards_filename = optarg;
+ break;
+ }
+ case 'v':
+ {
+ if (cuff_quiet)
+ {
+ fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting verbose only.\n");
+ }
+ cuff_quiet = false;
+ cuff_verbose = true;
+ break;
+ }
+ case 'q':
+ {
+ if (cuff_verbose)
+ {
+ fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting quiet only.\n");
+ }
+ cuff_verbose = false;
+ cuff_quiet = true;
+ break;
+ }
+ case 'o':
+ {
+ output_dir = optarg;
+ break;
+ }
+ case 'b':
+ {
+ fasta_dir = optarg;
+ corr_bias = true;
+ break;
+ }
+
+ case 'u':
+ {
+ corr_multi = true;
+ break;
+ }
+ case OPT_LIBRARY_TYPE:
+ {
+ library_type = optarg;
+ break;
+ }
+ case OPT_NO_UPDATE_CHECK:
+ {
+ no_update_check = true;
+ break;
+ }
+ case OPT_RANDOM_SEED:
+ {
+ random_seed = parseInt(0, "--seed must be at least 0", print_usage);
+ break;
+ }
+ case OPT_COLLAPSE_COND_PROB:
+ {
+ cond_prob_collapse = false;
+ break;
+ }
+ case OPT_USE_COMPAT_MASS:
+ {
+ use_compat_mass = true;
+ break;
+ }
+ case OPT_USE_TOTAL_MASS:
+ {
+ use_total_mass = true;
+ break;
+ }
+ case OPT_MAX_FRAGS_PER_BUNDLE:
+ {
+ max_frags_per_bundle = parseInt(0, "--max-bundle-frags must be at least 0", print_usage);
+ break;
+ }
+ case OPT_READ_SKIP_FRACTION:
+ {
+ read_skip_fraction = parseFloat(0, 1.0, "--read-skip-fraction must be between 0 and 1.0", print_usage);
+ break;
+ }
+ case OPT_NO_READ_PAIRS:
+ {
+ no_read_pairs = true;
+ break;
+ }
+ case OPT_TRIM_READ_LENGTH:
+ {
+ trim_read_length = parseInt(0, "--trim-read-length must be at least 1", print_usage); // NOTE(review): the bound passed is 0 but the message says 1 - confirm which is intended
+ break;
+ }
+ case OPT_MLE_MIN_ACC:
+ {
+ bootstrap_delta_gap = parseFloat(0, 10000000.0, "minimum MLE accuracy must be between 0 and 10000000.0", print_usage);
+ break;
+ }
+ case OPT_FRAG_MAX_MULTIHITS:
+ {
+ max_frag_multihits = parseInt(1, "--max-frag-multihits must be at least 1", print_usage);
+ break;
+ }
+ case OPT_NO_EFFECTIVE_LENGTH_CORRECTION:
+ {
+ no_effective_length_correction = true;
+ break;
+ }
+ case OPT_NO_LENGTH_CORRECTION:
+ {
+ no_length_correction = true;
+ break;
+ }
+ case OPT_LIB_NORM_METHOD:
+ {
+ lib_norm_method_str = optarg;
+ break;
+ }
+
+ default:
+ print_usage();
+ return 1;
+ }
+ } while(next_option != -1);
+
+ // Resolve the requested library type (if any) to its read-group properties.
+ if (library_type != "")
+ {
+ map<string, ReadGroupProperties>::iterator lib_itr =
+ library_type_table.find(library_type);
+ if (lib_itr == library_type_table.end())
+ {
+ fprintf(stderr, "Error: Library type %s not supported\n", library_type.c_str());
+ exit(1);
+ }
+ else
+ {
+ // (allow_junk_filtering is also cleared unconditionally before returning.)
+ if (library_type == "transfrags")
+ {
+ allow_junk_filtering = false;
+ }
+ // Points into library_type_table, so the table entry must outlive its users.
+ global_read_properties = &lib_itr->second;
+ }
+ }
+ // With no library type given, global_read_properties is left untouched.
+
+ // Set the count dispersion method to use
+ if (dispersion_method_str == "")
+ {
+ dispersion_method_str = default_dispersion_method;
+ }
+
+ map<string, DispersionMethod>::iterator disp_itr =
+ dispersion_method_table.find(dispersion_method_str);
+ if (disp_itr == dispersion_method_table.end())
+ {
+ fprintf(stderr, "Error: Dispersion method %s not supported\n", dispersion_method_str.c_str());
+ exit(1);
+ }
+ else
+ {
+ dispersion_method = disp_itr->second;
+ }
+
+ // Set the library size normalization method to use
+ if (lib_norm_method_str == "")
+ {
+ lib_norm_method_str = default_lib_norm_method;
+ }
+
+ map<string, LibNormalizationMethod>::iterator lib_norm_itr =
+ lib_norm_method_table.find(lib_norm_method_str);
+ if (lib_norm_itr == lib_norm_method_table.end())
+ {
+ fprintf(stderr, "Error: Library normalization method %s not supported\n", lib_norm_method_str.c_str());
+ exit(1);
+ }
+ else
+ {
+ lib_norm_method = lib_norm_itr->second;
+ }
+
+
+
+ if (use_total_mass && use_compat_mass)
+ {
+ fprintf (stderr, "Error: please supply only one of --compatible-hits-norm and --total-hits-norm\n");
+ exit(1);
+ }
+
+ tokenize(sample_label_list, ",", sample_labels);
+
+ allow_junk_filtering = false;
+
+ return 0;
+ }
+
+ struct Outfiles // bundle of output stream handles handed to driver()
+ {
+ FILE* isoform_fpkm_tracking_out; // FPKM tracking streams, one per feature level (per the field names)
+ FILE* tss_group_fpkm_tracking_out;
+ FILE* gene_fpkm_tracking_out;
+ FILE* cds_fpkm_tracking_out;
+
+ FILE* isoform_count_tracking_out; // count tracking streams, same four feature levels
+ FILE* tss_group_count_tracking_out;
+ FILE* gene_count_tracking_out;
+ FILE* cds_count_tracking_out;
+
+ FILE* run_info_out; // referenced only by commented-out code in driver()
+ FILE* read_group_info_out;
+ FILE* bias_out;
+ FILE* var_model_out;
+ };
+
+ // Write the FPKM tracking table for `tracking` to `fout` as tab-separated text:
+ // a header row (fixed columns plus FPKM/conf_lo/conf_hi/status per sample),
+ // then one row per tracked object. The number of sample columns is taken
+ // from the first entry's fpkm_series; labels come from sample_labels.
+ // The coverage column is always written as "-".
+ // NOTE(review): assumes sample_labels has at least fpkm_series.size() entries.
+ void print_FPKM_tracking(FILE* fout,
+ const FPKMTrackingTable& tracking)
+ {
+ fprintf(fout,"tracking_id\tclass_code\tnearest_ref_id\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage");
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (first_itr != tracking.end())
+ {
+ const FPKMTracking& track = first_itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ fprintf(fout, "\t%s_FPKM\t%s_conf_lo\t%s_conf_hi\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
+ }
+ }
+ fprintf(fout, "\n");
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ const FPKMTracking& track = itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+
+ // Empty id/name lists are written as "-".
+ string all_gene_ids = cat_strings(track.gene_ids);
+ if (all_gene_ids == "")
+ all_gene_ids = "-";
+
+ string all_gene_names = cat_strings(track.gene_names);
+ if (all_gene_names == "")
+ all_gene_names = "-";
+
+ string all_tss_ids = cat_strings(track.tss_ids);
+ if (all_tss_ids == "")
+ all_tss_ids = "-";
+
+ char length_buff[33] = "-";
+ if (track.length)
+ sprintf(length_buff, "%d", track.length);
+
+ fprintf(fout, "%s\t%c\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
+ description.c_str(),
+ track.classcode ? track.classcode : '-',
+ track.ref_match.c_str(),
+ all_gene_ids.c_str(),
+ all_gene_names.c_str(),
+ all_tss_ids.c_str(),
+ track.locus_tag.c_str(),
+ length_buff,
+ "-");
+
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ double fpkm = fpkms[i].FPKM;
+ //double std_dev = sqrt(fpkms[i].FPKM_variance);
+ double fpkm_conf_hi = fpkms[i].FPKM_conf_hi;
+ double fpkm_conf_lo = fpkms[i].FPKM_conf_lo;
+ const char* status_str = "OK";
+
+ if (fpkms[i].status == NUMERIC_OK)
+ {
+ status_str = "OK";
+ }
+ else if (fpkms[i].status == NUMERIC_FAIL)
+ {
+ status_str = "FAIL";
+ }
+ else if (fpkms[i].status == NUMERIC_LOW_DATA)
+ {
+ status_str = "LOWDATA";
+ }
+ else if (fpkms[i].status == NUMERIC_HI_DATA)
+ {
+ status_str = "HIDATA";
+ }
+ else
+ {
+ assert(false); // unknown status; with NDEBUG the row is written as "OK"
+ }
+
+ fprintf(fout, "\t%lg\t%lg\t%lg\t%s", fpkm, fpkm_conf_lo, fpkm_conf_hi, status_str);
+ }
+
+ fprintf(fout, "\n");
+ }
+ }
+
+ // Write the count tracking table for `tracking` to `fout` as tab-separated
+ // text: a header row ("tracking_id" plus count, count_variance,
+ // count_uncertainty_var, count_dispersion_var and status per sample), then
+ // one row per tracked object. The number of sample columns is taken from the
+ // first entry's fpkm_series; labels come from sample_labels.
+ // NOTE(review): assumes sample_labels has at least fpkm_series.size() entries.
+ void print_count_tracking(FILE* fout,
+ const FPKMTrackingTable& tracking)
+ {
+ fprintf(fout,"tracking_id");
+ FPKMTrackingTable::const_iterator first_itr = tracking.begin();
+ if (first_itr != tracking.end())
+ {
+ const FPKMTracking& track = first_itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ fprintf(fout, "\t%s_count\t%s_count_variance\t%s_count_uncertainty_var\t%s_count_dispersion_var\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
+ }
+ }
+ fprintf(fout, "\n");
+ for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
+ {
+ const string& description = itr->first;
+ const FPKMTracking& track = itr->second;
+ const vector<FPKMContext>& fpkms = track.fpkm_series;
+
+ // Row label first, then five columns per sample.
+ fprintf(fout, "%s",
+ description.c_str());
+
+ for (size_t i = 0; i < fpkms.size(); ++i)
+ {
+ const char* status_str = "OK";
+
+ if (fpkms[i].status == NUMERIC_OK)
+ {
+ status_str = "OK";
+ }
+ else if (fpkms[i].status == NUMERIC_FAIL)
+ {
+ status_str = "FAIL";
+ }
+ else if (fpkms[i].status == NUMERIC_LOW_DATA)
+ {
+ status_str = "LOWDATA";
+ }
+ else if (fpkms[i].status == NUMERIC_HI_DATA)
+ {
+ status_str = "HIDATA";
+ }
+ else
+ {
+ assert(false); // unknown status; with NDEBUG the row is written as "OK"
+ }
+
+ double external_counts = fpkms[i].count_mean;
+ double external_count_var = fpkms[i].count_var;
+ double uncertainty_var = fpkms[i].count_uncertainty_var;
+ double dispersion_var = fpkms[i].count_dispersion_var;
+ fprintf(fout, "\t%lg\t%lg\t%lg\t%lg\t%s", external_counts, external_count_var, uncertainty_var, dispersion_var, status_str);
+ }
+
+ fprintf(fout, "\n");
+ }
+ }
+
+ void print_run_info(FILE* fout) // writes a tab-separated (param, value) table to fout
+ {
+ fputs("param\tvalue\n", fout); // header row; no conversions needed
+ fprintf(fout, "cmd_line\t%s\n", cmd_str.c_str()); // command line as accumulated in cmd_str
+ fprintf(fout, "version\t%s\n", PACKAGE_VERSION); // build-time version macro
+ fprintf(fout, "SVN_revision\t%s\n",SVN_REVISION); // build-time revision macro
+ fprintf(fout, "boost_version\t%d\n", BOOST_VERSION); // Boost's integer version macro
+ }
+
+
+ #if ENABLE_THREADS
+ boost::mutex inspect_lock; // serializes the min/max merge in inspect_map_worker
+
+ boost::mutex _recorder_lock; // taken by the AbundanceRecorder member functions
+ boost::mutex locus_thread_pool_lock; // guards locus_curr_threads
+ int locus_curr_threads = 0; // worker threads currently counted as running
+ int locus_num_threads = 0; // upper bound on concurrently running workers
+
+ // Thread-exit hook: give this worker's slot back to the pool.
+ void decr_pool_count()
+ {
+ boost::mutex::scoped_lock pool_guard(locus_thread_pool_lock);
+ --locus_curr_threads;
+ }
+
+ #endif
+
+
+
+ // Inspect one condition's replicate maps and fold the observed fragment-length
+ // bounds into the caller's running minimum/maximum (shared across workers).
+ void inspect_map_worker(ReplicatedBundleFactory& fac,
+ int& tmp_min_frag_len,
+ int& tmp_max_frag_len)
+ {
+ #if ENABLE_THREADS
+ boost::this_thread::at_thread_exit(decr_pool_count);
+ #endif
+
+ int shortest_seen = std::numeric_limits<int>::max();
+ int longest_seen = 0;
+ fac.inspect_replicate_maps(shortest_seen, longest_seen);
+ #if ENABLE_THREADS
+ // Held until return: the shared bounds are updated by several workers.
+ boost::mutex::scoped_lock bounds_guard(inspect_lock);
+ #endif
+ if (shortest_seen < tmp_min_frag_len)
+ tmp_min_frag_len = shortest_seen;
+ if (longest_seen > tmp_max_frag_len)
+ tmp_max_frag_len = longest_seen;
+ }
+
+ void learn_bias_worker(boost::shared_ptr<BundleFactory> fac) // learns bias for one BundleFactory and attaches the learner to its read group
+ {
+ #if ENABLE_THREADS
+ boost::this_thread::at_thread_exit(decr_pool_count);
+ #endif
+ boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
+ boost::shared_ptr<BiasLearner> bl(new BiasLearner(rg_props->frag_len_dist())); // owned from construction: no leak if learn_bias throws
+ learn_bias(*fac, *bl, false);
+ rg_props->bias_learner(bl);
+ }
+
+ typedef map<int, vector<AbundanceGroup> > light_ab_group_tracking_table; // locus id -> one AbundanceGroup per sample
+
+ // Similar to TestLauncher, except this class records tracking data when abundance groups report in
+ struct AbundanceRecorder
+ {
+ private:
+ AbundanceRecorder(AbundanceRecorder& rhs) {} // private with an empty body: copying is unavailable to outside code
+
+ public:
+ AbundanceRecorder(int num_samples,
+ Tracking* tracking,
+ ProgressBar* p_bar)
+ :
+ _orig_workers(num_samples),
+ _tracking(tracking),
+ _p_bar(p_bar)
+ {
+ }
+
+ void operator()(); // declared only; no definition appears in this part of the file
+
+ void register_locus(int locus_id); // adds a pending entry for locus_id if none exists
+ void abundance_avail(int locus_id,
+ boost::shared_ptr<SampleAbundances> ab,
+ size_t factory_id); // stores one sample's result in the pending entry
+ void record_finished_loci(); // records and removes every fully reported locus
+ void record_tracking_data(int locus_id, vector<boost::shared_ptr<SampleAbundances> >& abundances); // call with the recorder lock held
+ bool all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances); // call with the recorder lock held
+ bool all_samples_reported_in(int locus_id); // declared only; no definition appears in this part of the file
+
+ void clear_tracking_data() { _tracking->clear(); } // dereferences _tracking without a NULL check
+
+ typedef list<pair<int, vector<boost::shared_ptr<SampleAbundances> > > > recorder_sample_table; // (locus id, per-sample results) in registration order
+
+ const light_ab_group_tracking_table& get_sample_table() const { return _ab_group_tracking_table; }
+
+ private:
+
+ recorder_sample_table::iterator find_locus(int locus_id); // linear search; end() when absent
+
+ int _orig_workers; // number of samples, as passed to the constructor
+
+ recorder_sample_table _samples; // loci still waiting for some sample to report
+
+ Tracking* _tracking; // not owned; may be NULL (checked in record_finished_loci)
+ ProgressBar* _p_bar; // not owned; may be NULL
+
+ light_ab_group_tracking_table _ab_group_tracking_table; // finished loci, filled by record_tracking_data
+ };
+
+
+ // Linear scan of the pending list; yields _samples.end() if locus_id is absent.
+ AbundanceRecorder::recorder_sample_table::iterator AbundanceRecorder::find_locus(int locus_id)
+ {
+ recorder_sample_table::iterator pos = _samples.begin();
+ while (pos != _samples.end() && pos->first != locus_id)
+ {
+ ++pos;
+ }
+ return pos;
+ }
+
+ // Ensure locus_id has a pending entry with one empty slot per sample.
+ void AbundanceRecorder::register_locus(int locus_id)
+ {
+ #if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_recorder_lock);
+ #endif
+
+ recorder_sample_table::iterator itr = find_locus(locus_id);
+ if (itr == _samples.end())
+ {
+ vector<boost::shared_ptr<SampleAbundances> >abs(_orig_workers);
+ _samples.push_back(make_pair(locus_id, abs));
+ }
+ }
+
+ void AbundanceRecorder::abundance_avail(int locus_id,
+ boost::shared_ptr<SampleAbundances> ab,
+ size_t factory_id) // stores sample factory_id's result for an already-registered locus
+ {
+ #if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_recorder_lock);
+ #endif
+ recorder_sample_table::iterator itr = find_locus(locus_id);
+ if (itr == _samples.end() || factory_id >= itr->second.size())
+ {
+ assert(false); // caller bug: unregistered locus or out-of-range sample index
+ return; // with NDEBUG the assert vanishes; never write through end() or past the slots
+ }
+ itr->second[factory_id] = ab;
+ }
+
+ // True once every sample slot holds a result (no null pointers remain).
+ // Caller must already hold the recorder lock; none is taken here.
+ bool AbundanceRecorder::all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances)
+ {
+ for (size_t slot = 0; slot < abundances.size(); ++slot)
+ {
+ if (!abundances[slot])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+
+ // Note: this routine should be called under lock - it doesn't
+ // acquire the lock itself. Stores a lightweight copy of every sample's transcript group under locus_id.
+ void AbundanceRecorder::record_tracking_data(int locus_id, vector<boost::shared_ptr<SampleAbundances> >& abundances)
+ {
+ assert (abundances.size() == _orig_workers); // one entry per sample is expected
+
+ // Just verify that all the loci from each factory match up.
+ for (size_t i = 1; i < abundances.size(); ++i) // assert-only checks: nothing is enforced when NDEBUG is defined
+ {
+ const SampleAbundances& curr = *(abundances[i]);
+ const SampleAbundances& prev = *(abundances[i-1]);
+
+ assert (curr.locus_tag == prev.locus_tag);
+
+ const AbundanceGroup& s1 = curr.transcripts;
+ const AbundanceGroup& s2 = prev.transcripts;
+
+ assert (s1.abundances().size() == s2.abundances().size());
+
+ for (size_t j = 0; j < s1.abundances().size(); ++j)
+ {
+ assert (s1.abundances()[j]->description() == s2.abundances()[j]->description());
+ }
+ }
+
+ vector<AbundanceGroup> lightweight_ab_groups; // one copy per sample, in sample order
+
+ // Add all the transcripts, CDS groups, TSS groups, and genes to their
+ // respective FPKM tracking table. Whether this is a time series or an
+ // all pairs comparison, we should be calculating and reporting FPKMs for
+ // all objects in all samples
+ for (size_t i = 0; i < abundances.size(); ++i)
+ {
+ const AbundanceGroup& ab_group = abundances[i]->transcripts; // NOTE(review): only the commented-out code below uses this reference
+ /*
+ //fprintf(stderr, "[%d] count = %lg\n",i, ab_group.num_fragments());
+ BOOST_FOREACH (boost::shared_ptr<Abundance> ab, ab_group.abundances())
+ {
+ add_to_tracking_table(i, *ab, _tracking->isoform_fpkm_tracking);
+ //assert (_tracking->isoform_fpkm_tracking.num_fragments_by_replicate().empty() == false);
+ }
+
+ BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->cds)
+ {
+ add_to_tracking_table(i, ab, _tracking->cds_fpkm_tracking);
+ }
+
+ BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->primary_transcripts)
+ {
+ add_to_tracking_table(i, ab, _tracking->tss_group_fpkm_tracking);
+ }
+
+ BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->genes)
+ {
+ add_to_tracking_table(i, ab, _tracking->gene_fpkm_tracking);
+ }
+ */
+
+ abundances[i]->transcripts.clear_non_serialized_data(); // must run before the copy so the stored group is the trimmed one
+ lightweight_ab_groups.push_back(abundances[i]->transcripts);
+ }
+
+ if (_ab_group_tracking_table.find(locus_id) != _ab_group_tracking_table.end())
+ {
+ fprintf (stderr, "Error: locus %d is already recorded!\n", locus_id); // reported only; the entry is still overwritten below
+ }
+ _ab_group_tracking_table[locus_id] = lightweight_ab_groups;
+ }
+
+ // Sweep the pending list: every locus whose samples have all reported is
+ // recorded into the lightweight tracking table and dropped from the list.
+ void AbundanceRecorder::record_finished_loci()
+ {
+ #if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_recorder_lock);
+ #endif
+
+ for (recorder_sample_table::iterator pending = _samples.begin();
+ pending != _samples.end(); /* advanced in the body */)
+ {
+ if (!all_samples_reported_in(pending->second))
+ {
+ // Still waiting on at least one sample; look at the next locus.
+ ++pending;
+ continue;
+ }
+
+ // Progress is reported only when both a tracking object and a
+ // progress bar were supplied.
+ if (_tracking != NULL && _p_bar)
+ {
+ const char* finished_tag = pending->second.front()->locus_tag.c_str();
+ verbose_msg("Estimating expression in locus [%s]\n", finished_tag);
+ _p_bar->update(finished_tag, 1);
+ }
+
+ record_tracking_data(pending->first, pending->second);
+
+ // The recorded entry is no longer needed in the pending list;
+ // list::erase hands back the element after the erased one.
+ pending = _samples.erase(pending);
+ }
+ }
+
+
+ boost::shared_ptr<AbundanceRecorder> abundance_recorder; // recorder currently in use; (re)created by driver()
+
+ // Pull the next bundle from one sample's factory, label and register its
+ // locus with the recorder, estimate abundances for it (unless the bundle is
+ // empty or skipped during a bias run), and report the result under
+ // factory_id. The locus label is "<ref name>:<left>-<right>"; it is built
+ // with snprintf so an unusually long reference name is truncated instead of
+ // overrunning the 2048-byte buffer.
+ void sample_worker(const RefSequenceTable& rt,
+ ReplicatedBundleFactory& sample_factory,
+ boost::shared_ptr<SampleAbundances> abundance,
+ size_t factory_id,
+ boost::shared_ptr<AbundanceRecorder> recorder,
+ bool calculate_variance)
+ {
+ #if ENABLE_THREADS
+ boost::this_thread::at_thread_exit(decr_pool_count);
+ #endif
+
+ boost::shared_ptr<HitBundle> bundle(new HitBundle);
+ bool non_empty = sample_factory.next_bundle(*bundle);
+
+ char bundle_label_buf[2048];
+ snprintf(bundle_label_buf, sizeof(bundle_label_buf),
+ "%s:%d-%d",
+ rt.get_name(bundle->ref_id()),
+ bundle->left(),
+ bundle->right());
+ string locus_tag = bundle_label_buf;
+
+ abundance->cluster_mass = bundle->mass();
+
+ recorder->register_locus(bundle->id());
+
+ abundance->locus_tag = locus_tag;
+
+ if (!non_empty || (bias_run && bundle->ref_scaffolds().size() != 1)) // Only learn on single isoforms
+ {
+ // Nothing to quantify for this sample at this locus (no bundle, or a
+ // bias-learning pass over a multi-isoform locus). Still report the
+ // untouched abundance so the locus can complete once every sample has
+ // checked in, then let the recorder sweep its finished loci.
+ recorder->abundance_avail(bundle->id(), abundance, factory_id);
+ recorder->record_finished_loci();
+ return;
+ }
+
+ bool perform_cds_analysis = false;
+ bool perform_tss_analysis = false;
+
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle->ref_scaffolds())
+ {
+ if (s->annotated_tss_id() != "")
+ {
+ perform_tss_analysis = final_est_run;
+ }
+ if (s->annotated_protein_id() != "")
+ {
+ perform_cds_analysis = final_est_run;
+ }
+ }
+
+ set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
+ for (size_t i = 0; i < sample_factory.factories().size(); ++i)
+ {
+ boost::shared_ptr<BundleFactory> bf = sample_factory.factories()[i];
+ rg_props.insert(bf->read_group_properties());
+ }
+
+ sample_abundance_worker(boost::cref(locus_tag),
+ boost::cref(rg_props),
+ boost::ref(*abundance),
+ bundle,
+ perform_cds_analysis,
+ perform_tss_analysis,
+ calculate_variance);
+
+ ///////////////////////////////////////////////
+
+
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, bundle->ref_scaffolds())
+ {
+ ref_scaff->clear_hits();
+ }
+
+ recorder->abundance_avail(bundle->id(), abundance, factory_id);
+ recorder->record_finished_loci();
+ }
+
+ bool quantitate_next_locus(const RefSequenceTable& rt, // runs sample_worker once per sample factory; always returns true
+ vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
+ boost::shared_ptr<AbundanceRecorder> recorder)
+ {
+ for (size_t i = 0; i < bundle_factories.size(); ++i)
+ {
+ boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances); // fresh result object for sample i
+
+ #if ENABLE_THREADS
+ while(1) // poll until the pool has a free slot
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads < locus_num_threads)
+ {
+ break; // leaves the loop with the pool lock still held
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+
+ }
+
+ locus_curr_threads++; // claim the slot under the lock taken in the loop above
+ locus_thread_pool_lock.unlock();
+
+ thread quantitate(sample_worker, // the worker gives the slot back through its at_thread_exit hook
+ boost::ref(rt),
+ boost::ref(*(bundle_factories[i])),
+ s_ab,
+ i,
+ recorder,
+ false);
+ #else
+ sample_worker(boost::ref(rt), // no threads: run the worker inline
+ boost::ref(*(bundle_factories[i])),
+ s_ab,
+ i,
+ recorder,
+ false);
+ #endif
+ }
+ return true;
+ }
+
+ // Read a gene-id list from norm_standards_file into lib_norm_standards.
+ // Blank lines and lines starting with '#' are ignored, the first remaining
+ // line is skipped (presumably a header row), and the first tab-separated
+ // column of each later line becomes a key.
+ void parse_norm_standards_file(FILE* norm_standards_file)
+ {
+ char line_buf[10 * 1024];
+ size_t content_lines_seen = 0;
+ boost::shared_ptr<map<string, LibNormStandards> > standards(new map<string, LibNormStandards>);
+
+ while (fgets(line_buf, sizeof(line_buf), norm_standards_file))
+ {
+ if (line_buf[0] == '\0')
+ continue;
+
+ // Cut the line at its newline, if fgets kept one.
+ if (char* newline_at = strchr(line_buf, '\n'))
+ *newline_at = 0;
+
+ string stripped = boost::trim_copy(string(line_buf));
+ if (stripped.length() == 0 || stripped[0] == '#')
+ continue;
+
+ ++content_lines_seen;
+ vector<string> fields;
+ tokenize(stripped, "\t", fields);
+
+ if (content_lines_seen == 1)
+ continue;
+
+ if (fields.size() < 1)
+ continue;
+
+ // Only the key matters here; the mapped value is default-constructed.
+ LibNormStandards blank_entry;
+ standards->insert(make_pair(fields[0], blank_entry));
+ }
+
+ lib_norm_standards = standards;
+ }
+
+ boost::shared_ptr<AbundanceRecorder> abx_recorder; // not used by driver()
+
+ // Quantification driver: opens the reference/mask GTFs and the alignment
+ // files, inspects the maps, optionally runs a preliminary estimate and bias
+ // learning, then quantifies every locus and serializes the per-locus
+ // AbundanceGroups to <output_dir>/abundances.cxb. `outfiles` is currently unused.
+ void driver(const std::string& ref_gtf_filename, const std::string& mask_gtf_filename, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
+ {
+ FILE* ref_gtf = NULL; // NOTE(review): never fclose()d in this function - confirm whether load_ref_rnas takes ownership
+ if (ref_gtf_filename != "")
+ {
+ ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
+ if (!ref_gtf) // we actually already did this check, leave this code here in case we remove the upstream one
+ {
+ fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
+ ref_gtf_filename.c_str());
+ exit(1);
+ }
+ }
+
+ FILE* mask_gtf = NULL;
+ if (mask_gtf_filename != "")
+ {
+ mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
+ if (!mask_gtf) // we actually already did this check, leave this code here in case we remove the upstream one
+ {
+ fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
+ mask_gtf_filename.c_str());
+ exit(1);
+ }
+ }
+
+ ReadTable it;
+ RefSequenceTable rt(true, false);
+
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
+
+ vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
+ vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
+ vector<boost::shared_ptr<HitFactory> > all_hit_factories;
+
+ for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
+ {
+ vector<string> sam_hit_filenames;
+ tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
+
+ vector<boost::shared_ptr<BundleFactory> > replicate_factories;
+
+ string condition_name = sample_labels[i];
+
+ for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
+ {
+ boost::shared_ptr<HitFactory> hs;
+ try
+ {
+ hs = boost::shared_ptr<HitFactory>(new BAMHitFactory(sam_hit_filenames[j], it, rt));
+ }
+ catch (std::runtime_error& e)
+ {
+ try
+ {
+ fprintf(stderr, "File %s doesn't appear to be a valid BAM file, trying SAM...\n",
+ sam_hit_filenames[j].c_str());
+ hs = boost::shared_ptr<HitFactory>(new SAMHitFactory(sam_hit_filenames[j], it, rt));
+ }
+ catch (std::runtime_error& e)
+ {
+ fprintf(stderr, "Error: cannot open alignment file %s for reading\n",
+ sam_hit_filenames[j].c_str());
+ exit(1);
+ }
+ }
+
+ all_hit_factories.push_back(hs);
+
+ boost::shared_ptr<BundleFactory> hf(new BundleFactory(hs, REF_DRIVEN));
+ boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
+
+ if (global_read_properties)
+ {
+ *rg_props = *global_read_properties;
+ }
+ else
+ {
+ *rg_props = hs->read_group_properties();
+ }
+
+ rg_props->condition_name(condition_name);
+ rg_props->replicate_num(j);
+ rg_props->file_path(sam_hit_filenames[j]);
+
+ all_read_groups.push_back(rg_props);
+
+ hf->read_group_properties(rg_props);
+
+ replicate_factories.push_back(hf);
+ }
+
+ bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
+ }
+
+ boost::crc_32_type ref_gtf_crc_result;
+ ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
+ if (ref_mRNAs.empty())
+ return;
+
+ boost::crc_32_type mask_gtf_crc_result;
+ vector<boost::shared_ptr<Scaffold> > mask_rnas;
+ if (mask_gtf)
+ {
+ ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
+ }
+
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ {
+ fac->set_ref_rnas(ref_mRNAs);
+ if (mask_gtf)
+ fac->set_mask_rnas(mask_rnas);
+ }
+
+ if (norm_standards_file != NULL)
+ {
+ parse_norm_standards_file(norm_standards_file);
+ }
+
+ for (size_t i = 0; i < all_read_groups.size(); ++i)
+ {
+ all_read_groups[i]->collect_checked_parameters();
+ all_read_groups[i]->ref_gtf(ref_gtf_filename, ref_gtf_crc_result);
+ all_read_groups[i]->mask_gtf(mask_gtf_filename, mask_gtf_crc_result);
+ }
+
+ #if ENABLE_THREADS
+ locus_num_threads = num_threads;
+ #endif
+
+ dispersion_method = POISSON; // overrides whatever parse_options selected
+
+ int tmp_min_frag_len = numeric_limits<int>::max();
+ int tmp_max_frag_len = 0;
+
+ ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
+ {
+ #if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads < locus_num_threads)
+ {
+ break;
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+ }
+
+ locus_curr_threads++;
+ locus_thread_pool_lock.unlock();
+
+ thread inspect(inspect_map_worker,
+ boost::ref(*fac),
+ boost::ref(tmp_min_frag_len),
+ boost::ref(tmp_max_frag_len));
+ #else
+ inspect_map_worker(boost::ref(*fac),
+ boost::ref(tmp_min_frag_len),
+ boost::ref(tmp_max_frag_len));
+ #endif
+ }
+
+ // wait for the workers to finish up before reporting everything.
+ #if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads == 0)
+ {
+ locus_thread_pool_lock.unlock();
+ break;
+ }
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+ }
+ #endif
+
+ normalize_counts(all_read_groups);
+
+ for (size_t i = 0; i < all_read_groups.size(); ++i)
+ {
+ boost::shared_ptr<ReadGroupProperties> rg = all_read_groups[i];
+ fprintf(stderr, "> Map Properties:\n");
+
+ fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", rg->normalized_map_mass());
+ fprintf(stderr, ">\tRaw Map Mass: %.2Lf\n", rg->total_map_mass());
+ if (corr_multi)
+ fprintf(stderr,">\tNumber of Multi-Reads: %zu (with %zu total hits)\n", rg->multi_read_table()->num_multireads(), rg->multi_read_table()->num_multihits());
+
+ if (rg->frag_len_dist()->source() == LEARNED)
+ {
+ fprintf(stderr, ">\tFragment Length Distribution: Empirical (learned)\n");
+ fprintf(stderr, ">\t Estimated Mean: %.2f\n", rg->frag_len_dist()->mean());
+ fprintf(stderr, ">\t Estimated Std Dev: %.2f\n", rg->frag_len_dist()->std_dev());
+ }
+ else
+ {
+ if (rg->frag_len_dist()->source() == USER)
+ fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (user-specified)\n");
+ else //rg->frag_len_dist()->source == FLD::DEFAULT
+ fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (default)\n");
+ fprintf(stderr, ">\t Default Mean: %d\n", def_frag_len_mean);
+ fprintf(stderr, ">\t Default Std Dev: %d\n", def_frag_len_std_dev);
+ }
+ }
+
+ long double total_norm_mass = 0.0; // NOTE(review): these two totals are accumulated but not read again in this function
+ long double total_mass = 0.0;
+ BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
+ {
+ total_norm_mass += rg_props->normalized_map_mass();
+ total_mass += rg_props->total_map_mass();
+ }
+
+ min_frag_len = tmp_min_frag_len;
+ max_frag_len = tmp_max_frag_len;
+
+ final_est_run = false;
+
+ double num_bundles = (double)bundle_factories[0]->num_bundles();
+
+ p_bar = ProgressBar("Calculating preliminary abundance estimates", num_bundles);
+
+ Tracking tracking;
+
+ abundance_recorder = boost::shared_ptr<AbundanceRecorder>(new AbundanceRecorder(bundle_factories.size(), &tracking, &p_bar));
+
+ if (model_mle_error || corr_bias || corr_multi) // Only run initial estimation if correcting bias or multi-reads
+ {
+ while (1)
+ {
+ // One dispatch round: every sample factory is handed its next bundle.
+ quantitate_next_locus(rt, bundle_factories, abundance_recorder);
+ bool more_loci_remain = false;
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ {
+ if (rep_fac->bundles_remain())
+ {
+ more_loci_remain = true;
+ break;
+ }
+ }
+
+ if (!more_loci_remain)
+ {
+ // wait for the workers to finish up before breaking out.
+ #if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads == 0)
+ {
+ locus_thread_pool_lock.unlock();
+ break;
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+
+ }
+ #endif
+ break;
+ }
+ }
+
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ {
+ rep_fac->reset();
+ }
+
+ p_bar.complete();
+ }
+ if (corr_bias)
+ {
+ bias_run = true;
+ p_bar = ProgressBar("Learning bias parameters.", 0);
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ {
+ BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, rep_fac->factories())
+ {
+ #if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads < locus_num_threads)
+ {
+ break;
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+ }
+ locus_curr_threads++;
+ locus_thread_pool_lock.unlock();
+
+ thread bias(learn_bias_worker, fac);
+ #else
+ learn_bias_worker(fac);
+ #endif
+ }
+ }
+
+ // wait for the workers to finish up before reporting everything.
+ #if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads == 0)
+ {
+ locus_thread_pool_lock.unlock();
+ break;
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+ }
+ #endif
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ {
+ rep_fac->reset();
+ }
+ bias_run = false;
+ }
+
+ // Allow the multiread tables to do their thing...
+ BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
+ {
+ rg_props->multi_read_table()->valid_mass(true);
+ }
+
+ abundance_recorder->clear_tracking_data();
+
+ abundance_recorder = boost::shared_ptr<AbundanceRecorder>(new AbundanceRecorder(bundle_factories.size(), &tracking, &p_bar)); // fresh recorder for the final pass
+
+ final_est_run = true;
+ p_bar = ProgressBar("Quantifying expression levels in locus.", num_bundles);
+
+ while (true)
+ {
+ quantitate_next_locus(rt, bundle_factories, abundance_recorder);
+ bool more_loci_remain = false;
+ BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
+ {
+ if (rep_fac->bundles_remain())
+ {
+ more_loci_remain = true;
+ break;
+ }
+ }
+ if (!more_loci_remain)
+ {
+ // wait for the workers to finish up before doing the cross-sample testing.
+ #if ENABLE_THREADS
+ while(1)
+ {
+ locus_thread_pool_lock.lock();
+ if (locus_curr_threads == 0)
+ {
+ locus_thread_pool_lock.unlock();
+ break;
+ }
+
+ locus_thread_pool_lock.unlock();
+
+ boost::this_thread::sleep(boost::posix_time::milliseconds(5));
+
+ }
+ #endif
+ break;
+ }
+ }
+
+ p_bar.complete();
+
+ // Serialize the per-locus abundances; a binary archive needs a binary-mode stream.
+ string expression_cxb_filename = output_dir + "/abundances.cxb";
+ std::ofstream ofs(expression_cxb_filename.c_str(), std::ios::out | std::ios::binary);
+ if (!ofs)
+ {
+ fprintf(stderr, "Error: cannot open output file %s for writing\n", expression_cxb_filename.c_str());
+ exit(1);
+ }
+ boost::archive::binary_oarchive oa(ofs);
+
+ vector< pair<int, AbundanceGroup> > single_sample_tracking;
+ const light_ab_group_tracking_table& sample_table = abundance_recorder->get_sample_table();
+ for (light_ab_group_tracking_table::const_iterator itr = sample_table.begin(); itr != sample_table.end(); ++itr)
+ {
+ assert (itr->second.size() == 1); // exactly one sample per locus is expected; only entry 0 is written
+ single_sample_tracking.push_back(make_pair(itr->first, itr->second[0]));
+ }
+
+ std::sort(single_sample_tracking.begin(), single_sample_tracking.end(),
+ boost::bind(&std::pair<int, AbundanceGroup>::first, _1) <
+ boost::bind(&std::pair<int, AbundanceGroup>::first, _2));
+
+ size_t num_loci = single_sample_tracking.size();
+ oa << num_loci; // locus count first, then each (locus id, AbundanceGroup) pair in id order
+ for (size_t i = 0; i < single_sample_tracking.size(); ++i)
+ {
+ oa << single_sample_tracking[i];
+ }
+
+ // // FPKM tracking
+ //
+ // FILE* fiso_fpkm_tracking = outfiles.isoform_fpkm_tracking_out;
+ // fprintf(stderr, "Writing isoform-level FPKM tracking\n");
+ // print_FPKM_tracking(fiso_fpkm_tracking,tracking.isoform_fpkm_tracking);
+ //
+ // FILE* ftss_fpkm_tracking = outfiles.tss_group_fpkm_tracking_out;
+ // fprintf(stderr, "Writing TSS group-level FPKM tracking\n");
+ // print_FPKM_tracking(ftss_fpkm_tracking,tracking.tss_group_fpkm_tracking);
+ //
+ // FILE* fgene_fpkm_tracking = outfiles.gene_fpkm_tracking_out;
+ // fprintf(stderr, "Writing gene-level FPKM tracking\n");
+ // print_FPKM_tracking(fgene_fpkm_tracking,tracking.gene_fpkm_tracking);
+ //
+ // FILE* fcds_fpkm_tracking = outfiles.cds_fpkm_tracking_out;
+ // fprintf(stderr, "Writing CDS-level FPKM tracking\n");
+ // print_FPKM_tracking(fcds_fpkm_tracking,tracking.cds_fpkm_tracking);
+ //
+ // // Count tracking
+ //
+ // FILE* fiso_count_tracking = outfiles.isoform_count_tracking_out;
+ // fprintf(stderr, "Writing isoform-level count tracking\n");
+ // print_count_tracking(fiso_count_tracking,tracking.isoform_fpkm_tracking);
+ //
+ // FILE* ftss_count_tracking = outfiles.tss_group_count_tracking_out;
+ // fprintf(stderr, "Writing TSS group-level count tracking\n");
+ // print_count_tracking(ftss_count_tracking,tracking.tss_group_fpkm_tracking);
+ //
+ // FILE* fgene_count_tracking = outfiles.gene_count_tracking_out;
+ // fprintf(stderr, "Writing gene-level count tracking\n");
+ // print_count_tracking(fgene_count_tracking,tracking.gene_fpkm_tracking);
+ //
+ // FILE* fcds_count_tracking = outfiles.cds_count_tracking_out;
+ // fprintf(stderr, "Writing CDS-level count tracking\n");
+ // print_count_tracking(fcds_count_tracking,tracking.cds_fpkm_tracking);
+ //
+ // // Run info
+ // FILE* frun_info = outfiles.run_info_out;
+ // fprintf(stderr, "Writing run info\n");
+ // print_run_info(frun_info);
+ }
+
+int main(int argc, char** argv)
+{
+    // Entry point for cuffquant. Parses the command line, verifies that the
+    // reference GTF, mask GTF and normalization-standards files can be opened,
+    // creates the output directory and then passes control to driver().
+    // Returns 0 on success, parse_options()'s non-zero code if option parsing
+    // fails and 1 on a usage error; other fatal errors call exit(1).
+
+// boost::serialization::void_cast_register<TranscriptAbundance, Abundance>(
+// static_cast<TranscriptAbundance *>(NULL),
+// static_cast<Abundance *>(NULL)
+// );
+
+    for (int i = 0; i < argc; ++i)
+    {
+        cmd_str += string(argv[i]) + " ";
+    }
+
+    init_library_table();
+    init_dispersion_method_table();
+    init_lib_norm_method_table();
+
+    min_isoform_fraction = 0;
+
+    int parse_ret = parse_options(argc,argv);
+    if (parse_ret)
+        return parse_ret;
+
+    // If neither mass flag was set, fall back to use_compat_mass.
+    if (!use_total_mass && !use_compat_mass)
+    {
+        use_compat_mass = true;
+    }
+
+    if(optind >= argc)
+    {
+        print_usage();
+        return 1;
+    }
+
+    if (!no_update_check)
+        check_version(PACKAGE_VERSION);
+
+    string ref_gtf_filename = argv[optind++];
+    vector<string> sam_hit_filenames;
+
+    if(optind < argc)
+    {
+        string sam_hits_file_name = argv[optind++];
+        sam_hit_filenames.push_back(sam_hits_file_name);
+    }
+
+    // No labels supplied: generate q1, q2, ... (one per input file).
+    if (sample_labels.size() == 0)
+    {
+        for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
+        {
+            char buf[256];
+            // size_t is not unsigned long on every platform; cast for %lu.
+            sprintf(buf, "q%lu", static_cast<unsigned long>(i));
+            sample_labels.push_back(buf);
+        }
+    }
+
+    if (sam_hit_filenames.size() < 1)
+    {
+        fprintf(stderr, "Error: cuffquant requires exactly 1 SAM/BAM file\n");
+        exit(1);
+    }
+
+    if (sam_hit_filenames.size() != sample_labels.size())
+    {
+        fprintf(stderr, "Error: number of labels must match number of conditions\n");
+        exit(1);
+    }
+
+    if (random_seed == -1)
+        random_seed = time(NULL);
+
+    // seed the random number generator - we'll need it for the importance
+    // sampling during MAP estimation of the gammas
+    srand48(random_seed);
+
+    // The GTF files are handed to driver() by name, so they are only opened
+    // here to fail early if unreadable, and are closed again right away.
+    if (ref_gtf_filename != "")
+    {
+        FILE* ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
+        if (!ref_gtf)
+        {
+            fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
+                    ref_gtf_filename.c_str());
+            exit(1);
+        }
+        fclose(ref_gtf);
+    }
+
+    if (mask_gtf_filename != "")
+    {
+        FILE* mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
+        if (!mask_gtf)
+        {
+            fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
+                    mask_gtf_filename.c_str());
+            exit(1);
+        }
+        fclose(mask_gtf);
+    }
+
+    // This handle stays open: it is passed on to driver().
+    FILE* norm_standards_file = NULL;
+    if (norm_standards_filename != "")
+    {
+        norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
+        if (!norm_standards_file)
+        {
+            fprintf(stderr, "Error: cannot open normalization standards file %s for reading\n",
+                    norm_standards_filename.c_str());
+            exit(1);
+        }
+    }
+
+    // Note: we don't want the assembly filters interfering with calculations
+    // here
+    pre_mrna_fraction = 0.0;
+    olap_radius = 0;
+
+    Outfiles outfiles;
+
+    if (output_dir != "")
+    {
+        int retcode = mkpath(output_dir.c_str(), 0777);
+        // An already-existing output directory is not an error.
+        if (retcode == -1 && errno != EEXIST)
+        {
+            fprintf (stderr, "Error: cannot create directory %s\n",
+                     output_dir.c_str());
+            exit(1);
+        }
+    }
+
+    driver(ref_gtf_filename, mask_gtf_filename, norm_standards_file, sam_hit_filenames, outfiles);
+
+    return 0;
+}
+
diff --git a/src/differential.cpp b/src/differential.cpp
index 4d8f152..d6eae35 100644
--- a/src/differential.cpp
+++ b/src/differential.cpp
@@ -28,8 +28,8 @@ using namespace std;
double min_read_count = 10;
#if ENABLE_THREADS
-mutex _launcher_lock;
-mutex locus_thread_pool_lock;
+boost::mutex _launcher_lock;
+boost::mutex locus_thread_pool_lock;
int locus_curr_threads = 0;
int locus_num_threads = 0;
@@ -53,105 +53,6 @@ SampleDifference::SampleDifference():
significant(false) {}
-void add_to_tracking_table(size_t sample_index,
- Abundance& ab,
- FPKMTrackingTable& track)
-
-{
- pair<FPKMTrackingTable::iterator,bool> inserted;
- pair<string, FPKMTracking > p;
- p = make_pair(ab.description(), FPKMTracking());
- inserted = track.insert(p);
-
- FPKMTracking& fpkm_track = inserted.first->second;
-
- set<string> tss = ab.tss_id();
- set<string> gene_ids = ab.gene_id();
- set<string> genes = ab.gene_name();
- set<string> proteins = ab.protein_id();
-
- fpkm_track.tss_ids.insert(tss.begin(), tss.end());
- fpkm_track.gene_ids.insert(gene_ids.begin(), gene_ids.end());
- fpkm_track.gene_names.insert(genes.begin(), genes.end());
- fpkm_track.protein_ids.insert(proteins.begin(), proteins.end());
-
- if (inserted.second)
- {
- fpkm_track.locus_tag = ab.locus_tag();
- fpkm_track.description = ab.description();
- shared_ptr<Scaffold> transfrag = ab.transfrag();
- if (transfrag && transfrag->nearest_ref_id() != "")
- {
- fpkm_track.classcode = transfrag->nearest_ref_classcode();
- fpkm_track.ref_match = transfrag->nearest_ref_id();
- }
- else
- {
- fpkm_track.classcode = 0;
- fpkm_track.ref_match = "-";
- }
- if (transfrag)
- {
- fpkm_track.length = transfrag->length();
- }
- else
- {
- fpkm_track.length = 0;
- }
- }
-
- FPKMContext r1 = FPKMContext(ab.num_fragments(),
- ab.num_fragment_var(),
- ab.num_fragment_uncertainty_var(),
- ab.mass_variance(),
- ab.num_fragments_by_replicate(),
- ab.FPKM(),
- ab.FPKM_by_replicate(),
- ab.FPKM_variance(),
- ab.FPKM_conf().low,
- ab.FPKM_conf().high,
- ab.status(),
- ab.status_by_replicate(),
- ab.fpkm_samples(),
- ab.gamma());
-
-
-
- vector<FPKMContext>& fpkms = inserted.first->second.fpkm_series;
- if (sample_index < fpkms.size())
- {
- // if the fpkm series already has an entry matching this description
- // for this sample index, then we are dealing with a group of transcripts
- // that occupies multiple (genomically disjoint) bundles. We need
- // to add this bundle's contribution to the FPKM, fragments, and variance
- // to whatever's already there.
-
- // NOTE: we can simply sum the FKPM_variances, because we are currently
- // assuming that transcripts in disjoint bundles share no alignments and
- // thus have FPKM covariance == 0; This assumption will no longer be
- // true if we decide to do multireads the right way.
-
- FPKMContext& existing = fpkms[sample_index];
- existing.FPKM += r1.FPKM;
- existing.count_mean += r1.count_mean;
- existing.FPKM_variance += r1.FPKM_variance;
- if (existing.status == NUMERIC_FAIL || r1.status == NUMERIC_FAIL)
- {
- existing.status = NUMERIC_FAIL;
- }
- else
- {
- existing.status = NUMERIC_OK;
- }
-
- }
- else
- {
- fpkms.push_back(r1);
- }
-}
-
-
TestLauncher::launcher_sample_table::iterator TestLauncher::find_locus(const string& locus_id)
{
launcher_sample_table::iterator itr = _samples.begin();
@@ -173,13 +74,13 @@ void TestLauncher::register_locus(const string& locus_id)
if (itr == _samples.end())
{
pair<launcher_sample_table::iterator, bool> p;
- vector<shared_ptr<SampleAbundances> >abs(_orig_workers);
+ vector<boost::shared_ptr<SampleAbundances> >abs(_orig_workers);
_samples.push_back(make_pair(locus_id, abs));
}
}
void TestLauncher::abundance_avail(const string& locus_id,
- shared_ptr<SampleAbundances> ab,
+ boost::shared_ptr<SampleAbundances> ab,
size_t factory_id)
{
#if ENABLE_THREADS
@@ -196,9 +97,9 @@ void TestLauncher::abundance_avail(const string& locus_id,
// Note: this routine should be called under lock - it doesn't
// acquire the lock itself.
-bool TestLauncher::all_samples_reported_in(vector<shared_ptr<SampleAbundances> >& abundances)
+bool TestLauncher::all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances)
{
- BOOST_FOREACH (shared_ptr<SampleAbundances> ab, abundances)
+ BOOST_FOREACH (boost::shared_ptr<SampleAbundances> ab, abundances)
{
if (!ab)
{
@@ -209,12 +110,12 @@ bool TestLauncher::all_samples_reported_in(vector<shared_ptr<SampleAbundances> >
}
#if ENABLE_THREADS
-mutex test_storage_lock; // don't modify the above struct without locking here
+boost::mutex test_storage_lock; // don't modify the above struct without locking here
#endif
// Note: this routine should be called under lock - it doesn't
// acquire the lock itself.
-void TestLauncher::perform_testing(vector<shared_ptr<SampleAbundances> >& abundances)
+void TestLauncher::perform_testing(vector<boost::shared_ptr<SampleAbundances> > abundances)
{
assert (abundances.size() == _orig_workers);
@@ -242,7 +143,7 @@ void TestLauncher::perform_testing(vector<shared_ptr<SampleAbundances> >& abunda
// Note: this routine should be called under lock - it doesn't
// acquire the lock itself.
-void TestLauncher::record_tracking_data(vector<shared_ptr<SampleAbundances> >& abundances)
+void TestLauncher::record_tracking_data(vector<boost::shared_ptr<SampleAbundances> >& abundances)
{
assert (abundances.size() == _orig_workers);
@@ -277,7 +178,7 @@ void TestLauncher::record_tracking_data(vector<shared_ptr<SampleAbundances> >& a
{
const AbundanceGroup& ab_group = abundances[i]->transcripts;
//fprintf(stderr, "[%d] count = %lg\n",i, ab_group.num_fragments());
- BOOST_FOREACH (shared_ptr<Abundance> ab, ab_group.abundances())
+ BOOST_FOREACH (boost::shared_ptr<Abundance> ab, ab_group.abundances())
{
add_to_tracking_table(i, *ab, _tracking->isoform_fpkm_tracking);
//assert (_tracking->isoform_fpkm_tracking.num_fragments_by_replicate().empty() == false);
@@ -308,9 +209,11 @@ void TestLauncher::record_tracking_data(vector<shared_ptr<SampleAbundances> >& a
void TestLauncher::test_finished_loci()
{
#if ENABLE_THREADS
- boost::mutex::scoped_lock lock(_launcher_lock);
+ _launcher_lock.lock();
#endif
+ vector<vector<boost::shared_ptr<SampleAbundances> > > samples_for_testing;
+
launcher_sample_table::iterator itr = _samples.begin();
while(itr != _samples.end())
{
@@ -323,12 +226,11 @@ void TestLauncher::test_finished_loci()
{
if (_p_bar)
{
- verbose_msg("Testing for differential expression and regulation in locus [%s]\n", itr->second.front()->locus_tag.c_str());
+ verbose_msg("Estimating expression in locus [%s]\n", itr->second.front()->locus_tag.c_str());
_p_bar->update(itr->second.front()->locus_tag.c_str(), 1);
}
record_tracking_data(itr->second);
- perform_testing(itr->second);
-
+ samples_for_testing.push_back(itr->second);
}
else
{
@@ -349,6 +251,16 @@ void TestLauncher::test_finished_loci()
++itr;
}
}
+
+#if ENABLE_THREADS
+ _launcher_lock.unlock();
+#endif
+
+ for (size_t i = 0; i < samples_for_testing.size(); ++i)
+ {
+ vector<boost::shared_ptr<SampleAbundances> > samples_i = samples_for_testing[i];
+ perform_testing(samples_i);
+ }
}
long double wrapped_lgamma(double gamma_k)
@@ -480,7 +392,7 @@ SampleDifference test_diffexp(const FPKMContext& curr,
boost::random::mt19937 rng;
- if ((curr.FPKM != 0 || prev.FPKM != 0))
+ if ((curr.FPKM != 0 || prev.FPKM != 0) && (prev.fpkm_samples.size() > 0 && curr.fpkm_samples.size() > 0))
{
boost::random::uniform_int_distribution<> prev_sampler(0, prev.fpkm_samples.size()-1);
boost::random::uniform_int_distribution<> curr_sampler(0, curr.fpkm_samples.size()-1);
@@ -489,23 +401,20 @@ SampleDifference test_diffexp(const FPKMContext& curr,
vector<double> curr_rep_samples;
- for (FPKMPerReplicateTable::const_iterator itr = curr.fpkm_per_rep.begin();
- itr != curr.fpkm_per_rep.end(); ++itr)
+ for (size_t i = 0; i != curr.tracking_info_per_rep.size(); ++i)
{
- StatusPerReplicateTable::const_iterator si = curr.status_per_rep.find(itr->first);
- if (si == curr.status_per_rep.end() || si->second == NUMERIC_LOW_DATA)
+ if (curr.tracking_info_per_rep[i].status == NUMERIC_LOW_DATA)
continue;
- curr_rep_samples.push_back(itr->second);
+ curr_rep_samples.push_back(curr.tracking_info_per_rep[i].fpkm);
}
- for (FPKMPerReplicateTable::const_iterator itr = prev.fpkm_per_rep.begin();
- itr != prev.fpkm_per_rep.end(); ++itr)
+ for (size_t i = 0; i != prev.tracking_info_per_rep.size(); ++i)
{
- StatusPerReplicateTable::const_iterator si = prev.status_per_rep.find(itr->first);
- if (si == prev.status_per_rep.end() || si->second == NUMERIC_LOW_DATA)
+ if (prev.tracking_info_per_rep[i].status == NUMERIC_LOW_DATA)
continue;
- prev_rep_samples.push_back(itr->second);
+ prev_rep_samples.push_back(prev.tracking_info_per_rep[i].fpkm);
}
+
double curr_fpkm = accumulate(curr_rep_samples.begin(), curr_rep_samples.end(), 0.0);
if (curr_rep_samples.size() > 0)
@@ -660,7 +569,7 @@ SampleDiffMetaDataTable meta_data_table;
boost::mutex meta_data_lock;
#endif
-shared_ptr<SampleDifferenceMetaData> get_metadata(const string description)
+boost::shared_ptr<SampleDifferenceMetaData> get_metadata(const string description)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(meta_data_lock);
@@ -942,7 +851,9 @@ SampleDifference get_ds_tests(const AbundanceGroup& prev_abundance,
if (prev_abundance.abundances().size() == 1 ||
prev_abundance.num_fragments() == 0 ||
- curr_abundance.num_fragments() == 0)
+ curr_abundance.num_fragments() == 0 ||
+ prev_abundance.member_fpkm_samples().size() == 0 ||
+ curr_abundance.member_fpkm_samples().size() == 0)
{
test.p_value = 1;
test.value_1 = 0;
@@ -1046,241 +957,6 @@ string bundle_locus_tag(const RefSequenceTable& rt,
return string(locus_buf);
}
-void sample_abundance_worker(const string& locus_tag,
- const set<shared_ptr<ReadGroupProperties const> >& rg_props,
- SampleAbundances& sample,
- HitBundle* sample_bundle,
- bool perform_cds_analysis,
- bool perform_tss_analysis)
-{
- vector<shared_ptr<Abundance> > abundances;
-
- BOOST_FOREACH(shared_ptr<Scaffold> s, sample_bundle->ref_scaffolds())
- {
- TranscriptAbundance* pT = new TranscriptAbundance;
- pT->transfrag(s);
- shared_ptr<Abundance> ab(pT);
- ab->description(s->annotated_trans_id());
- ab->locus_tag(locus_tag);
- abundances.push_back(ab);
- }
-
- sample.transcripts = AbundanceGroup(abundances);
-
- sample.transcripts.init_rg_props(rg_props);
-
- vector<MateHit> hits_in_cluster;
-
- if (sample_bundle->hits().size() < (size_t)max_frags_per_bundle)
- {
- get_alignments_from_scaffolds(sample.transcripts.abundances(),
- hits_in_cluster);
-
- // Compute the individual transcript FPKMs via each sample's
- // AbundanceGroup for this locus.
-
- sample.transcripts.calculate_abundance(hits_in_cluster);
- }
- else
- {
- BOOST_FOREACH(shared_ptr<Abundance> ab, abundances)
- {
- ab->status(NUMERIC_HI_DATA);
-
- CountPerReplicateTable cpr;
- FPKMPerReplicateTable fpr;
- StatusPerReplicateTable spr;
- for (set<shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg_props.begin();
- itr != rg_props.end();
- ++itr)
- {
- cpr[*itr] = 0;
- fpr[*itr] = 0;
- spr[*itr] = NUMERIC_HI_DATA;
- }
- ab->num_fragments_by_replicate(cpr);
- ab->FPKM_by_replicate(fpr);
- ab->status_by_replicate(spr);
- }
- }
-
- // Cluster transcripts by gene_id
- vector<AbundanceGroup> transcripts_by_gene_id;
- cluster_transcripts<ConnectByAnnotatedGeneId>(sample.transcripts,
- transcripts_by_gene_id);
-
- BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_gene_id)
- {
- ab_group.locus_tag(locus_tag);
- set<string> gene_ids = ab_group.gene_id();
- assert (gene_ids.size() == 1);
- ab_group.description(*(gene_ids.begin()));
- }
-
- sample.genes = transcripts_by_gene_id;
-
- if (perform_cds_analysis)
- {
- // Cluster transcripts by CDS
- vector<AbundanceGroup> transcripts_by_cds;
- ublas::matrix<double> cds_gamma_cov;
- ublas::matrix<double> cds_count_cov;
- ublas::matrix<double> cds_iterated_exp_count_cov;
- ublas::matrix<double> cds_fpkm_cov;
-
- vector<bool> mask(sample.transcripts.abundances().size(), true);
- for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
- {
- if (*(sample.transcripts.abundances()[i]->protein_id().begin()) == "")
- {
- mask[i] = false;
- }
- }
-
- AbundanceGroup trans_with_p_id;
- sample.transcripts.filter_group(mask, trans_with_p_id);
-
- cluster_transcripts<ConnectByAnnotatedProteinId>(trans_with_p_id,
- transcripts_by_cds,
- &cds_gamma_cov,
- &cds_iterated_exp_count_cov,
- &cds_count_cov,
- &cds_fpkm_cov);
-
- BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_cds)
- {
- ab_group.locus_tag(locus_tag);
- set<string> protein_ids = ab_group.protein_id();
- assert (protein_ids.size() == 1);
- string desc = *(protein_ids.begin());
- //if (desc != "")
- //{
- assert (desc != "");
- ab_group.description(*(protein_ids.begin()));
- //}
- }
-
- sample.cds = transcripts_by_cds;
-
- // Group the CDS clusters by gene
- vector<shared_ptr<Abundance> > cds_abundances;
-
- set<shared_ptr<ReadGroupProperties const> > rg_props;
- BOOST_FOREACH (AbundanceGroup& ab_group, sample.cds)
- {
- //if (ab_group.description() != "")
- {
- cds_abundances.push_back(shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
- rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
- }
- }
- AbundanceGroup cds(cds_abundances,
- cds_gamma_cov,
- cds_iterated_exp_count_cov,
- cds_count_cov,
- cds_fpkm_cov,
- rg_props);
-
- vector<AbundanceGroup> cds_by_gene;
-
- cluster_transcripts<ConnectByAnnotatedGeneId>(cds,
- cds_by_gene);
-
- BOOST_FOREACH(AbundanceGroup& ab_group, cds_by_gene)
- {
- ab_group.locus_tag(locus_tag);
- set<string> gene_ids = ab_group.gene_id();
- assert (gene_ids.size() == 1);
- ab_group.description(*(gene_ids.begin()));
- }
-
- sample.gene_cds = cds_by_gene;
- }
-
- if (perform_tss_analysis)
- {
- // Cluster transcripts by start site (TSS)
- vector<AbundanceGroup> transcripts_by_tss;
-
- ublas::matrix<double> tss_gamma_cov;
- ublas::matrix<double> tss_count_cov;
- ublas::matrix<double> tss_iterated_exp_count_cov;
- ublas::matrix<double> tss_fpkm_cov;
- vector<Eigen::VectorXd> tss_assigned_counts;
-
- vector<bool> mask(sample.transcripts.abundances().size(), true);
- for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
- {
- if (*(sample.transcripts.abundances()[i]->tss_id().begin()) == "")
- {
- mask[i] = false;
- }
- }
-
- AbundanceGroup trans_with_tss;
- sample.transcripts.filter_group(mask, trans_with_tss);
-
- cluster_transcripts<ConnectByAnnotatedTssId>(trans_with_tss,
- transcripts_by_tss,
- &tss_gamma_cov,
- &tss_iterated_exp_count_cov,
- &tss_count_cov,
- &tss_fpkm_cov);
-
-
- BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_tss)
- {
- ab_group.locus_tag(locus_tag);
- set<string> tss_ids = ab_group.tss_id();
- assert (tss_ids.size() == 1);
- string desc = *(tss_ids.begin());
- assert (desc != "");
- ab_group.description(*(tss_ids.begin()));
- }
-
- sample.primary_transcripts = transcripts_by_tss;
-
- // Group TSS clusters by gene
- vector<shared_ptr<Abundance> > primary_transcript_abundances;
- set<shared_ptr<ReadGroupProperties const> > rg_props;
- BOOST_FOREACH (AbundanceGroup& ab_group, sample.primary_transcripts)
- {
- primary_transcript_abundances.push_back(shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
- rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
- }
-
- AbundanceGroup primary_transcripts(primary_transcript_abundances,
- tss_gamma_cov,
- tss_iterated_exp_count_cov,
- tss_count_cov,
- tss_fpkm_cov,
- rg_props);
-
- vector<AbundanceGroup> primary_transcripts_by_gene;
-
- cluster_transcripts<ConnectByAnnotatedGeneId>(primary_transcripts,
- primary_transcripts_by_gene);
-
- BOOST_FOREACH(AbundanceGroup& ab_group, primary_transcripts_by_gene)
- {
- ab_group.locus_tag(locus_tag);
- set<string> gene_ids = ab_group.gene_id();
-// if (gene_ids.size() > 1)
-// {
-// BOOST_FOREACH (string st, gene_ids)
-// {
-// fprintf(stderr, "%s\n", st.c_str());
-// }
-// ab_group.gene_id();
-// }
- assert (gene_ids.size() == 1);
- ab_group.description(*(gene_ids.begin()));
- }
-
- sample.gene_primary_transcripts = primary_transcripts_by_gene;
- }
-}
-
struct LocusVarianceInfo
{
int factory_id;
@@ -1303,7 +979,7 @@ struct LocusVarianceInfo
};
#if ENABLE_THREADS
-mutex variance_info_lock; // don't modify the above struct without locking here
+boost::mutex variance_info_lock; // don't modify the above struct without locking here
#endif
vector<LocusVarianceInfo> locus_variance_info_table;
@@ -1312,31 +988,32 @@ vector<LocusVarianceInfo> locus_variance_info_table;
// transcript, so we can re-fit the variance model.
FPKMTrackingTable transcript_count_tracking;
-void sample_worker(const RefSequenceTable& rt,
- ReplicatedBundleFactory& sample_factory,
- shared_ptr<SampleAbundances> abundance,
- size_t factory_id,
- shared_ptr<TestLauncher> launcher)
+
+void sample_worker(bool non_empty,
+ boost::shared_ptr<HitBundle> bundle,
+ const RefSequenceTable& rt,
+ ReplicatedBundleFactory& sample_factory,
+ boost::shared_ptr<SampleAbundances> abundance,
+ size_t factory_id,
+ boost::shared_ptr<TestLauncher> launcher,
+ bool calculate_variance)
{
#if ENABLE_THREADS
boost::this_thread::at_thread_exit(decr_pool_count);
#endif
- HitBundle bundle;
- bool non_empty = sample_factory.next_bundle(bundle);
-
char bundle_label_buf[2048];
sprintf(bundle_label_buf,
"%s:%d-%d",
- rt.get_name(bundle.ref_id()),
- bundle.left(),
- bundle.right());
+ rt.get_name(bundle->ref_id()),
+ bundle->left(),
+ bundle->right());
string locus_tag = bundle_label_buf;
- if (!non_empty || (bias_run && bundle.ref_scaffolds().size() != 1)) // Only learn on single isoforms
+ if (!non_empty || (bias_run && bundle->ref_scaffolds().size() != 1)) // Only learn on single isoforms
{
#if !ENABLE_THREADS
- // If Cuffdiff was built without threads, we need to manually invoke
+ // If Cuffdiff was built without threads, we need to manually invoke
// the testing functor, which will check to see if all the workers
// are done, and if so, perform the cross sample testing.
launcher->abundance_avail(locus_tag, abundance, factory_id);
@@ -1346,7 +1023,7 @@ void sample_worker(const RefSequenceTable& rt,
return;
}
- abundance->cluster_mass = bundle.mass();
+ abundance->cluster_mass = bundle->mass();
launcher->register_locus(locus_tag);
@@ -1354,8 +1031,8 @@ void sample_worker(const RefSequenceTable& rt,
bool perform_cds_analysis = false;
bool perform_tss_analysis = false;
-
- BOOST_FOREACH(shared_ptr<Scaffold> s, bundle.ref_scaffolds())
+
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle->ref_scaffolds())
{
if (s->annotated_tss_id() != "")
{
@@ -1366,33 +1043,57 @@ void sample_worker(const RefSequenceTable& rt,
perform_cds_analysis = final_est_run;
}
}
-
- set<shared_ptr<ReadGroupProperties const> > rg_props;
- for (size_t i = 0; i < sample_factory.factories().size(); ++i)
+
+ std::vector<boost::shared_ptr<BundleFactory> > factories = sample_factory.factories();
+ vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> > hit_factories;
+ for (size_t i = 0; i < factories.size(); ++i)
{
- shared_ptr<BundleFactory> bf = sample_factory.factories()[i];
- rg_props.insert(bf->read_group_properties());
+ boost::shared_ptr<BundleFactory> pFac = factories[i];
+ boost::shared_ptr<PrecomputedExpressionBundleFactory> pBundleFac = dynamic_pointer_cast<PrecomputedExpressionBundleFactory> (pFac);
+ if (pBundleFac)
+ {
+ // If we get here, this factory refers to a pre computed expression object.
+
+ hit_factories.push_back(pBundleFac);
+ }
}
- sample_abundance_worker(boost::cref(locus_tag),
- boost::cref(rg_props),
- boost::ref(*abundance),
- &bundle,
- perform_cds_analysis,
- perform_tss_analysis);
-
-#if ENABLE_THREADS
- variance_info_lock.lock();
-#endif
-
-#if ENABLE_THREADS
- variance_info_lock.unlock();
-#endif
-
+ if (hit_factories.size() == factories.size())
+ {
+ merge_precomputed_expression_worker(boost::cref(locus_tag),
+ hit_factories,
+ boost::ref(*abundance),
+ bundle,
+ perform_cds_analysis,
+ perform_tss_analysis,
+ calculate_variance);
+ }
+ else if (hit_factories.empty())
+ {
+ set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
+ for (size_t i = 0; i < sample_factory.factories().size(); ++i)
+ {
+ boost::shared_ptr<BundleFactory> bf = sample_factory.factories()[i];
+ rg_props.insert(bf->read_group_properties());
+ }
+
+ sample_abundance_worker(boost::cref(locus_tag),
+ boost::cref(rg_props),
+ boost::ref(*abundance),
+ bundle,
+ perform_cds_analysis,
+ perform_tss_analysis,
+ calculate_variance);
+ }
+ else
+ {
+ fprintf (stderr, "Error: mixing pre-computed input with SAM/BAM files not yet supported!\n");
+ exit(1);
+ }
///////////////////////////////////////////////
- BOOST_FOREACH(shared_ptr<Scaffold> ref_scaff, bundle.ref_scaffolds())
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, bundle->ref_scaffolds())
{
ref_scaff->clear_hits();
}
@@ -1401,11 +1102,12 @@ void sample_worker(const RefSequenceTable& rt,
launcher->test_finished_loci();
#if !ENABLE_THREADS
- // If Cuffdiff was built without threads, we need to manually invoke
+ // If Cuffdiff was built without threads, we need to manually invoke
// the testing functor, which will check to see if all the workers
// are done, and if so, perform the cross sample testing.
//launcher->test_finished_loci();
#endif
+
}
void dump_locus_variance_info(const string& filename)
@@ -1481,13 +1183,14 @@ void clear_samples_from_fpkm_tracking_table(const string& locus_desc, FPKMTracki
for (size_t i = 0; i < itr->second.fpkm_series.size(); ++i)
{
- itr->second.fpkm_series[i].fpkm_samples.clear();
- std::vector<double>().swap(itr->second.fpkm_series[i].fpkm_samples);
+ vector<double>& fpkm_samples = itr->second.fpkm_series[i].fpkm_samples;
+ fpkm_samples.clear();
+ std::vector<double>().swap(fpkm_samples);
//itr->second.fpkm_series[i].fpkm_samples.swap(vector<double>(itr->second.fpkm_series[i].fpkm_samples));
}
}
-void clear_samples_from_tracking_table(shared_ptr<SampleAbundances> sample, Tracking& tracking)
+void clear_samples_from_tracking_table(boost::shared_ptr<SampleAbundances> sample, Tracking& tracking)
{
for (size_t k = 0; k < sample->transcripts.abundances().size(); ++k)
{
@@ -1520,7 +1223,7 @@ void clear_samples_from_tracking_table(shared_ptr<SampleAbundances> sample, Trac
int total_tests = 0;
void test_differential(const string& locus_tag,
- const vector<shared_ptr<SampleAbundances> >& samples,
+ const vector<boost::shared_ptr<SampleAbundances> >& samples,
const vector<pair<size_t, size_t> >& contrasts,
Tests& tests,
Tracking& tracking)
@@ -1609,7 +1312,7 @@ void test_differential(const string& locus_tag,
inserted = tests.isoform_de_tests[i][j].insert(make_pair(desc,
test));
- shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
meta_data->gene_ids = curr_abundance.gene_id();
meta_data->gene_names = curr_abundance.gene_name();
@@ -1666,7 +1369,7 @@ void test_differential(const string& locus_tag,
inserted = tests.cds_de_tests[i][j].insert(make_pair(desc,
test));
- shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
meta_data->gene_ids = curr_abundance.gene_id();
meta_data->gene_names = curr_abundance.gene_name();
@@ -1723,7 +1426,7 @@ void test_differential(const string& locus_tag,
test));
- shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
meta_data->gene_ids = curr_abundance.gene_id();
meta_data->gene_names = curr_abundance.gene_name();
@@ -1771,7 +1474,7 @@ void test_differential(const string& locus_tag,
inserted = tests.gene_de_tests[i][j].insert(make_pair(desc,
test));
- shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
meta_data->gene_ids = curr_abundance.gene_id();
meta_data->gene_names = curr_abundance.gene_name();
@@ -1809,7 +1512,7 @@ void test_differential(const string& locus_tag,
// The filtered group might be empty, so let's grab metadata from
// the unfiltered group
- shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
meta_data->gene_ids = samples[i]->gene_primary_transcripts[k].gene_id();
meta_data->gene_names = samples[i]->gene_primary_transcripts[k].gene_name();
@@ -1848,7 +1551,7 @@ void test_differential(const string& locus_tag,
// The filtered group might be empty, so let's grab metadata from
// the unfiltered group
- shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
meta_data->gene_ids = samples[i]->gene_cds[k].gene_id();
meta_data->gene_names = samples[i]->gene_cds[k].gene_name();
@@ -1887,7 +1590,7 @@ void test_differential(const string& locus_tag,
// The filtered group might be empty, so let's grab metadata from
// the unfiltered group
- shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
meta_data->gene_ids = samples[i]->primary_transcripts[k].gene_id();
meta_data->gene_names = samples[i]->primary_transcripts[k].gene_name();
@@ -1916,3 +1619,87 @@ void test_differential(const string& locus_tag,
clear_samples_from_tracking_table(samples[i], tracking);
}
}
+
+// Writes "<output_dir>/checked_params.txt": a header line plus one tab-separated row of checked parameters per read group. If the file cannot be opened, warns on stderr and writes nothing.
+void print_checked_params_table(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
+{
+    string param_check_error_out_filename = output_dir + "/checked_params.txt";
+    FILE* param_check_error_out_file = fopen(param_check_error_out_filename.c_str(), "w");
+    if (param_check_error_out_file == NULL) { fprintf(stderr, "Warning: cannot open %s for writing\n", param_check_error_out_filename.c_str()); return; } // never hand a NULL stream to fprintf/fclose
+    fprintf(param_check_error_out_file, "file\tdefault frag length mean\tdefault frag length std dev\tbias correction\tbias mode\tmultiread correction\tmax-mle-iterations\tmin-mle-accuracy\tmax-bundle-frags\tmax-frag-multihits\tno-effective-length-correction\tno-length-correction\n");
+    for (size_t i = 0; i < all_read_groups.size(); ++i)
+    {
+        const CheckedParameters& cp_i = all_read_groups[i]->checked_parameters();
+        fprintf(param_check_error_out_file, "%s\t", all_read_groups[i]->file_path().c_str());
+        fprintf(param_check_error_out_file, "%lg\t", cp_i.frag_len_mean);
+        fprintf(param_check_error_out_file, "%lg\t", cp_i.frag_len_std_dev);
+        fprintf(param_check_error_out_file, "%s\t", cp_i.corr_bias ? "yes" : "no");
+        fprintf(param_check_error_out_file, "%d\t", cp_i.frag_bias_mode);
+        fprintf(param_check_error_out_file, "%s\t", cp_i.corr_multireads ? "yes" : "no");
+        fprintf(param_check_error_out_file, "%lg\t", cp_i.max_mle_iterations);
+        fprintf(param_check_error_out_file, "%lg\t", cp_i.min_mle_accuracy);
+        fprintf(param_check_error_out_file, "%lg\t", cp_i.max_bundle_frags);
+        fprintf(param_check_error_out_file, "%lg\t", cp_i.max_frags_multihits);
+        fprintf(param_check_error_out_file, "%s\t", cp_i.no_effective_length_correction ? "yes" : "no");
+        fprintf(param_check_error_out_file, "%s\t", cp_i.no_length_correction? "yes" : "no");
+        fprintf(param_check_error_out_file, "\n");
+    }
+    fclose(param_check_error_out_file);
+}
+
+// Compares the checked parameters of each adjacent pair of read groups: exits if the reference annotation CRC differs; warns once and dumps checked_params.txt if any other checked parameter differs.
+void validate_cross_sample_parameters(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
+{
+    bool dump_params = false;
+    for (size_t i = 1; i < all_read_groups.size(); ++i)
+    {
+        const CheckedParameters& cp_i = all_read_groups[i - 1]->checked_parameters();
+        const CheckedParameters& cp_j = all_read_groups[i]->checked_parameters();
+        // A reference annotation mismatch is fatal, so it must be checked for every adjacent pair.
+        if (cp_i.ref_gtf_crc != cp_j.ref_gtf_crc)
+        {
+            fprintf(stderr, "Error reference gene annotation differs between samples!\n");
+            fprintf(stderr, "\t%s\t%s:\t%u!\n", all_read_groups[i - 1]->file_path().c_str(), cp_i.ref_gtf_file_path.c_str(), cp_i.ref_gtf_crc);
+            fprintf(stderr, "\t%s\t%s:\t%u!\n", all_read_groups[i]->file_path().c_str(), cp_j.ref_gtf_file_path.c_str(), cp_j.ref_gtf_crc);
+            exit(1);
+        }
+        // Other differences only produce a warning; print it once, on the first mismatching pair.
+        if (!dump_params && cp_i != cp_j)
+        {
+            dump_params = true;
+            fprintf(stderr, "Warning: quantification parameters differ between CXB files!\n\tSee %s/checked_params.txt for more info\n", output_dir.c_str());
+            // No break here: leaving the loop would skip the fatal annotation check for the remaining pairs.
+// fprintf(stderr, "CXB files:\n");
+// fprintf(stderr, "%s:\n", all_read_groups[i - 1]->file_path().c_str());
+// fprintf(stderr, "\tdefault-frag-length-mean:\t%lg\n", cp_i.frag_len_mean);
+// fprintf(stderr, "\tdefault-frag-length-std-dev:\t%lg\n", cp_i.frag_len_std_dev);
+// fprintf(stderr, "\tbias correction:\t%s\n", cp_i.corr_bias ? "yes" : "no");
+// fprintf(stderr, "\tbias mode:\t%d\n", cp_i.frag_bias_mode);
+// fprintf(stderr, "\tmultiread correction:\t%s\n", cp_i.corr_multireads ? "yes" : "no");
+// fprintf(stderr, "\tmax-mle-iterations:\t%lg\n", cp_i.max_mle_iterations);
+// fprintf(stderr, "\tmin-mle-accuracy:\t%lg\n", cp_i.min_mle_accuracy);
+// fprintf(stderr, "\tmax-bundle-frags:\t%lg\n", cp_i.max_bundle_frags);
+// fprintf(stderr, "\tmax-frag-multihits:\t%lg\n", cp_i.max_frags_multihits);
+// fprintf(stderr, "\tno-effective-length-correction:\t%s\n", cp_i.no_effective_length_correction ? "yes" : "no");
+// fprintf(stderr, "\tno-length-correction:\t%s\n", cp_i.no_length_correction? "yes" : "no");
+//
+// fprintf(stderr, "%s\n", all_read_groups[i]->file_path().c_str());
+// fprintf(stderr, "\tdefault-frag-length-mean:\t%lg\n", cp_j.frag_len_mean);
+// fprintf(stderr, "\tdefault-frag-length-std-dev:\t%lg\n", cp_j.frag_len_std_dev);
+// // TODO: add CRCs for reference GTF, mask file, norm standards file if using.
+// fprintf(stderr, "\tbias correction:\t%s\n", cp_j.corr_bias ? "yes" : "no");
+// fprintf(stderr, "\tbias mode:\t%d\n", cp_j.frag_bias_mode);
+// fprintf(stderr, "\tmultiread correction:\t%s\n", cp_j.corr_multireads ? "yes" : "no");
+// fprintf(stderr, "\tmax-mle-iterations:\t%lg\n", cp_j.max_mle_iterations);
+// fprintf(stderr, "\tmin-mle-accuracy:\t%lg\n", cp_j.min_mle_accuracy);
+// fprintf(stderr, "\tmax-bundle-frags:\t%lg\n", cp_j.max_bundle_frags);
+// fprintf(stderr, "\tmax-frag-multihits:\t%lg\n", cp_j.max_frags_multihits);
+// fprintf(stderr, "\tno-effective-length-correction:\t%s\n", cp_j.no_effective_length_correction ? "yes" : "no");
+// fprintf(stderr, "\tno-length-correction:\t%s\n", cp_j.no_length_correction? "yes" : "no");
+        }
+
+    }
+    if (dump_params)
+        print_checked_params_table(all_read_groups);
+}
+
diff --git a/src/differential.h b/src/differential.h
index 7822f3d..fbd5193 100644
--- a/src/differential.h
+++ b/src/differential.h
@@ -27,6 +27,7 @@
#include "abundances.h"
#include "jensen_shannon.h"
#include "replicates.h"
+#include "tracking.h"
using namespace std;
@@ -64,7 +65,7 @@ public:
double p_value;
double corrected_p;
- shared_ptr<SampleDifferenceMetaData> meta_data;
+ boost::shared_ptr<SampleDifferenceMetaData> meta_data;
TestStatus test_status;
bool significant;
@@ -81,7 +82,7 @@ public:
};
typedef map<string, SampleDifference > SampleDiffs;
-typedef map<string, shared_ptr<SampleDifferenceMetaData> > SampleDiffMetaDataTable;
+typedef map<string, boost::shared_ptr<SampleDifferenceMetaData> > SampleDiffMetaDataTable;
struct Outfiles
{
@@ -108,6 +109,11 @@ struct Outfiles
FILE* tss_group_rep_tracking_out;
FILE* gene_rep_tracking_out;
FILE* cds_rep_tracking_out;
+
+ FILE* isoform_attr_out;
+ FILE* tss_group_attr_out;
+ FILE* gene_attr_out;
+ FILE* cds_attr_out;
FILE* run_info_out;
FILE* read_group_info_out;
@@ -127,100 +133,6 @@ struct Tests
vector<vector<SampleDiffs> > diff_cds_tests; // to be performed on the cds groups of a single gene
};
-struct FPKMContext
-{
- FPKMContext(double cm,
- double cv,
- double cuv,
- double cdv,
- const CountPerReplicateTable& cpr,
- double r,
- const FPKMPerReplicateTable& fpr,
- double v,
- double fcl,
- double fch,
- AbundanceStatus s,
- const StatusPerReplicateTable& spr,
- const vector<double>& fs,
- double g)
- : count_mean(cm),
- count_var(cv),
- count_uncertainty_var(cuv),
- count_dispersion_var(cdv),
- count_per_rep(cpr),
- fpkm_per_rep(fpr),
- FPKM(r),
- FPKM_variance(v),
- FPKM_conf_lo(fcl),
- FPKM_conf_hi(fch),
- status(s),
- status_per_rep(spr),
- fpkm_samples(fs),
- gamma(g) {}
-
- double count_mean;
- double count_var;
- double count_uncertainty_var;
- double count_dispersion_var;
- CountPerReplicateTable count_per_rep;
- FPKMPerReplicateTable fpkm_per_rep;
- StatusPerReplicateTable status_per_rep;
- double FPKM;
- double FPKM_variance;
- double FPKM_conf_lo;
- double FPKM_conf_hi;
- AbundanceStatus status;
- vector<double> fpkm_samples;
- double gamma;
-};
-
-struct FPKMTracking
-{
- string locus_tag;
- char classcode;
- set<string> tss_ids; // for individual isoforms only
- set<string> gene_ids;
- set<string> gene_names;
- set<string> protein_ids;
- string description; // isoforms or tss groups (e.g.) involved in this test
- string ref_match;
- int length;
-
- TestStatus test_status;
-
- vector<FPKMContext> fpkm_series;
-};
-
-typedef map<string, FPKMTracking> FPKMTrackingTable;
-
-struct Tracking
-{
- FPKMTrackingTable isoform_fpkm_tracking;
- FPKMTrackingTable tss_group_fpkm_tracking;
- FPKMTrackingTable gene_fpkm_tracking;
- FPKMTrackingTable cds_fpkm_tracking;
-
- void clear()
- {
- isoform_fpkm_tracking.clear();
- tss_group_fpkm_tracking.clear();
- gene_fpkm_tracking.clear();
- cds_fpkm_tracking.clear();
- }
-};
-
-struct SampleAbundances
-{
- string locus_tag;
- AbundanceGroup transcripts;
- vector<AbundanceGroup> primary_transcripts;
- vector<AbundanceGroup> gene_primary_transcripts;
- vector<AbundanceGroup> cds;
- vector<AbundanceGroup> gene_cds;
- vector<AbundanceGroup> genes;
- double cluster_mass;
-};
-
#if ENABLE_THREADS
extern boost::mutex _launcher_lock;
#endif
@@ -249,17 +161,17 @@ public:
void register_locus(const string& locus_id);
void abundance_avail(const string& locus_id,
- shared_ptr<SampleAbundances> ab,
+ boost::shared_ptr<SampleAbundances> ab,
size_t factory_id);
void test_finished_loci();
- void perform_testing(vector<shared_ptr<SampleAbundances> >& abundances);
- void record_tracking_data(vector<shared_ptr<SampleAbundances> >& abundances);
- bool all_samples_reported_in(vector<shared_ptr<SampleAbundances> >& abundances);
+ void perform_testing(vector<boost::shared_ptr<SampleAbundances> > abundances);
+ void record_tracking_data(vector<boost::shared_ptr<SampleAbundances> >& abundances);
+ bool all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances);
bool all_samples_reported_in(const string& locus_id);
void clear_tracking_data() { _tracking->clear(); }
- typedef list<pair<string, vector<shared_ptr<SampleAbundances> > > > launcher_sample_table;
+ typedef list<pair<string, vector<boost::shared_ptr<SampleAbundances> > > > launcher_sample_table;
private:
@@ -276,14 +188,17 @@ private:
extern double min_read_count;
-void sample_worker(const RefSequenceTable& rt,
+void sample_worker(bool non_empty,
+ boost::shared_ptr<HitBundle> bundle,
+ const RefSequenceTable& rt,
ReplicatedBundleFactory& sample_factory,
- shared_ptr<SampleAbundances> abundance,
+ boost::shared_ptr<SampleAbundances> abundance,
size_t factory_id,
- shared_ptr<TestLauncher> launcher);
+ boost::shared_ptr<TestLauncher> launcher,
+ bool calculate_variance);
void test_differential(const string& locus_tag,
- const vector<shared_ptr<SampleAbundances> >& samples,
+ const vector<boost::shared_ptr<SampleAbundances> >& samples,
const vector<pair<size_t, size_t> >& constrasts,
Tests& tests,
Tracking& tracking);
@@ -298,3 +213,5 @@ extern int locus_num_threads;
#endif
#endif
+
+void validate_cross_sample_parameters(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups);
diff --git a/src/filters.cpp b/src/filters.cpp
index 96a9b47..d533fe6 100644
--- a/src/filters.cpp
+++ b/src/filters.cpp
@@ -568,9 +568,9 @@ void filter_hits(int bundle_length,
}
-void filter_junk_isoforms(vector<shared_ptr<Abundance> >& transcripts,
+void filter_junk_isoforms(vector<boost::shared_ptr<Abundance> >& transcripts,
vector<double>& abundances,
- const vector<shared_ptr<Abundance> >& mapped_transcripts,
+ const vector<boost::shared_ptr<Abundance> >& mapped_transcripts,
double locus_mass)
{
// vector<double>::iterator max_ab = std::max_element(abundances.begin(),
@@ -580,7 +580,7 @@ void filter_junk_isoforms(vector<shared_ptr<Abundance> >& transcripts,
for (size_t t = 0; t < transcripts.size(); ++t)
{
- shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
+ boost::shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
if (scaff->strand() == CUFF_FWD || scaff->strand() == CUFF_STRAND_UNKNOWN)
{
if (abundances[t] > max_fwd_ab)
@@ -602,7 +602,7 @@ void filter_junk_isoforms(vector<shared_ptr<Abundance> >& transcripts,
//cerr << "Chucked : ";
for (size_t t = 0; t < transcripts.size(); ++t)
{
- shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
+ boost::shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
if (!(scaff->is_ref()) && allow_junk_filtering)
{
@@ -661,7 +661,7 @@ void filter_junk_isoforms(vector<shared_ptr<Abundance> >& transcripts,
}
}
- vector<shared_ptr<Abundance> > non_junk_transcripts;
+ vector<boost::shared_ptr<Abundance> > non_junk_transcripts;
vector<double> non_junk_abundances;
for (size_t t = 0; t < transcripts.size(); ++t)
{
diff --git a/src/filters.h b/src/filters.h
index ed04859..faa0839 100644
--- a/src/filters.h
+++ b/src/filters.h
@@ -17,9 +17,9 @@
#include "scaffolds.h"
#include "genes.h"
-void filter_junk_isoforms(vector<shared_ptr<Abundance> >& transcripts,
+void filter_junk_isoforms(vector<boost::shared_ptr<Abundance> >& transcripts,
vector<double>& abundances,
- const vector<shared_ptr<Abundance> >& mapped_transcripts,
+ const vector<boost::shared_ptr<Abundance> >& mapped_transcripts,
double locus_mass);
diff --git a/src/genes.cpp b/src/genes.cpp
index e6a2234..b69c678 100644
--- a/src/genes.cpp
+++ b/src/genes.cpp
@@ -12,7 +12,7 @@
using namespace boost;
#if ENABLE_THREADS
-mutex gene_id_lock;
+boost::mutex gene_id_lock;
#endif
int next_isoform_id = 1;
diff --git a/src/gff.cpp b/src/gff.cpp
index 6e72096..1ebc184 100644
--- a/src/gff.cpp
+++ b/src/gff.cpp
@@ -12,7 +12,7 @@ bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings
const int gff_fid_mRNA=0;
const int gff_fid_transcript=1;
const int gff_fid_exon=2;
-const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
+
const uint gfo_flag_HAS_ERRORS = 0x00000001;
const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
const uint gfo_flag_IS_GENE = 0x00000004;
@@ -35,8 +35,15 @@ void gffnames_unref(GffNames* &n) {
if (n->numrefs==0) { delete n; n=NULL; }
}
+
+const char* strExonType(char xtype) {
+ static const char* exon_type_names[7]={"None", "start_codon", "stop_codon", "CDS", "UTR", "CDS_UTR", "exon"};
+ const int type_idx=(int)xtype; //only codes 1..6 map to a name; slot 0 ("None") is never returned
+ if (type_idx<=0 || type_idx>=7) return "NULL";
+ return exon_type_names[type_idx];
+}
+
int gfo_cmpByLoc(const pointer p1, const pointer p2) {
-
GffObj& g1=*((GffObj*)p1);
GffObj& g2=*((GffObj*)p2);
if (g1.gseq_id==g2.gseq_id) {
@@ -232,6 +239,9 @@ GffLine::GffLine(GffReader* reader, const char* l) {
is_cds=true;
is_t_data=true;
}
+ else if (startsWith(fnamelc, "intron") || endsWith(fnamelc, "intron")) {
+ exontype=exgffIntron;
+ }
else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
is_gene=true;
is_t_data=true; //because its name will be attached to parented transcripts
@@ -468,7 +478,7 @@ int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAt
if (gl->exontype==0 && !gl->is_transcript) {
//extraneous mRNA feature, discard
if (reader->gff_warns)
- GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
+ GMessage("Warning: discarding unrecognized transcript subfeature '%s' of %s\n",
gl->ftype, gffID);
return -1;
}
@@ -575,15 +585,24 @@ int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int
expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
return oi;
}
- }
+ }
//only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
// start_codon within (CDS|exon)
- if (exons[oi]->exontype>exontype &&
- exons[oi]->start<=segstart && exons[oi]->end>=segend &&
+ if (exons[oi]->start<=segstart && exons[oi]->end>=segend) {
+ //larger segment given first, now the smaller included one is redundant
+ if (exons[oi]->exontype>exontype &&
!(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
- //larger segment given first, now the smaller included one is redundant
return oi; //only used to store attributes from current GffLine
}
+ else {
+ if (gff_show_warnings && (exons[oi]->start<segstart || exons[oi]->end>segend)) {
+ GMessage("GFF Warning: unusual segment inclusion: %s(%d-%d) within %s(%d-%d) (ID=%s)\n",
+ strExonType(exontype), segstart, segend, strExonType(exons[oi]->exontype),
+ exons[oi]->start, exons[oi]->end, this->gffID);
+ }
+ return oi;
+ }
+ }
if (exontype>exons[oi]->exontype &&
segstart<=exons[oi]->start && segend>=exons[oi]->end &&
!(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
@@ -608,8 +627,8 @@ int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int
if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
if (gff_show_warnings)
- GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
- segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
+ GMessage("GFF Warning: merging overlapping/adjacent feature segment %s (%d-%d) with %s (%d-%d) for GFF ID %s on %s\n",
+ strExonType(exontype), segstart, segend, strExonType(exons[oi]->exontype), exons[oi]->start, exons[oi]->end, gffID, getGSeqName());
expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
return oi;
}
@@ -617,7 +636,7 @@ int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int
//TODO: we might want to add an attribute here with the slippage coordinate and size?
covlen-=ovlen;
}//overlap or adjacent to existing segment
- } //check for overlap
+ } //check for overlap
// --- no overlap, or accepted micro-overlap (ribosomal slippage)
// create & add the new segment
/*
@@ -863,6 +882,9 @@ GffLine* GffReader::nextGffLine() {
if (l==NULL) {
return NULL; //end of file
}
+
+
+ _crc_result.process_bytes( linebuf, llen );
int ns=0; //first nonspace position
while (l[ns]!=0 && isspace(l[ns])) ns++;
if (l[ns]=='#' || llen<10) continue;
@@ -902,38 +924,36 @@ void GffReader::gfoRemove(const char* id, const char* ctg) {
GFREE(buf);
}
*/
-GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) {
- //Warning: if gflst gets altered, idx becomes obsolete
- GVec<GfoHolder>* glst=phash.Find(gfo->gffID);
+GffObj* GffReader::gfoAdd(GffObj* gfo) {
+ GPVec<GffObj>* glst=phash.Find(gfo->gffID);
if (glst==NULL)
- glst=new GVec<GfoHolder>(1);
- GfoHolder gh(gfo,idx);
- int i=glst->Add(gh);
+ glst=new GPVec<GffObj>(false);
+ //GfoHolder gh(gfo); //,idx);
+ int i=glst->Add(gfo);
phash.Add(gfo->gffID, glst);
- return &(glst->Get(i));
+ return glst->Get(i);
}
-GfoHolder* GffReader::gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx) {
- GfoHolder gh(gfo,idx);
- int i=glst.Add(gh);
- return &(glst[i]);
+GffObj* GffReader::gfoAdd(GPVec<GffObj>& glst, GffObj* gfo) {
+ int i=glst.Add(gfo);
+ return glst[i];
}
-GfoHolder* GffReader::gfoFind(const char* id, const char* ctg,
- GVec<GfoHolder>** glst, char strand, uint start, uint end) {
- GVec<GfoHolder>* gl=phash.Find(id);
- GfoHolder* gh=NULL;
+GffObj* GffReader::gfoFind(const char* id, const char* ctg,
+ GPVec<GffObj>** glst, char strand, uint start, uint end) {
+ GPVec<GffObj>* gl=phash.Find(id);
+ GffObj* gh=NULL;
if (gl) {
for (int i=0;i<gl->Count();i++) {
- GfoHolder& gfo = gl->Get(i);
- if (ctg!=NULL && strcmp(ctg, gfo.gffobj->getGSeqName())!=0)
+ GffObj& gfo = *(gl->Get(i));
+ if (ctg!=NULL && strcmp(ctg, gfo.getGSeqName())!=0)
continue;
- if (strand && gfo.gffobj->strand!='.' && strand != gfo.gffobj->strand)
+ if (strand && gfo.strand!='.' && strand != gfo.strand)
continue;
if (start>0) {
- if (abs((int)start-(int)gfo.gffobj->start)> (int)GFF_MAX_LOCUS)
+ if (abs((int)start-(int)gfo.start)> (int)GFF_MAX_LOCUS)
continue;
- if (end>0 && (gfo.gffobj->start>end || gfo.gffobj->end<start))
+ if (end>0 && (gfo.start>end || gfo.end<start))
continue;
}
//must be the same transcript, according to given comparison criteria
@@ -944,55 +964,44 @@ GfoHolder* GffReader::gfoFind(const char* id, const char* ctg,
if (glst) *glst=gl;
return gh;
}
-
-GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
+/*
+GffObj* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
- GfoHolder* r=NULL;
+ GffObj* r=NULL;
if (replaceidx>=0) {
gflst.Put(replaceidx,newgfo);
- r=gfoAdd(newgfo, replaceidx);
+ r=gfoAdd(newgfo);
}
else {
int gfoidx=gflst.Add(newgfo);
- r=gfoAdd(newgfo, gfoidx);
+ r=gfoAdd(newgfo);
}
- /*
- if (gff_warns) {
- int* pcount=tids.Find(newgfo->gffID);
- if (pcount!=NULL) {
- if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
- (*pcount)++;
- }
- else {
- tids.Add(newgfo->gffID,new int(1));
- }
- }
- */
return r;
-}
+} */
-GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
+GffObj* GffReader::updateParent(GffObj* newgfo, GffObj* parent) {
//assert(parent);
//assert(newgfo);
- parent->children.Add(newgfh->gffobj);
- if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
- newgfh->gffobj->setLevel(parent->getLevel()+1);
+ parent->children.Add(newgfo);
+ if (newgfo->parent==NULL) newgfo->parent=parent;
+ newgfo->setLevel(parent->getLevel()+1);
if (parent->isGene()) {
- if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
- newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
- if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
- newgfh->gffobj->geneID=Gstrdup(parent->geneID);
+ if (parent->gene_name!=NULL && newgfo->gene_name==NULL)
+ newgfo->gene_name=Gstrdup(parent->gene_name);
+ if (parent->geneID!=NULL && newgfo->geneID==NULL)
+ newgfo->geneID=Gstrdup(parent->geneID);
}
- return newgfh;
+ return newgfo;
}
-GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
- GffObj* parent, GffExon* pexon, GVec<GfoHolder>* glst) {
+GffObj* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
+ GffObj* parent, GffExon* pexon, GPVec<GffObj>* glst) {
GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
- GfoHolder* r=NULL;
- int gfoidx=gflst.Add(newgfo);
- r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) : gfoAdd(newgfo, gfoidx);
+ GffObj* r=NULL;
+ //int gfoidx=gflst.Add(newgfo);
+ gflst.Add(newgfo);
+ r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo);
if (parent!=NULL) {
updateParent(r, parent);
if (pexon!=NULL) parent->removeExon(pexon);
@@ -1012,48 +1021,48 @@ GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr
return r;
}
-GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
+GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline,
bool keepAttr) {
if (prevgfo==NULL) return NULL;
//prevgfo->gffobj->createdByExon(false);
- prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
- prevgfo->gffobj->start=gffline->fstart;
- prevgfo->gffobj->end=gffline->fend;
- prevgfo->gffobj->isGene(gffline->is_gene);
- prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
- prevgfo->gffobj->hasGffID(gffline->ID!=NULL);
+ prevgfo->ftype_id=prevgfo->names->feats.addName(gffline->ftype);
+ prevgfo->start=gffline->fstart;
+ prevgfo->end=gffline->fend;
+ prevgfo->isGene(gffline->is_gene);
+ prevgfo->isTranscript(gffline->is_transcript || gffline->exontype!=0);
+ prevgfo->hasGffID(gffline->ID!=NULL);
if (keepAttr) {
- if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
- prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
+ if (prevgfo->attrs!=NULL) prevgfo->attrs->Clear();
+ prevgfo->parseAttrs(prevgfo->attrs, gffline->info);
}
return prevgfo;
}
-bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
+bool GffReader::addExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
bool r=true;
- if (gffline->strand!=prevgfo->gffobj->strand) {
- if (prevgfo->gffobj->strand=='.') {
- prevgfo->gffobj->strand=gffline->strand;
+ if (gffline->strand!=prevgfo->strand) {
+ if (prevgfo->strand=='.') {
+ prevgfo->strand=gffline->strand;
}
else {
GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
- prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
- gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
+ prevgfo->gffID, prevgfo->strand,
+ gffline->fstart, gffline->fend, gffline->strand, prevgfo->getGSeqName());
//r=false;
return true;
}
}
- int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
- ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
+ int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
+ ((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
0 );
if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
- GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
+ GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
//validation_errors = true;
r=false;
if (!gff_warns) exit(1);
}
- int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
+ int eidx=prevgfo->addExon(this, gffline, !noExonAttr, noExonAttr);
if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
subfPoolAdd(pex, prevgfo);
return r;
@@ -1074,57 +1083,60 @@ CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*
return NULL;
}
-void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
+void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo) {
//this might become a parent feature later
-if (newgfo->gffobj->exons.Count()>0) {
+if (newgfo->exons.Count()>0) {
char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
- pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
- newgfo->gffobj->exons[0], gffline));
+ pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], gffline));
GFREE(xbuf);
}
}
-GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
+GffObj* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
bool keepAttr, bool noExonAttr) {
GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
- if (prevp!=gflst[subp->idx])
- GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
+ //if (prevp!=gflst[subp->idx])
+ // GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
subp->gffline->discardParent();
- GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
+ GffObj* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
prevp->promotedChildren(true);
return gfoh; //returns the holder of newly promoted feature
}
-//have to parse the whole file because exons can be scattered all over
-//trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry
+//have to parse the whole file because exons and other subfeatures can be scattered, unordered in the input
+//Trans-splicing and fusions are only accepted in proper GFF3 format, i.e. multiple features with the same ID
+//are accepted if they are NOT overlapping/continuous
+// *** BUT (exception): proximal xRNA features with the same ID, on the same strand, will be merged
+// and the segments will be treated like exons (e.g. TRNAR15 (rna1940) in RefSeq)
void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
bool validation_errors = false;
//loc_debug=false;
GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
//and thus could become promoted to parent features
while (nextGffLine()!=NULL) {
- GfoHolder* prevseen=NULL;
- GVec<GfoHolder>* prevgflst=NULL;
+ GffObj* prevseen=NULL;
+ GPVec<GffObj>* prevgflst=NULL;
if (gffline->ID && gffline->exontype==0) {
//>> for a parent-like IDed feature (mRNA, gene, etc.)
//look for same ID on the same chromosome/strand/locus
prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart);
if (prevseen!=NULL) {
//same ID/chromosome combo encountered before
- if (prevseen->gffobj->createdByExon() &&
- prevseen->gffobj->start>=gffline->fstart &&
- prevseen->gffobj->end<=gffline->fend) {
+ if (prevseen->createdByExon()) {
+ if (gff_show_warnings && (prevseen->start<gffline->fstart ||
+ prevseen->end>gffline->fend))
+ GMessage("GFF Warning: invalid coordinates for %s parent feature (ID=%s)\n", gffline->ftype, gffline->ID);
//an exon of this ID was given before
//this line has the main attributes for this ID
- updateGffRec(prevseen, gffline, keepAttr);
- }
+ updateGffRec(prevseen, gffline, keepAttr);
+ }
else {
- //- duplicate ID -- this must be a discontiguous feature
+ //- duplicate ID -- this must be a discontinuous feature according to GFF3 specs
// e.g. a trans-spliced transcript
- if (prevseen->gffobj->overlap(gffline->fstart, gffline->fend)) {
+ if (prevseen->overlap(gffline->fstart, gffline->fend)) {
//overlapping with same ID not allowed
- GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
+ GMessage("GFF Error: duplicate/invalid '%s' feature ID=%s\n", gffline->ftype, gffline->ID);
//validation_errors = true;
if (gff_warns) {
delete gffline;
@@ -1134,23 +1146,37 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
else exit(1);
}
//create a new entry with the same ID
- prevseen=newGffRec(gffline, keepAttr, noExonAttr,
- prevseen->gffobj->parent, NULL, prevgflst);
+ int distance=INT_MAX;
+ if (prevseen->isTranscript() && prevseen->strand==gffline->strand) {
+ if (prevseen->start>=gffline->fstart)
+ distance=prevseen->start-gffline->fend;
+ else
+ distance=gffline->fstart-prevseen->end;
+ }
+ if (distance<1000) {//FIXME: arbitrary proximity threshold (yuck)
+ //exception: make this an exon of previous ID
+ //addExonFeature(prevseen, gffline, pex, noExonAttr);
+ prevseen->addExon(this, gffline, false, true);
+ }
+ else { //create a separate entry (true discontinuous feature)
+ prevseen=newGffRec(gffline, keepAttr, noExonAttr,
+ prevseen->parent, NULL, prevgflst);
+ }
} //duplicate ID on the same chromosome
} //prevseeen != NULL
- } //parent-like ID feature
+ } //parent-like ID feature
if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
}
else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent)
//updates all the declared parents with this child
bool found_parent=false;
- GfoHolder* newgfo=prevseen;
- GVec<GfoHolder>* newgflst=NULL;
+ GffObj* newgfo=prevseen;
+ GPVec<GffObj>* newgflst=NULL;
for (int i=0;i<gffline->num_parents;i++) {
if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
continue; //skipping discarded parent feature
- GfoHolder* parentgfo=NULL;
+ GffObj* parentgfo=NULL;
if (gffline->is_transcript || gffline->exontype==0) {//possibly a transcript
parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
&newgflst, gffline->strand, gffline->fstart, gffline->fend);
@@ -1163,21 +1189,24 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
}
if (parentgfo!=NULL) { //parent GffObj parsed earlier
found_parent=true;
- if (parentgfo->gffobj->isGene() && gffline->is_transcript
+ if (parentgfo->isGene() && gffline->is_transcript
&& gffline->exontype==0) {
//not an exon, but a transcript parented by a gene
if (newgfo) {
- updateParent(newgfo, parentgfo->gffobj);
+ updateParent(newgfo, parentgfo);
}
else {
- newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
+ newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo);
}
+ }
+ else { //potential exon subfeature?
+ //always discards dummy "intron" features
+ if (!(gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0))) {
+ if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
+ validation_errors=true;
}
- else { //potential exon subfeature
- if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
- validation_errors=true;
- }
- } //overlapping parent feature found
+ }
+ } //overlapping parent feature found
} //for each parsed parent Id
if (!found_parent) { //new GTF-like record starting here with a subfeature directly
//or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
@@ -1186,20 +1215,20 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
if (subp!=NULL) { //found a subfeature that is the parent of this gffline
//promote that subfeature to a full GffObj
- GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
+ GffObj* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
//add current gffline as an exon of the newly promoted subfeature
if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
validation_errors=true;
}
else { //no parent seen before,
//loc_debug=true;
- GfoHolder* ngfo=prevseen;
+ GffObj* ngfo=prevseen;
if (ngfo==NULL) {
//if it's an exon type, create directly the parent with this exon
//but if it's recognized as a transcript, the object itself is created
ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
}
- if (!ngfo->gffobj->isTranscript() &&
+ if (!ngfo->isTranscript() &&
gffline->ID!=NULL && gffline->exontype==0)
subfPoolAdd(pex, ngfo);
//even those with errors will be added here!
@@ -1239,10 +1268,19 @@ void GfList::finalize(GffReader* gfr, bool mergeCloseExons,
fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
if (fList[i]->isDiscarded()) {
discarded.Add(fList[i]);
+ if (fList[i]->children.Count()>0) {
+ for (int c=0;c<fList[i]->children.Count();c++) {
+ fList[i]->children[c]->parent=NULL;
+ if (keepAttrs)
+ fList[i]->children[c]->copyAttrs(fList[i]); //inherit the attributes of discarded parent (e.g. pseudo=true; )
+ }
+ }
this->Forget(i);
}
}
- if (discarded.Count()>0) this->Pack();
+ if (discarded.Count()>0) {
+ this->Pack();
+ }
}
GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
@@ -1387,6 +1425,34 @@ void GffObj::addAttr(const char* attrname, const char* attrvalue) {
this->attrs->add_or_update(names, attrname, attrvalue);
}
+void GffObj::copyAttrs(GffObj* from) { //typically from is the parent gene, and this is a transcript
+  if (from==NULL || from->attrs==NULL) return; //nothing to inherit
+  if (this->attrs==NULL) {
+    this->attrs=new GffAttrs();
+  }
+  //special RefSeq case: the gene "description" may just repeat the transcript "product"
+  int desc_attr_id=names->attrs.getId("description"); //from gene
+  int prod_attr_id=names->attrs.getId("product"); //from transcript (this)
+  char* prod = (prod_attr_id>=0) ? this->attrs->getAttr(prod_attr_id) : NULL;
+  //copy every attribute of 'from' that 'this' lacks; values already present are never replaced
+  for (int i=0;i<from->attrs->Count();++i) {
+    GffAttr* src=from->attrs->Get(i); //attribute of 'from' currently considered
+    int aid=src->attr_id;
+    //special case for GenBank refseq genes vs transcripts:
+    if (prod && aid==desc_attr_id && strcmp(from->attrs->getAttr(desc_attr_id), prod)==0)
+      continue; //skip description if product already there and the same
+    bool haveit=false;
+    for (int ai=0;ai<this->attrs->Count();++ai) {
+      //do we have it already? (index with ai, the position in this->attrs, not with i)
+      if (aid==this->attrs->Get(ai)->attr_id) {
+        haveit=true;
+        break; //skip this, don't replace
+      }
+    }
+    if (!haveit)
+      this->attrs->Add(new GffAttr(aid, src->attr_val));
+  }
+}
void GffObj::setFeatureName(const char* feature) {
//change the feature name/type for a transcript
@@ -1803,33 +1869,70 @@ void GffObj::printSummary(FILE* fout) {
strand, start, end, gscore, (float)qcov/10.0);
}
+void decodeHexChars(char* dbuf, const char* s, int maxlen=1023) { //copy s to dbuf, decoding %XX escapes
+  int dlen=0; //chars written so far; dbuf must have room for maxlen+1 chars
+  dbuf[0]=0;
+  if (s==NULL) return; //NULL input yields an empty string
+  for (const char* p=s;(*p)!=0 && dlen<maxlen;++p) {
+    //unsigned char casts: handing isxdigit() a negative char (byte >=0x80) is undefined behavior
+    if (p[0]=='%' && isxdigit((unsigned char)p[1]) && isxdigit((unsigned char)p[2])) {
+      int a=p[1];
+      if (a>'Z') a^=0x20; //toupper()
+      if (a>'9') a=10+(a-'A');
+      else a-='0';
+      int b=p[2];
+      if (b>'Z') b^=0x20; //toupper()
+      if (b>'9') b=10+(b-'A');
+      else b-='0';
+      char c=(char)((a<<4)+b);
+      if (c==';') c='.'; //presumably because ';' separates attributes in the printed output
+      if (c>' ') { //codes <=' ' (and bytes >=0x80 where char is signed) are left encoded
+        dbuf[dlen]=c;
+        p+=2; //skip the two hex digits just consumed
+        ++dlen;
+        continue;
+      }
+    }
+    dbuf[dlen]=*p; //ordinary character, or the '%' of an escape that is left as-is
+    ++dlen;
+  }
+  dbuf[dlen]=0;
+}
+
void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
- uint segstart, uint segend, int exidx, char phase, bool gff3) {
- static char scorestr[14];
- strcpy(scorestr,".");
+ uint segstart, uint segend, int exidx, char phase, bool gff3, bool cvtChars) {
+ char dbuf[1024];
+ strcpy(dbuf,".");
GffAttrs* xattrs=NULL;
if (exidx>=0) {
- if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
+ if (exons[exidx]->score) sprintf(dbuf,"%.2f", exons[exidx]->score);
xattrs=exons[exidx]->attrs;
}
if (phase==0 || !iscds) phase='.';
const char* ftype=iscds ? "CDS" : getSubfName();
+ const char* attrname=NULL;
+ const char* attrval=NULL;
if (gff3) {
fprintf(fout,
"%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
- gseqname, tlabel, ftype, segstart, segend, scorestr, strand,
+ gseqname, tlabel, ftype, segstart, segend, dbuf, strand,
phase, gffID);
if (xattrs!=NULL) {
- for (int i=0;i<xattrs->Count();i++)
- fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id),
- xattrs->Get(i)->attr_val);
- }
+ for (int i=0;i<xattrs->Count();i++) {
+ attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
+ if (cvtChars) {
+ decodeHexChars(dbuf, xattrs->Get(i)->attr_val);
+ fprintf(fout,";%s=%s", attrname, dbuf);
+ } else {
+ fprintf(fout,";%s=%s", attrname, xattrs->Get(i)->attr_val);
+ }
+ }
+ }
fprintf(fout, "\n");
- } //GFF
+ } //GFF3
else {//for GTF -- we print only transcripts
//if (isValidTranscript())
fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
- gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
+ gseqname, tlabel, ftype, segstart, segend, dbuf, strand, phase, gffID);
//char* geneid=(geneID!=NULL)? geneID : gffID;
if (geneID)
fprintf(fout," gene_id \"%s\";",geneID);
@@ -1842,22 +1945,33 @@ void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
if (xattrs!=NULL) {
for (int i=0;i<xattrs->Count();i++) {
if (xattrs->Get(i)->attr_val==NULL) continue;
- const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
+ attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
fprintf(fout, " %s ",attrname);
- if (xattrs->Get(i)->attr_val[0]=='"')
- fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
- else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
+ if (cvtChars) {
+ decodeHexChars(dbuf, xattrs->Get(i)->attr_val);
+ attrval=dbuf;
+ } else {
+ attrval=xattrs->Get(i)->attr_val;
+ }
+
+ if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
+ else fprintf(fout, "\"%s\";",attrval);
}
}
//for GTF, also append the GffObj attributes to each exon line
if ((xattrs=this->attrs)!=NULL) {
for (int i=0;i<xattrs->Count();i++) {
if (xattrs->Get(i)->attr_val==NULL) continue;
- const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
+ attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
fprintf(fout, " %s ",attrname);
- if (xattrs->Get(i)->attr_val[0]=='"')
- fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
- else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
+ if (cvtChars) {
+ decodeHexChars(dbuf, xattrs->Get(i)->attr_val);
+ attrval=dbuf;
+ } else {
+ attrval=xattrs->Get(i)->attr_val;
+ }
+ if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
+ else fprintf(fout, "\"%s\";",attrval);
}
}
fprintf(fout, "\n");
@@ -1865,8 +1979,9 @@ void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
}
void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
- const char* tlabel, const char* gfparent) {
- static char tmpstr[255];
+ const char* tlabel, const char* gfparent, bool cvtChars) {
+ //char tmpstr[255];
+ char dbuf[1024];
if (tlabel==NULL) {
tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
(char*)"gffobj" ;
@@ -1879,8 +1994,8 @@ void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
if (gff3) {
//print GFF3 mRNA line:
- if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore);
- else strcpy(tmpstr,".");
+ if (gscore>0.0) sprintf(dbuf,"%.2f", gscore);
+ else strcpy(dbuf,".");
uint pstart, pend;
if (gffp==pgffCDS) {
pstart=CDstart;
@@ -1891,8 +2006,8 @@ void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
const char* ftype=getFeatureName();
fprintf(fout,
"%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
- gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
- if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
+ gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID);
+ if (CDstart>0 && !showCDS/* && !isCDS*/) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
if (gfparent!=NULL) {
//parent override
fprintf(fout, ";Parent=%s",gfparent);
@@ -1908,28 +2023,39 @@ void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
if (attrs!=NULL) {
for (int i=0;i<attrs->Count();i++) {
const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
- fprintf(fout,";%s=%s", attrname,
- attrs->Get(i)->attr_val);
+ if (cvtChars) {
+ decodeHexChars(dbuf, attrs->Get(i)->attr_val);
+ fprintf(fout,";%s=%s", attrname, dbuf);
+ } else {
+ fprintf(fout,";%s=%s", attrname, attrs->Get(i)->attr_val);
}
}
- fprintf(fout,"\n");
+ }
+ fprintf(fout,"\n");
}// gff3 mRNA line
+ bool is_cds_only = (gffp==pgffBoth) ? false : isCDS;
if (showExon) {
//print exons
- if (isCDS && exons.Count()>0 &&
+ if (isCDS && exons.Count()>0 &&
((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
updateExonPhase();
-
for (int i=0;i<exons.Count();i++) {
- printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
+ printGxfLine(fout, tlabel, gseqname, is_cds_only, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3, cvtChars);
}
}//printing exons
- if (showCDS && !isCDS && CDstart>0) {
- GArray<GffCDSeg> cds(true,true);
- getCDSegs(cds);
- for (int i=0;i<cds.Count();i++) {
- printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3);
- }
+ if (showCDS && !is_cds_only && CDstart>0) {
+ if (isCDS) {
+ for (int i=0;i<exons.Count();i++) {
+ printGxfLine(fout, tlabel, gseqname, true, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3, cvtChars);
+ }
+ }
+ else {
+ GArray<GffCDSeg> cds(true,true);
+ getCDSegs(cds);
+ for (int i=0;i<cds.Count();i++) {
+ printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3, cvtChars);
+ }
+ }
} //showCDS
}
diff --git a/src/gff.h b/src/gff.h
index 51136d9..200fa52 100644
--- a/src/gff.h
+++ b/src/gff.h
@@ -8,6 +8,8 @@
#include "GList.hh"
#include "GHash.hh"
+#include <boost/crc.hpp> // for boost::crc_32_type
+
/*
const byte exMskMajSpliceL = 0x01;
const byte exMskMajSpliceR = 0x02;
@@ -20,8 +22,7 @@ const byte exMskTag = 0x80;
extern const int gff_fid_mRNA; // "mRNA" feature name
extern const int gff_fid_transcript; // *RNA, *transcript feature name
extern const int gff_fid_exon;
-extern const int gff_fid_CDS; //never really used, except for display only
- //use gff_fid_exon instead
+
extern const uint GFF_MAX_LOCUS;
extern const uint GFF_MAX_EXON;
extern const uint GFF_MAX_INTRON;
@@ -46,7 +47,8 @@ extern bool gff_show_warnings;
enum GffExonType {
- exgffNone=0, //not a recognizable exon or CDS segment
+ exgffIntron=-1, // useless "intron" feature
+ exgffNone=0, //not a recognizable exon or CDS segment
exgffStart, //from "start_codon" feature (within CDS)
exgffStop, //from "stop_codon" feature (may be outside CDS)
exgffCDS, //from "CDS" feature
@@ -55,6 +57,8 @@ enum GffExonType {
exgffExon, //from "exon" feature
};
+const char* strExonType(char xtype);
+
class GffReader;
class GffLine {
@@ -99,9 +103,16 @@ class GffLine {
parents=NULL;
}
char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false);
- GffLine(GffLine* l) { //a copy constructor
+ GffLine(GffLine* l):_parents(NULL), _parents_len(0),
+ dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
+ ftype(NULL), info(NULL), fstart(0), fend(0), qstart(0), qend(0), qlen(0),
+ score(0), strand(0), skip(true), is_gff3(false), is_cds(false), is_exon(false),
+ exontype(0), is_transcript(false), is_gene(false), phase(0),
+ gene_name(NULL), gene_id(NULL),
+ parents(NULL), num_parents(0), ID(NULL) { //a copy constructor
+ if (l==NULL || l->line==NULL)
+ GError("Error: invalid GffLine(l)\n");
memcpy((void*)this, (void*)l, sizeof(GffLine));
- line=NULL;
GMALLOC(line, llen+1);
memcpy(line, l->line, llen+1);
GMALLOC(dupline, llen+1);
@@ -111,12 +122,13 @@ class GffLine {
track=line+(l->track-l->line);
ftype=line+(l->ftype-l->line);
info=line+(l->info-l->line);
- //Parent=Gstrdup(l->Parent);
- if (l->_parents_len>0) {
- _parents_len=l->_parents_len;
+ if (num_parents>0 && parents) {
+ parents=NULL; //re-init, just copied earlier
+ GMALLOC(parents, num_parents*sizeof(char*));
+ //_parents_len=l->_parents_len; copied above
+ _parents=NULL; //re-init, forget pointer copy
GMALLOC(_parents, _parents_len);
memcpy(_parents, l->_parents, _parents_len);
- num_parents=l->num_parents;
for (int i=0;i<num_parents;i++) {
parents[i]=_parents+(l->parents[i] - l->_parents);
}
@@ -128,34 +140,13 @@ class GffLine {
if (l->gene_id!=NULL)
gene_id=Gstrdup(l->gene_id);
}
- GffLine() {
- line=NULL;
- dupline=NULL;
- gseqname=NULL;
- track=NULL;
- ftype=NULL;
- fstart=0;
- fend=0;
- strand=0;phase=0;
- llen=0;score=0;
- info=NULL;
- _parents=NULL;
- _parents_len=0;
- parents=NULL;
- num_parents=0;
- ID=NULL;
- gene_name=NULL;
- gene_id=NULL;
- skip=true;
- qstart=0;
- qend=0;
- qlen=0;
- exontype=0;
- is_cds=false;
- is_gff3=false;
- is_transcript=false;
- is_gene=false;
- is_exon=false;
+ GffLine():_parents(NULL), _parents_len(0),
+ dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
+ ftype(NULL), info(NULL), fstart(0), fend(0), qstart(0), qend(0), qlen(0),
+ score(0), strand(0), skip(true), is_gff3(false), is_cds(false), is_exon(false),
+ exontype(0), is_transcript(false), is_gene(false), phase(0),
+ gene_name(NULL), gene_id(NULL),
+ parents(NULL), num_parents(0), ID(NULL) {
}
~GffLine() {
GFREE(dupline);
@@ -221,9 +212,8 @@ class GffNames;
class GffNameInfo {
friend class GffNameList;
-protected:
- int idx;
public:
+ int idx;
char* name;
GffNameInfo(const char* n=NULL):idx(-1),name(NULL) {
if (n) name=Gstrdup(n);
@@ -312,11 +302,11 @@ class GffNames {
GffNames():tracks(),gseqs(),attrs(), feats() {
numrefs=0;
//the order below is critical!
- //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
+ //has to match: gff_fid_mRNA, gff_fid_exon
feats.addStatic("mRNA");//index 0=gff_fid_mRNA
feats.addStatic("transcript");//index 1=gff_fid_transcript
feats.addStatic("exon");//index 1=gff_fid_exon
- feats.addStatic("CDS"); //index 2=gff_fid_CDS
+ //feats.addStatic("CDS"); //index 2=gff_fid_CDS
}
};
@@ -355,7 +345,7 @@ class GffAttrs:public GList<GffAttr> {
}
this->Add(new GffAttr(aid, val));
}
-
+
char* getAttr(GffNames* names, const char* attrname) {
int aid=names->attrs.getId(attrname);
if (aid>=0)
@@ -549,6 +539,7 @@ public:
GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
//if gfline->Parent!=NULL then this will also add the first sub-feature
// otherwise, only the main feature is created
+ void copyAttrs(GffObj* from);
void clearAttrs() {
if (attrs!=NULL) {
bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
@@ -603,9 +594,9 @@ public:
//complete parsing: must be called in order to merge adjacent/close proximity subfeatures
void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false);
const char* getSubfName() { //returns the generic feature type of the entries in exons array
- int sid=exon_ftype_id;
- if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
- return names->feats.getName(sid);
+ //int sid=exon_ftype_id;
+ //if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
+ return names->feats.getName(exon_ftype_id);
}
void addCDS(uint cd_start, uint cd_end, char phase=0);
@@ -856,20 +847,20 @@ public:
void updateExonPhase(); //for CDS-only features, updates GExon::phase
void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
- bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3);
+ bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3, bool cvtChars=false);
void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
- const char* tlabel=NULL, const char* gfparent=NULL);
- void printGtf(FILE* fout, const char* tlabel=NULL) {
- printGxf(fout, pgtfAny, tlabel);
+ const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false);
+ void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) {
+ printGxf(fout, pgtfAny, tlabel, NULL, cvtChars);
}
void printGff(FILE* fout, const char* tlabel=NULL,
- const char* gfparent=NULL) {
- printGxf(fout, pgffAny, tlabel, gfparent);
+ const char* gfparent=NULL, bool cvtChars=false) {
+ printGxf(fout, pgffAny, tlabel, gfparent, cvtChars);
}
void printTranscriptGff(FILE* fout, char* tlabel=NULL,
- bool showCDS=false, const char* gfparent=NULL) {
+ bool showCDS=false, const char* gfparent=NULL, bool cvtChars=false) {
if (isValidTranscript())
- printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent);
+ printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent, cvtChars);
}
void printSummary(FILE* fout=NULL);
void getCDS_ends(uint& cds_start, uint& cds_end);
@@ -964,26 +955,27 @@ class GfList: public GList<GffObj> {
}
};
-
+/*
struct GfoHolder {
- int idx; //position in GffReader::gflst array
+ //int idx; //position in GffReader::gflst array
GffObj* gffobj;
- GfoHolder(GffObj* gfo=NULL, int i=0) {
- idx=i;
+ GfoHolder(GffObj* gfo=NULL) { //, int i=0) {
+ //idx=i;
gffobj=gfo;
}
};
-
+*/
class CNonExon { //utility class used in subfeature promotion
public:
- int idx;
+ //int idx;
GffObj* parent;
GffExon* exon;
GffLine* gffline;
- CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
+ //CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
+ CNonExon(GffObj* p, GffExon* e, GffLine* gl) {
parent=p;
exon=e;
- idx=i;
+ //idx=i;
gffline=new GffLine(gl);
}
~CNonExon() {
@@ -1002,34 +994,37 @@ class GffReader {
bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes
FILE* fh;
char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
- GffNames* names; //just a pointer to the global static Gff names repository in GffObj
GffLine* gffline;
bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
// of discarded parent IDs
- GHash< GVec<GfoHolder> > phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
+ GHash< GPVec<GffObj> > phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
//GHash<int> tids; //just for transcript_id uniqueness
char* gfoBuildId(const char* id, const char* ctg);
//void gfoRemove(const char* id, const char* ctg);
- GfoHolder* gfoAdd(GffObj* gfo, int idx);
- GfoHolder* gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx);
+ GffObj* gfoAdd(GffObj* gfo);
+ GffObj* gfoAdd(GPVec<GffObj>& glst, GffObj* gfo);
// const char* id, const char* ctg, char strand, GVec<GfoHolder>** glst, uint start, uint end
- GfoHolder* gfoFind(const char* id, const char* ctg=NULL, GVec<GfoHolder>** glst=NULL,
+ GffObj* gfoFind(const char* id, const char* ctg=NULL, GPVec<GffObj>** glst=NULL,
char strand=0, uint start=0, uint end=0);
CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name);
- void subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo);
- GfoHolder* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
+ void subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo);
+ GffObj* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
bool keepAttr, bool noExonAttr);
GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
+
+ boost::crc_32_type _crc_result;
+
public:
+ GffNames* names; //just a pointer to the global static Gff names repository in GffObj
GfList gflst; //accumulate GffObjs being read
- GfoHolder* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
- GffObj* parent=NULL, GffExon* pexon=NULL, GVec<GfoHolder>* glst=NULL);
- GfoHolder* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
- GfoHolder* updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
+ GffObj* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
+ GffObj* parent=NULL, GffExon* pexon=NULL, GPVec<GffObj>* glst=NULL);
+ //GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
+ GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline,
bool keepAttr);
- GfoHolder* updateParent(GfoHolder* newgfh, GffObj* parent);
- bool addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
+ GffObj* updateParent(GffObj* newgfh, GffObj* parent);
+ bool addExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
GPVec<GSeqStat> gseqStats; //only populated after finalize()
GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true),
phash(true), gseqstats(true,true,true), gflst(sortbyloc), gseqStats(1, false) {
@@ -1081,12 +1076,13 @@ class GffReader {
gff_warns=v;
gff_show_warnings=v;
}
-
+
GffLine* nextGffLine();
// load all subfeatures, re-group them:
void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true);
+ boost::crc_32_type current_crc_result() const { return _crc_result; }
}; // end of GffReader
#endif
diff --git a/src/gff_utils.cpp b/src/gff_utils.cpp
index 0907c62..419b153 100644
--- a/src/gff_utils.cpp
+++ b/src/gff_utils.cpp
@@ -341,9 +341,15 @@ void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
}
}
-void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
+bool exonOverlap2Gene(GffObj* t, GffObj& g) { //does feature t overlap gene g?
+  //no exons recorded for t: compare the two features as whole objects
+  if (t->exons.Count()<=0) return g.overlap(*t);
+  //otherwise defer to the exon-level overlap test against the gene's start..end interval
+  return t->exonOverlap(g.start, g.end);
+}
+void GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
bool matchAllIntrons, bool fuzzSpan) {
- GTData* tdata=new GTData(t);
+ GTData* tdata=new GTData(t); //additional transcript data
gdata->tdata.Add(tdata);
//int tidx=-1;
/*
@@ -353,12 +359,47 @@ void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedu
}
else debugState=false;
*/
+ //dumb TRNA case for RefSeq: gene parent link missing
+ //try to restore it here; BUT this only works if gene feature comes first
+ if (t->parent==NULL && t->isTranscript()) {
+ int gidx=gdata->gfs.Count()-1;
+ while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) {
+ GffObj& g = *(gdata->gfs[gidx]);
+ if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) {
+ g.children.Add(t);
+ t->parent=&g;
+ //disable printing of gene if transcriptsOnly
+ if (transcriptsOnly) {
+ g.udata|=4; //tag it as non-printable
+ }
+ const char* geneName=g.getAttr("Name");
+ if (t->getAttr("Name")==NULL && geneName) {
+ t->addAttr("Name", geneName);
+ t->addAttr("gene_name", geneName);
+ }
+ t->addAttr("geneID", g.getID());
+ break;
+ }
+ gidx--;
+ }
+ }
+
+ /*
+ if (t->exons.Count()==0 && t->children.Count()==0 && forceExons) {
+ //a non-mRNA feature with no subfeatures
+ //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
+ //--this could be a single "pseudogene" entry or another genomic region without exons
+ //
+ t->addExon(t->start,t->end);
+ }
+ */
if (t->exons.Count()>0) {
//tidx=
gdata->rnas.Add(t); //added it in sorted order
}
else {
- gdata->gfs.Add(t);
+ if (t->isGene() || !this->transcriptsOnly)
+ gdata->gfs.Add(t);
return; //nothing to do with these non-transcript objects
}
if (!doCluster) return;
@@ -533,6 +574,26 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate
gffr->showWarnings(this->showWarnings);
// keepAttrs mergeCloseExons noExonAttr
gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
+ GVec<int> pseudoAttrIds;
+ GVec<int> pseudoFeatureIds;
+ if (this->noPseudo) {
+ GffNameList& fnames = gffr->names->feats;
+ for (int i=0;i<fnames.Count();i++) {
+ char* n=fnames[i]->name;
+ if (startsWith(n, "pseudo")) {
+ pseudoFeatureIds.Add(fnames[i]->idx);
+ }
+ }
+ GffNameList& attrnames = gffr->names->attrs;
+ for (int i=0;i<attrnames.Count();i++) {
+ char* n=attrnames[i]->name;
+ char* p=strifind(n, "pseudo");
+ if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s')) {
+ pseudoAttrIds.Add(attrnames[i]->idx);
+ }
+ }
+ }
+
//int redundant=0; //redundant annotation discarded
if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
//int rna_deleted=0;
@@ -543,17 +604,40 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate
m->getAttr("transcripts")!=NULL) {
continue; //discard locus meta-features
}
-
+ if (this->noPseudo) {
+ bool is_pseudo=false;
+ for (int i=0;i<pseudoFeatureIds.Count();++i) {
+ if (pseudoFeatureIds[i]==m->ftype_id) {
+ is_pseudo=true;
+ break;
+ }
+ }
+ if (is_pseudo) continue;
+ for (int i=0;i<pseudoAttrIds.Count();++i) {
+ char* attrv=NULL;
+ if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]);
+ if (attrv!=NULL) {
+ char fc=tolower(attrv[0]);
+ if (fc=='t' || fc=='y' || fc=='1') {
+ is_pseudo=true;
+ break;
+ }
+ }
+ }
+ if (is_pseudo) continue;
+ }
char* rloc=m->getAttr("locus");
if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
m->removeAttr("locus", rloc);
}
+ /*
if (m->exons.Count()==0 && m->children.Count()==0) {
//a non-mRNA feature with no subfeatures
//add a dummy exon just to have the generic exon checking work
m->addExon(m->start,m->end);
}
- if (forceExons && m->children.Count()==0) {
+ */
+ if (forceExons) { // && m->children.Count()==0) {
m->exon_ftype_id=gff_fid_exon;
}
GList<GffObj> gfadd(false,false);
@@ -564,7 +648,6 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate
int i=-1;
GenomicSeqData f(m->gseq_id);
GenomicSeqData* gdata=NULL;
-
if (seqdata.Found(&f,i)) gdata=seqdata[i];
else { //entry not created yet for this genomic seq
gdata=new GenomicSeqData(m->gseq_id);
diff --git a/src/gff_utils.h b/src/gff_utils.h
index 1708ec5..b15b677 100644
--- a/src/gff_utils.h
+++ b/src/gff_utils.h
@@ -563,6 +563,9 @@ struct GffLoader {
bool noExonAttrs;
bool mergeCloseExons;
bool showWarnings;
+ bool noPseudo;
+ void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true,
+ bool matchAllIntrons=true, bool fuzzSpan=false);
void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL,
bool doCluster=true, bool doCollapseRedundant=true,
bool matchAllIntrons=true, bool fuzzSpan=false, bool forceExons=false);
@@ -573,6 +576,7 @@ struct GffLoader {
noExonAttrs=false;
mergeCloseExons=false;
showWarnings=false;
+ noPseudo=false;
if (fname=="-" || fname=="stdin") {
f=stdin;
fname="stdin";
@@ -599,8 +603,6 @@ int qsearch_gloci(uint x, GList<GffLocus>& loci);
GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false);
-void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true,
- bool matchAllIntrons=true, bool fuzzSpan=false);
//void loadGFF(FILE* f, GList<GenomicSeqData>& seqdata, const char* fname);
void collectLocusData(GList<GenomicSeqData>& ref_data);
diff --git a/src/gffread.cpp b/src/gffread.cpp
index db0ff51..1393923 100644
--- a/src/gffread.cpp
+++ b/src/gffread.cpp
@@ -49,6 +49,7 @@ gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
-J discard any mRNAs that either lack initial START codon\n\
or the terminal STOP codon, or have an in-frame stop codon\n\
(only print mRNAs with a fulll, valid CDS)\n\
+ --no-pseudo: filter out records matching the 'pseudo' keyword\n\
\n\
-M/--merge : cluster the input transcripts into loci, collapsing matching\n\
transcripts (those with the same exact introns and fully contained)\n\
@@ -60,8 +61,11 @@ gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
(multi-exon transcripts will be collapsed if just their introns match,\n\
while single-exon transcripts can partially overlap (80%))\n\
\n\
+ --force-exons: make sure that the lowest level GFF features are printed as \n\
+ \"exon\" features\n\
-E expose (warn about) duplicate transcript IDs and other potential \n\
problems with the given GFF/GTF records\n\
+ -D decode url encoded characters within attributes\n\
-Z merge close exons into a single exon (for intron size<4)\n\
-w write a fasta file with spliced exons for each GFF transcript\n\
-x write a fasta file with spliced CDS for each GFF transcript\n\
@@ -123,9 +127,11 @@ bool validCDSonly=false; // translation with no in-frame STOP
bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
bool altPhases=false; //if original phase fails translation validation,
//try the other 2 phases until one makes it
-bool mRNAOnly=true;
+bool mRNAOnly=true;
+bool NoPseudo=false;
+bool forceExons=false;
bool spliceCheck=false; //only known splice-sites
-
+bool decodeChars=false; //decode url-encoded chars in attrs (-D)
bool fullCDSonly=false; // starts with START, ends with STOP codon
bool fullattr=false;
//bool sortByLoc=false; // if the GFF output should be sorted by location
@@ -608,88 +614,82 @@ void printGff3Header(FILE* f, GArgs& args) {
}
bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
- if (reftbl.Count()>0) {
- GStr refname(gffrec->getRefName());
- RefTran* rt=reftbl.Find(refname.chars());
- if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
- //try removing the version suffix
- refname.cut(-2);
- //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
- rt=reftbl.Find(refname.chars());
- }
- if (rt) {
- gffrec->setRefName(rt->new_name);
- }
- else return false; //discard, ref seq not in the given translation table
- }
- if (mRNAOnly && gffrec->isDiscarded()) {
- //discard generic "gene" or "locus" features with no other detailed subfeatures
- //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
- return false;
- }
- /*
- if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
- //a non-mRNA feature with no subfeatures
- //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
- //--this could be a single "pseudogene" entry or another genomic region without exons
- //
- gffrec->addExon(gffrec->start,gffrec->end);
- }
- */
- if (rfltGSeq!=NULL) { //filter by gseqName
- if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
- return false;
- }
- }
- if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
- return false;
- }
- //check coordinates
- if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
- if (rfltWithin) {
- if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
- return false; //not within query range
- }
- }
- else {
- if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
- return false;
- }
- }
- }
- if (multiExon && gffrec->exons.Count()<=1) {
- return false;
- }
- if (wCDSonly && gffrec->CDstart==0) {
- return false;
- }
- if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
- //keep track of chr|gene_id data -- coordinate range
- char* geneid=gffrec->getGeneID();
- if (geneid!=NULL) {
- GeneInfo* ginfo=gene_ids.Find(geneid);
- if (ginfo==NULL) {//first time seeing this gene ID
- GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
- gene_ids.Add(geneid, geneinfo);
- if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
- }
- else ginfo->update(gffrec);
- }
- }
- return true;
+ if (reftbl.Count()>0) {
+ GStr refname(gffrec->getRefName());
+ RefTran* rt=reftbl.Find(refname.chars());
+ if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
+ //try removing the version suffix
+ refname.cut(-2);
+ //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
+ rt=reftbl.Find(refname.chars());
+ }
+ if (rt) {
+ gffrec->setRefName(rt->new_name);
+ }
+ else return false; //discard, ref seq not in the given translation table
+ }
+ if (mRNAOnly && gffrec->isDiscarded()) {
+ //discard generic "locus" features with no other detailed subfeatures
+ //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID());
+ return false;
+ }
+
+ if (rfltGSeq!=NULL) { //filter by gseqName
+ if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
+ return false;
+ }
+ }
+ if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
+ return false;
+ }
+ //check coordinates
+ if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
+ if (rfltWithin) {
+ if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
+ return false; //not within query range
+ }
+ }
+ else {
+ if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
+ return false;
+ }
+ }
+ }
+ if (multiExon && gffrec->exons.Count()<=1) {
+ return false;
+ }
+ if (wCDSonly && gffrec->CDstart==0) {
+ return false;
+ }
+ if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
+ //keep track of chr|gene_id data -- coordinate range
+ char* geneid=gffrec->getGeneID();
+ if (geneid!=NULL) {
+ GeneInfo* ginfo=gene_ids.Find(geneid);
+ if (ginfo==NULL) {//first time seeing this gene ID
+ GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
+ gene_ids.Add(geneid, geneinfo);
+ if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
+ }
+ else ginfo->update(gffrec);
+ }
+ }
+ return true;
}
int main(int argc, char * const argv[]) {
GArgs args(argc, argv,
- "debug;merge;cluster-only;help;force-exons;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
+ "debug;merge;cluster-only;help;force-exons;no-pseudo;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
args.printError(USAGE, true);
if (args.getOpt('h') || args.getOpt("help")) {
GMessage("%s",USAGE);
exit(1);
}
debugMode=(args.getOpt("debug")!=NULL);
- bool forceExons=(args.getOpt("force-exons")!=NULL);
+ decodeChars=(args.getOpt('D')!=NULL);
+ forceExons=(args.getOpt("force-exons")!=NULL);
+ NoPseudo=(args.getOpt("no-pseudo")!=NULL);
mRNAOnly=(args.getOpt('O')==NULL);
//sortByLoc=(args.getOpt('S')!=NULL);
addDescr=(args.getOpt('A')!=NULL);
@@ -736,6 +736,10 @@ int main(int argc, char * const argv[]) {
noExonAttr=true;
fullattr=true;
}
+ if (NoPseudo && !fullattr) {
+ noExonAttr=true;
+ fullattr=true;
+ }
ensembl_convert=(args.getOpt('L')!=NULL);
if (ensembl_convert) {
fullattr=true;
@@ -849,6 +853,7 @@ int main(int argc, char * const argv[]) {
gffloader.noExonAttrs=noExonAttr;
gffloader.mergeCloseExons=mergeCloseExons;
gffloader.showWarnings=(args.getOpt('E')!=NULL);
+ gffloader.noPseudo=NoPseudo;
gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
matchAllIntrons, fuzzSpan, forceExons);
if (doCluster)
@@ -859,10 +864,18 @@ int main(int argc, char * const argv[]) {
GStr loctrack("gffcl");
if (tracklabel) loctrack=tracklabel;
g_data.setSorted(&gseqCmpName);
+ GffPrintMode exonPrinting;
+ if (fmtGTF) {
+ exonPrinting = pgtfAny;
+ } else {
+ exonPrinting = forceExons ? pgffBoth : pgffAny;
+ }
+ bool firstGff3Print=!fmtGTF;
if (doCluster) {
//grouped in loci
for (int g=0;g<g_data.Count();g++) {
GenomicSeqData* gdata=g_data[g];
+ int gfs_i=0;
for (int l=0;l<gdata->loci.Count();l++) {
GffLocus& loc=*(gdata->loci[l]);
//check all non-replaced transcripts in this locus:
@@ -870,6 +883,19 @@ int main(int argc, char * const argv[]) {
int idxfirstvalid=-1;
for (int i=0;i<loc.rnas.Count();i++) {
GffObj& t=*(loc.rnas[i]);
+ if (f_out) {
+ while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
+ GffObj& gfst=*(gdata->gfs[gfs_i]);
+ if ((gfst.udata&4)==0) { //never printed
+ gfst.udata|=4;
+ if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
+ if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
+ gfst.addExon(gfst.start,gfst.end);
+ gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
+ }
+ ++gfs_i;
+ }
+ }
GTData* tdata=(GTData*)(t.uptr);
if (tdata->replaced_by!=NULL) {
if (f_repl && (t.udata & 8)==0) {
@@ -891,13 +917,11 @@ int main(int argc, char * const argv[]) {
if (idxfirstvalid<0) idxfirstvalid=i;
}
}
-
if (f_out && numvalid>0) {
GStr locname("RLOC_");
locname.appendfmt("%08d",loc.locus_num);
if (!fmtGTF) {
- if (out_counter==0)
- printGff3Header(f_out, args);
+ if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
locname.chars(), locname.chars());
@@ -927,43 +951,60 @@ int main(int argc, char * const argv[]) {
if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
t.addAttr("locus", locname.chars());
out_counter++;
- if (fmtGTF) t.printGtf(f_out, tracklabel);
+ if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
else {
+ if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
//print the parent first, if any
if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
GTData* pdata=(GTData*)(t.parent->uptr);
- if (pdata->geneinfo!=NULL)
+ if (pdata && pdata->geneinfo!=NULL)
pdata->geneinfo->finalize();
t.parent->addAttr("locus", locname.chars());
- t.parent->printGff(f_out, tracklabel);
+ t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
t.parent->udata|=4;
}
- t.printGff(f_out, tracklabel);
+ t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
}
}
} //have valid transcripts to print
}//for each locus
- if (f_out && !mRNAOnly) {
- //final pass through the non-transcripts, in case any of them were not printed
- //TODO: order broken, these should be interspersed among the rnas in the correct order!
- for (int m=0;m<gdata->gfs.Count();m++) {
- GffObj& t=*(gdata->gfs[m]);
- if ((t.udata&4)==0) { //never printed
- t.udata|=4;
- if (fmtGTF) t.printGtf(f_out, tracklabel);
- else t.printGff(f_out, tracklabel);
+ //print the rest of the isolated pseudo/gene/region features not printed yet
+ if (f_out) {
+ while (gfs_i<gdata->gfs.Count()) {
+ GffObj& gfst=*(gdata->gfs[gfs_i]);
+ if ((gfst.udata&4)==0) { //never printed
+ gfst.udata|=4;
+ if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
+ if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
+ gfst.addExon(gfst.start,gfst.end);
+ gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
}
- } //for each non-transcript
- }
- } //for each genomic sequence
+ ++gfs_i;
+ }
+ }
+ } //for each genomic sequence
}
- else {
+ else {
//not grouped into loci, print the rnas with their parents, if any
int numvalid=0;
for (int g=0;g<g_data.Count();g++) {
GenomicSeqData* gdata=g_data[g];
+ int gfs_i=0;
for (int m=0;m<gdata->rnas.Count();m++) {
GffObj& t=*(gdata->rnas[m]);
+ if (f_out) {
+ while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
+ GffObj& gfst=*(gdata->gfs[gfs_i]);
+ if ((gfst.udata&4)==0) { //never printed
+ gfst.udata|=4;
+ if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
+ if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
+ gfst.addExon(gfst.start,gfst.end);
+ gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
+ }
+ ++gfs_i;
+ }
+ }
GTData* tdata=(GTData*)(t.uptr);
if (tdata->replaced_by!=NULL) continue;
if (process_transcript(gfasta, t)) {
@@ -972,37 +1013,38 @@ int main(int argc, char * const argv[]) {
if (f_out) {
if (tdata->geneinfo) tdata->geneinfo->finalize();
out_counter++;
- if (fmtGTF) t.printGtf(f_out, tracklabel);
+ if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
else {
- if (out_counter==1)
- printGff3Header(f_out, args);
+ if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
//print the parent first, if any
if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
GTData* pdata=(GTData*)(t.parent->uptr);
- if (pdata->geneinfo!=NULL)
+ if (pdata && pdata->geneinfo!=NULL)
pdata->geneinfo->finalize();
- t.parent->printGff(f_out, tracklabel);
+ t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
t.parent->udata|=4;
}
- t.printGff(f_out, tracklabel);
+ t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
}
}//GFF/GTF output requested
} //valid transcript
} //for each rna
- if (f_out && !mRNAOnly) {
- //final pass through the non-transcripts, in case any of them were not printed
- //TODO: order broken, these should be interspersed among the rnas in the correct order!
- for (int m=0;m<gdata->gfs.Count();m++) {
- GffObj& t=*(gdata->gfs[m]);
- if ((t.udata&4)==0) { //never printed
- t.udata|=4;
- if (fmtGTF) t.printGtf(f_out, tracklabel);
- else t.printGff(f_out, tracklabel);
+ //print the rest of the isolated pseudo/gene/region features not printed yet
+ if (f_out) {
+ while (gfs_i<gdata->gfs.Count()) {
+ GffObj& gfst=*(gdata->gfs[gfs_i]);
+ if ((gfst.udata&4)==0) { //never printed
+ gfst.udata|=4;
+ if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
+ if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
+ gfst.addExon(gfst.start,gfst.end);
+ gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
}
- } //for each non-transcript
- }
- } //for each genomic seq
- }
+ ++gfs_i;
+ }
+ }
+ } //for each genomic seq
+ } //not clustered
if (f_repl && f_repl!=stdout) fclose(f_repl);
seqinfo.Clear();
//if (faseq!=NULL) delete faseq;
diff --git a/src/gtf_to_sam.cpp b/src/gtf_to_sam.cpp
index 64422a9..38b8311 100644
--- a/src/gtf_to_sam.cpp
+++ b/src/gtf_to_sam.cpp
@@ -187,7 +187,7 @@ void print_scaff_as_sam(FILE* sam_out,
}
-void set_relative_fpkms(vector<shared_ptr<Scaffold> >& ref_mRNAs)
+void set_relative_fpkms(vector<boost::shared_ptr<Scaffold> >& ref_mRNAs)
{
adjacency_list <vecS, vecS, undirectedS> G;
@@ -229,7 +229,7 @@ void set_relative_fpkms(vector<shared_ptr<Scaffold> >& ref_mRNAs)
//vector<vector<size_t> > cluster_indices(three_prime_ends.size());
- vector<vector<shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
+ vector<vector<boost::shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
for (size_t i = 0; i < ref_mRNAs.size(); ++i)
{
clusters[component[i]][i] = true;
@@ -238,16 +238,16 @@ void set_relative_fpkms(vector<shared_ptr<Scaffold> >& ref_mRNAs)
for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
{
- vector<shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
+ vector<boost::shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
double total_fpkm = 0.0;
- BOOST_FOREACH(shared_ptr<Scaffold> scaff, gene)
+ BOOST_FOREACH(boost::shared_ptr<Scaffold> scaff, gene)
{
total_fpkm += scaff->fpkm();
}
if (total_fpkm > 0)
{
- BOOST_FOREACH (shared_ptr<Scaffold> scaff, gene)
+ BOOST_FOREACH (boost::shared_ptr<Scaffold> scaff, gene)
{
scaff->fpkm(scaff->fpkm() / total_fpkm);
}
@@ -260,19 +260,20 @@ void driver(vector<FILE*> ref_gtf_files, FILE* sam_out)
ReadTable it;
RefSequenceTable rt(true, false);
- vector<vector<shared_ptr<Scaffold> > > ref_mRNA_table;
+ vector<vector<boost::shared_ptr<Scaffold> > > ref_mRNA_table;
vector<pair<string, vector<double> > > sample_count_table;
BOOST_FOREACH (FILE* ref_gtf, ref_gtf_files)
{
- vector<shared_ptr<Scaffold> > ref_mRNAs;
- ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, false, true);
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
+ boost::crc_32_type ref_gtf_crc_result;
+ ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, false, true);
ref_mRNA_table.push_back(ref_mRNAs);
}
for (size_t j = 0; j < ref_mRNA_table.size(); ++j)
{
- vector<shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
+ vector<boost::shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
if (!raw_fpkm)
set_relative_fpkms(ref_mRNAs);
diff --git a/src/gtf_tracking.cpp b/src/gtf_tracking.cpp
index 042838d..6c27a6f 100644
--- a/src/gtf_tracking.cpp
+++ b/src/gtf_tracking.cpp
@@ -58,7 +58,7 @@ GffObj* is_RefDup(GffObj* m, GList<GffObj>& mrnas, int& dupidx) {
}
-bool intronRedundant(GffObj& ti, GffObj& tj) {
+bool intronRedundant(GffObj& ti, GffObj& tj, bool no5share=false) {
//two transcripts are "intron redundant" iff one transcript's intron chain
// is a sub-chain of the other's
int imax=ti.exons.Count()-1;
@@ -88,6 +88,7 @@ bool intronRedundant(GffObj& ti, GffObj& tj) {
}
if (eistart!=ejstart || eiend!=ejend) return false; //not an exact intron match
//we have the first matching intron on the left
+
if (j>i) {
//i==1, ti's start must not conflict with the previous intron of tj
if (ti.start<tj.exons[j-1]->start) return false;
@@ -105,6 +106,8 @@ bool intronRedundant(GffObj& ti, GffObj& tj) {
//comment out the line above for just "intronCompatible()" check
}
//now check if the rest of the introns overlap, in the same sequence
+ int i_start=i; //first (leftmost) matching intron of ti (1-based index)
+ int j_start=j; //first (leftmost) matching intron of tj
i++;
j++;
while (i<=imax && j<=jmax) {
@@ -122,7 +125,16 @@ bool intronRedundant(GffObj& ti, GffObj& tj) {
else if (j==jmax && i<imax) {
if (tj.end>ti.exons[i]->end) return false;
}
- return true;
+ if (no5share && imax!=jmax) {
+ //if they share the 5' intron, they are NOT to be considered redundant
+ if (ti.strand=='+') {
+ if (i_start==1 && j_start==1) return false;
+ }
+ else { //reverse strand
+ if (i==imax && j==jmax) return false;
+ }
+ }
+ return true; //they are intron-redundant
}
bool t_contains(GffObj& a, GffObj& b) {
@@ -143,7 +155,7 @@ bool t_contains(GffObj& a, GffObj& b) {
else return false;
}
-int is_Redundant(GffObj*m, GList<GffObj>* mrnas) {
+int is_Redundant(GffObj*m, GList<GffObj>* mrnas, bool no5share=false) {
//first locate the list index of the mrna starting just ABOVE
//the end of this mrna
if (mrnas->Count()==0) return -1;
@@ -158,7 +170,7 @@ int is_Redundant(GffObj*m, GList<GffObj>* mrnas) {
}
if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
- if (intronRedundant(*m, omrna)) return i;
+ if (intronRedundant(*m, omrna, no5share)) return i;
}
return -1;
}
@@ -186,7 +198,7 @@ bool betterDupRef(GffObj* a, GffObj* b) {
int parse_mRNAs(GfList& mrnas,
GList<GSeqData>& glstdata,
bool is_ref_set,
- bool check_for_dups,
+ int check_for_dups,
int qfidx, bool only_multiexon) {
int refdiscarded=0; //ref duplicates discarded
int tredundant=0; //cufflinks redundant transcripts discarded
@@ -265,13 +277,13 @@ int parse_mRNAs(GfList& mrnas,
}
} //check for duplicate ref transcripts
} //ref transcripts
- else { //-- transfrags
+ else { //-- query transfrags
if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
else { m->strand='.'; target_mrnas=&(gdata->umrnas); }
if (check_for_dups) { //check for redundancy
// check if there is a redundancy between this and another already loaded Cufflinks transcript
- int cidx = is_Redundant(m, target_mrnas);
+ int cidx = is_Redundant(m, target_mrnas, (check_for_dups>1));
if (cidx>=0) {
//always discard the redundant transcript with the fewer exons OR shorter
if (t_dominates(target_mrnas->Get(cidx),m)) {
@@ -538,16 +550,18 @@ GSeqData* getRefData(int gid, GList<GSeqData>& ref_data) {
return r;
}
-void read_transcripts(FILE* f, GList<GSeqData>& seqdata, bool keepAttrs) {
+void read_transcripts(FILE* f, GList<GSeqData>& seqdata, boost::crc_32_type& crc_result, bool keepAttrs) {
rewind(f);
GffReader gffr(f, true); //loading only recognizable transcript features
gffr.showWarnings(gtf_tracking_verbose);
// keepAttrs mergeCloseExons noExonAttrs
gffr.readAll(keepAttrs, true, true);
+
+ crc_result = gffr.current_crc_result();
// is_ref? check_for_dups,
- parse_mRNAs(gffr.gflst, seqdata, false, false);
+ parse_mRNAs(gffr.gflst, seqdata, false, 0);
}
int cmpGSeqByName(const pointer p1, const pointer p2) {
@@ -559,7 +573,7 @@ void sort_GSeqs_byName(GList<GSeqData>& seqdata) {
}
void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data,
- bool check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
+ int check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
//>>>>> read all transcripts/features from a GTF/GFF3 file
//int imrna_counter=0;
#ifdef HEAPROFILE
diff --git a/src/gtf_tracking.h b/src/gtf_tracking.h
index 77cb988..d883c55 100644
--- a/src/gtf_tracking.h
+++ b/src/gtf_tracking.h
@@ -1318,15 +1318,15 @@ class GXLocus:public GSeg {
int parse_mRNAs(GfList& mrnas,
GList<GSeqData>& glstdata,
bool is_ref_set=true,
- bool check_for_dups=false,
+ int check_for_dups=0,
int qfidx=-1, bool only_multiexon=false);
//reading a mRNAs from a gff file and grouping them into loci
void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data=NULL,
- bool check_for_dups=false, int qfidx=-1, const char* fname=NULL,
+ int check_for_dups=0, int qfidx=-1, const char* fname=NULL,
bool only_multiexon=false);
-void read_transcripts(FILE* f, GList<GSeqData>& seqdata, bool keepAttrs=true);
+void read_transcripts(FILE* f, GList<GSeqData>& seqdata, boost::crc_32_type& crc_result, bool keepAttrs=true);
void sort_GSeqs_byName(GList<GSeqData>& seqdata);
diff --git a/src/hits.cpp b/src/hits.cpp
index 2b703f1..5387363 100644
--- a/src/hits.cpp
+++ b/src/hits.cpp
@@ -22,6 +22,8 @@
#include "hits.h"
#include "tokenize.h"
+#include "abundances.h"
+
using namespace std;
#if ENABLE_THREADS
@@ -1101,3 +1103,174 @@ bool SAMHitFactory::inspect_header()
finalize_rg_props();
return true;
}
+
+//////////////////////////////////////////
+// Seeds compat_mass / total_mass from the first locus record stored in the serialized expression file.
+void PrecomputedExpressionHitFactory::load_count_tables(const string& expression_file_name)
+{
+ //map<int, AbundanceGroup > ab_groups;
+ // A local stream/archive pair is used here, so the member archive _ia is not consumed by this call.
+
+ std::ifstream ifs(expression_file_name.c_str()); // NOTE(review): opened in text mode; Boost documents binary archives as needing std::ios::binary -- confirm
+ boost::archive::binary_iarchive ia(ifs);
+
+ //map<string, AbundanceGroup> single_sample_tracking;
+
+ size_t num_loci = 0;
+ ia >> num_loci; // the archive begins with the number of serialized loci
+
+ if (num_loci > 0)
+ {
+ pair<int, AbundanceGroup> first_locus;
+ ia >> first_locus; // only the first (locus id, AbundanceGroup) record is read
+ boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(first_locus.second));
+
+ // populate the cached count tables so we can make convincing fake bundles later on.
+ ReadGroupProperties rg_props = **(ab->rg_props().begin()); // copies the first entry; assumes rg_props() is non-empty -- TODO confirm
+ // compat_mass: locus description -> raw compatible count
+ BOOST_FOREACH(const LocusCount& c, rg_props.raw_compatible_counts())
+ {
+ compat_mass[c.locus_desc] = c.count;
+ }
+ // total_mass: locus description -> raw total count
+ BOOST_FOREACH(const LocusCount& c, rg_props.raw_total_counts())
+ {
+ total_mass[c.locus_desc] = c.count;
+ }
+ }
+}
+// Copies the checked parameters stored with the first locus record of the expression file into _rg_props.
+void PrecomputedExpressionHitFactory::load_checked_parameters(const string& expression_file_name)
+{
+ std::ifstream ifs(expression_file_name.c_str()); // NOTE(review): opened in text mode; Boost documents binary archives as needing std::ios::binary -- confirm
+ boost::archive::binary_iarchive ia(ifs); // local archive; the member archive _ia is not consumed
+
+ //map<string, AbundanceGroup> single_sample_tracking;
+
+ size_t num_loci = 0;
+ ia >> num_loci; // the archive begins with the number of serialized loci
+
+ if (num_loci > 0)
+ {
+ pair<int, AbundanceGroup> first_locus;
+ ia >> first_locus; // only the first (locus id, AbundanceGroup) record is read
+ boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(first_locus.second));
+
+ // take the checked parameters from the first read-group-properties entry of that record.
+ ReadGroupProperties rg_props = **(ab->rg_props().begin()); // assumes rg_props() is non-empty -- TODO confirm
+ _rg_props.checked_parameters(rg_props.checked_parameters());
+ }
+}
+// Stub: always returns false without touching buf or buf_size.
+bool PrecomputedExpressionHitFactory::next_record(const char*& buf, size_t& buf_size)
+{
+ return false;
+}
+// Stub: always returns false; none of the arguments are read or written.
+bool PrecomputedExpressionHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
+ ReadHit& bh,
+ bool strip_slash,
+ char* name_out,
+ char* name_tags)
+{
+ return false;
+}
+// Scans every locus record of _expression_file_name and registers each chromosome name with the reference table; always returns true.
+bool PrecomputedExpressionHitFactory::inspect_header()
+{
+ // A local stream/archive pair is used, so the member archive _ia is not consumed by this scan.
+ std::ifstream ifs(_expression_file_name.c_str()); // NOTE(review): opened in text mode; Boost documents binary archives as needing std::ios::binary -- confirm
+ boost::archive::binary_iarchive ia(ifs);
+
+ RefSequenceTable& rt = ref_table();
+
+ size_t num_loci = 0;
+ ia >> num_loci; // the archive begins with the number of serialized loci
+
+ for (size_t i = 0; i < num_loci; ++i)
+ {
+ pair<int, AbundanceGroup> locus;
+
+ ia >> locus;
+ boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(locus.second));
+
+ const string locus_tag = ab->locus_tag();
+ // the text before the first ':' of the locus tag is used as the chromosome name; tags without ':' are skipped
+ string::size_type idx = locus_tag.find(':');
+ if (idx != string::npos)
+ {
+ string chrom_name = locus_tag.substr(0, idx);
+ rt.get_id(chrom_name.c_str(), NULL); // make sure the chromosome names are added to the RefSequenceTable in the order that they occur in the expression files.
+ }
+ }
+
+ return true;
+}
+// Returns the AbundanceGroup cached in _curr_ab_groups for locus_id, or an empty shared_ptr when nothing is cached.
+boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionHitFactory::get_abundance_for_locus(int locus_id)
+{
+#if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_factory_lock); // held for the duration of the lookup
+#endif
+ map<int, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = _curr_ab_groups.find(locus_id);
+ if (itr != _curr_ab_groups.end())
+ return itr->second;
+ else
+ return boost::shared_ptr<const AbundanceGroup>(); // not cached
+}
+// Drops the cached entry for locus_id from _curr_ab_groups, if one exists.
+void PrecomputedExpressionHitFactory::clear_abundance_for_locus(int locus_id)
+{
+#if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_factory_lock); // held for the duration of the lookup and erase
+#endif
+
+ map<int, boost::shared_ptr<const AbundanceGroup> >::iterator itr = _curr_ab_groups.find(locus_id);
+
+ if (itr != _curr_ab_groups.end())
+ _curr_ab_groups.erase(itr);
+}
+// Reads forward through the member archive until the record for locus_id is found, caching every record read on the way.
+boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionHitFactory::next_locus(int locus_id)
+{
+#if ENABLE_THREADS
+ boost::mutex::scoped_lock lock(_factory_lock);
+#endif
+// if (locus_id == 7130)
+// {
+// fprintf(stderr, "Trying to get a chr13_random\n");
+// }
+ // This check runs before the cache lookup, so ids at or below the last id read yield an empty pointer here.
+ if (_last_locus_id >= locus_id)
+ return boost::shared_ptr<const AbundanceGroup>(); // we already processed this one
+
+ boost::shared_ptr<const AbundanceGroup> sought_group; // stays empty if the archive runs out before locus_id is seen
+
+ map<int, boost::shared_ptr<const AbundanceGroup> >::iterator itr = _curr_ab_groups.find(locus_id);
+ // already read (and cached) during an earlier forward scan
+ if (itr != _curr_ab_groups.end())
+ return itr->second;
+ // NOTE(review): on a match, break skips the loop's ++_curr_locus_idx, so the index appears to under-count records read -- confirm
+ for (;_curr_locus_idx < _num_loci; ++_curr_locus_idx)
+ {
+ pair<int, AbundanceGroup> p;
+ *_ia >> p; // next (locus id, AbundanceGroup) record from the member archive
+ _last_locus_id = p.first;
+ boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(p.second));
+ if (_last_locus_id == locus_id)
+ {
+ sought_group = ab;
+ break;
+ }
+ else // we don't want to lose this one...
+ {
+ _curr_ab_groups[_last_locus_id] = ab;
+ }
+ }
+ _curr_ab_groups[locus_id] = sought_group; // cached even when empty (locus_id not found)
+
+ return sought_group;
+}
+
+
+
diff --git a/src/hits.h b/src/hits.h
index 4e29f0e..f5d2a23 100644
--- a/src/hits.h
+++ b/src/hits.h
@@ -5,6 +5,8 @@
#include <config.h>
#endif
+#include <iostream>
+#include <fstream>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@@ -22,7 +24,6 @@
#include "multireads.h"
using namespace std;
-using boost::shared_ptr;
/*
* hits.h
@@ -802,6 +803,141 @@ private:
bool _eof_encountered;
};
+class AbundanceGroup;
+
+/******************************************************************************
+ PrecomputedExpressionHitFactory serves per-locus AbundanceGroups deserialized from a precomputed expression (CXB) file
+ *******************************************************************************/
+class PrecomputedExpressionHitFactory : public HitFactory
+{
+public:
+ PrecomputedExpressionHitFactory(const string& expression_file_name,
+ ReadTable& insert_table,
+ RefSequenceTable& reference_table) :
+ HitFactory(insert_table, reference_table), _expression_file_name(expression_file_name), _ifs(expression_file_name.c_str()), // members are initialized in declaration order (_ifs first); _ifs uses the parameter, so this is safe
+ _ia(boost::shared_ptr<boost::archive::binary_iarchive>(new boost::archive::binary_iarchive(_ifs)))
+ {
+ load_count_tables(expression_file_name); // fills compat_mass / total_mass
+ // inspect_header() registers the chromosome names with the reference table
+ if (inspect_header() == false)
+ {
+ throw std::runtime_error("Error: could not parse CXB header");
+ }
+
+ // Override header-inferred read group properties with whatever
+ // the user supplied.
+ if (global_read_properties != NULL)
+ {
+ _rg_props = *global_read_properties;
+ }
+ // runs after the override above, so the file's checked parameters end up in _rg_props
+ load_checked_parameters(expression_file_name);
+
+ //map<string, AbundanceGroup> single_sample_tracking;
+ // consume the locus count from the member archive so it is positioned at the first locus record
+ _num_loci = 0;
+ *_ia >> _num_loci;
+
+ _curr_locus_idx = 0;
+ _last_locus_id = -1; // no locus read yet
+ }
+
+ ~PrecomputedExpressionHitFactory()
+ {
+
+ }
+ // no-op in this factory
+ void mark_curr_pos()
+ {
+
+ }
+ // no-op in this factory
+ void undo_hit()
+ {
+ }
+ // always reports that no records remain
+ bool records_remain() const
+ {
+ return false;
+ }
+ // Rewinds the member stream, rebuilds the archive on it and clears the locus cache and read position.
+ void reset()
+ {
+ _ifs.clear() ; // clear stream state flags before seeking
+ _ifs.seekg(0, ios::beg);
+ _ia = boost::shared_ptr<boost::archive::binary_iarchive>(new boost::archive::binary_iarchive(_ifs));
+ size_t num_loci = 0;
+ *_ia >> num_loci; // skip past the locus count; the value is discarded and _num_loci is left unchanged
+ _last_locus_id = -1;
+ _curr_locus_idx = 0;
+ _curr_ab_groups.clear();
+ }
+
+ bool next_record(const char*& buf, size_t& buf_size);
+
+ bool get_hit_from_buf(const char* bwt_buf,
+ ReadHit& bh,
+ bool strip_slash,
+ char* name_out = NULL,
+ char* name_tags = NULL);
+
+ bool inspect_header();
+
+ boost::shared_ptr<const AbundanceGroup> next_locus(int locus_id);
+
+ boost::shared_ptr<const AbundanceGroup> get_abundance_for_locus(int locus_id);
+ void clear_abundance_for_locus(int locus_id);
+ // Returns the value stored in compat_mass for locus_id, or 0 when the key is absent.
+ double get_compat_mass(const string& locus_id)
+ {
+ map<string, double >::iterator i = compat_mass.find(locus_id);
+ if (i != compat_mass.end())
+ {
+ return i->second;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+ // Returns the value stored in total_mass for locus_id, or 0 when the key is absent.
+ double get_total_mass(const string& locus_id)
+ {
+ map<string, double >::iterator i = total_mass.find(locus_id);
+ if (i != total_mass.end())
+ {
+ return i->second;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+
+private:
+
+ void load_count_tables(const string& expression_file_name);
+ void load_checked_parameters(const string& expression_file_name);
+
+ //map<int, boost::shared_ptr<const AbundanceGroup> > ab_group_table;
+ size_t _num_loci; // locus count read from the start of the archive
+ size_t _curr_locus_idx; // loop index used by next_locus() while reading records
+ int _last_locus_id; // id of the most recently read record, -1 before any read
+ std::ifstream _ifs; // stream underlying _ia
+ string _expression_file_name;
+ boost::shared_ptr<boost::archive::binary_iarchive> _ia; // archive read sequentially by next_locus()
+ map<string, double> compat_mass; // filled by load_count_tables()
+ map<string, double> total_mass; // filled by load_count_tables()
+ map<int, boost::shared_ptr<const AbundanceGroup> > _curr_ab_groups; // locus id -> group cache, see next_locus()
+
+
+#if ENABLE_THREADS
+ boost::mutex _factory_lock;
+#endif
+};
+
+
// Forward declaration of BundleFactory, because MateHit will need a pointer
// back to the Factory that created. Ultimately, we should replace this
// with a pointer back to the ReadGroupProperty object corresponding to each
@@ -826,7 +962,7 @@ public:
_collapse_mass(0.0),
_is_mapped(false){}
- MateHit(shared_ptr<ReadGroupProperties const> rg_props,
+ MateHit(boost::shared_ptr<ReadGroupProperties const> rg_props,
RefID refid,
const ReadHit* left_alignment,
const ReadHit* right_alignment) :
@@ -846,7 +982,7 @@ public:
//bool closed() {return _closed;}
- shared_ptr<ReadGroupProperties const> read_group_props() const { return _rg_props; }
+ boost::shared_ptr<ReadGroupProperties const> read_group_props() const { return _rg_props; }
const ReadHit* left_alignment() const {return _left_alignment;}
void left_alignment(const ReadHit* left_alignment)
@@ -988,7 +1124,7 @@ public:
if (is_multi())
{
- shared_ptr<MultiReadTable> mrt = _rg_props->multi_read_table();
+ boost::shared_ptr<MultiReadTable> mrt = _rg_props->multi_read_table();
if (mrt)
return mrt->get_mass(*this);
else
@@ -1021,7 +1157,7 @@ public:
private:
- shared_ptr<ReadGroupProperties const> _rg_props;
+ boost::shared_ptr<ReadGroupProperties const> _rg_props;
RefID _refid;
const ReadHit* _left_alignment;
const ReadHit* _right_alignment;
diff --git a/src/replicates.cpp b/src/replicates.cpp
index ca3f186..1227577 100644
--- a/src/replicates.cpp
+++ b/src/replicates.cpp
@@ -407,7 +407,7 @@ void calc_quartile_scaling_factors(const vector<LocusCountList>& sample_compatib
for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
{
- //shared_ptr<ReadGroupProperties> rg = bundle_factories[fac_idx];
+ //boost::shared_ptr<ReadGroupProperties> rg = bundle_factories[fac_idx];
//double scaled_mass = scale_factors[fac_idx] * rg->total_map_mass();
total_common += sample_compatible_count_table[j].counts[i];
@@ -619,7 +619,7 @@ void build_scv_correction_fit(int nreps, int ngenes, int mean_count, SCVInterpol
vector<double> draws;
for (size_t i = 0; i < ngenes; ++i)
{
- LocusCountList locus_count("", nreps, 1);
+ LocusCountList locus_count("", nreps, 1, vector<string>(), vector<string>());
for (size_t rep_idx = 0; rep_idx < nreps; ++rep_idx)
{
double gamma_draw = gamma(rng);
@@ -764,8 +764,10 @@ fit_dispersion_model_helper(const string& condition_name,
SCVInterpolator true_to_est_scv_table;
int num_samples = sample_compatible_count_table.front().counts.size();
-
- build_scv_correction_fit(num_samples, 10000, 100000, true_to_est_scv_table);
+ if (no_scv_correction == false)
+ {
+ build_scv_correction_fit(num_samples, 10000, 100000, true_to_est_scv_table);
+ }
setuplf();
@@ -787,8 +789,8 @@ fit_dispersion_model_helper(const string& condition_name,
if (compatible_count_means.size() < min_loci_for_fitting)
{
- shared_ptr<MassDispersionModel> disperser;
- disperser = shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
+ boost::shared_ptr<MassDispersionModel> disperser;
+ disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
return disperser;
}
@@ -865,10 +867,10 @@ fit_dispersion_model_helper(const string& condition_name,
// }
}
- shared_ptr<MassDispersionModel> disperser;
- disperser = shared_ptr<MassDispersionModel>(new MassDispersionModel(condition_name, compatible_count_means, raw_variances, fitted_values));
+ boost::shared_ptr<MassDispersionModel> disperser;
+ disperser = boost::shared_ptr<MassDispersionModel>(new MassDispersionModel(condition_name, compatible_count_means, raw_variances, fitted_values));
if (dispersion_method == POISSON)
- disperser = shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
+ disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
// for (map<string, pair<double, double> >::iterator itr = labeled_mv_table.begin();
// itr != labeled_mv_table.end();
@@ -895,7 +897,7 @@ fit_dispersion_model(const string& condition_name,
if (sample_compatible_count_table[i].counts.size() <= 1)
{
// only one replicate - no point in fitting variance
- return shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
+ return boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
}
}
#if ENABLE_THREADS
@@ -921,14 +923,68 @@ fit_dispersion_model(const string& condition_name,
return model;
}
-void normalize_counts(vector<shared_ptr<ReadGroupProperties> > & all_read_groups)
+void build_norm_table(const vector<LocusCountList>& full_count_table,
+ boost::shared_ptr<const map<string, LibNormStandards> > normalizing_standards,
+ vector<LocusCountList>& norm_table)
+{ // Selects the rows of full_count_table to normalize against and stores them in norm_table.
+ // If we're using housekeeping genes or spike-in controls, select the rows we'll be using from the full count table.
+ if (normalizing_standards)
+ {
+ for (size_t i = 0; i < full_count_table.size(); ++i)
+ {
+ const vector<string>& gene_ids = full_count_table[i].gene_ids;
+ const vector<string>& gene_short_names = full_count_table[i].gene_short_names;
+
+ // If the row has an ID that's in the table, take it.
+ map<string, LibNormStandards>::const_iterator g_id_itr = normalizing_standards->end();
+ map<string, LibNormStandards>::const_iterator g_name_itr = normalizing_standards->end();
+ // stop at the first gene id that is a key of the standards map
+ for (size_t j = 0; j < gene_ids.size(); ++j)
+ {
+ g_id_itr = normalizing_standards->find(gene_ids[j]);
+ if (g_id_itr != normalizing_standards->end())
+ {
+ break;
+ }
+ }
+
+ if (g_id_itr != normalizing_standards->end())
+ {
+ norm_table.push_back(full_count_table[i]); // rows are appended; norm_table is not cleared in this branch
+ continue;
+ }
+ // no id matched: fall back to the gene short names
+ for (size_t j = 0; j < gene_short_names.size(); ++j)
+ {
+ g_name_itr = normalizing_standards->find(gene_short_names[j]);
+ if (g_name_itr != normalizing_standards->end())
+ {
+ break;
+ }
+ }
+
+ if (g_name_itr != normalizing_standards->end())
+ {
+ norm_table.push_back(full_count_table[i]);
+ continue;
+ }
+ // neither an id nor a short name matched: the row is left out
+ }
+ }
+ else // otherwise, just take all rows.
+ {
+ norm_table = full_count_table; // assignment replaces any previous contents
+ }
+}
+
+void normalize_counts(vector<boost::shared_ptr<ReadGroupProperties> > & all_read_groups)
{
vector<LocusCountList> sample_compatible_count_table;
vector<LocusCountList> sample_total_count_table;
for (size_t i = 0; i < all_read_groups.size(); ++i)
{
- shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
+ boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
const vector<LocusCount>& raw_compatible_counts = rg_props->raw_compatible_counts();
const vector<LocusCount>& raw_total_counts = rg_props->raw_total_counts();
@@ -938,8 +994,12 @@ void normalize_counts(vector<shared_ptr<ReadGroupProperties> > & all_read_groups
{
const string& locus_id = raw_compatible_counts[j].locus_desc;
int num_transcripts = raw_compatible_counts[j].num_transcripts;
- sample_compatible_count_table.push_back(LocusCountList(locus_id,all_read_groups.size(), num_transcripts));
- sample_total_count_table.push_back(LocusCountList(locus_id,all_read_groups.size(), num_transcripts));
+
+ const vector<string>& gene_ids = raw_compatible_counts[j].gene_ids;
+ const vector<string>& gene_short_names = raw_compatible_counts[j].gene_short_names;
+
+ sample_compatible_count_table.push_back(LocusCountList(locus_id,all_read_groups.size(), num_transcripts, gene_ids, gene_short_names));
+ sample_total_count_table.push_back(LocusCountList(locus_id,all_read_groups.size(), num_transcripts, gene_ids, gene_short_names));
}
double scaled = raw_compatible_counts[j].count;
//sample_compatible_count_table[j].counts[i] = scaled * unscaling_factor;
@@ -956,13 +1016,14 @@ void normalize_counts(vector<shared_ptr<ReadGroupProperties> > & all_read_groups
if (use_compat_mass)
{
- norm_table = sample_compatible_count_table;
+ build_norm_table(sample_compatible_count_table, lib_norm_standards, norm_table);
}
else // use_total_mass
{
assert(use_total_mass);
- norm_table = sample_total_count_table;
+ build_norm_table(sample_total_count_table, lib_norm_standards, norm_table);
}
+
if (lib_norm_method == GEOMETRIC)
{
calc_geometric_scaling_factors(norm_table, scale_factors);
@@ -988,7 +1049,7 @@ void normalize_counts(vector<shared_ptr<ReadGroupProperties> > & all_read_groups
for (size_t i = 0; i < all_read_groups.size(); ++i)
{
- shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
+ boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
rg_props->internal_scale_factor(scale_factors[i]);
}
@@ -1014,14 +1075,18 @@ void normalize_counts(vector<shared_ptr<ReadGroupProperties> > & all_read_groups
for (size_t i = 0; i < all_read_groups.size(); ++i)
{
- shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
+ boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
vector<LocusCount> scaled_compatible_counts;
for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
{
string& locus_id = sample_compatible_count_table[j].locus_desc;
double count = sample_compatible_count_table[j].counts[i];
int num_transcripts = sample_compatible_count_table[j].num_transcripts;
- LocusCount locus_count(locus_id, count, num_transcripts);
+
+ const vector<string>& gids = sample_compatible_count_table[j].gene_ids;
+ const vector<string>& gnms = sample_compatible_count_table[j].gene_short_names;
+
+ LocusCount locus_count(locus_id, count, num_transcripts, gids, gnms);
scaled_compatible_counts.push_back(locus_count);
}
rg_props->common_scale_compatible_counts(scaled_compatible_counts);
@@ -1029,14 +1094,18 @@ void normalize_counts(vector<shared_ptr<ReadGroupProperties> > & all_read_groups
for (size_t i = 0; i < all_read_groups.size(); ++i)
{
- shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
+ boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
vector<LocusCount> scaled_total_counts;
for (size_t j = 0; j < sample_total_count_table.size(); ++j)
{
string& locus_id = sample_total_count_table[j].locus_desc;
double count = sample_total_count_table[j].counts[i];
int num_transcripts = sample_total_count_table[j].num_transcripts;
- LocusCount locus_count(locus_id, count, num_transcripts);
+
+ const vector<string>& gids = sample_total_count_table[j].gene_ids;
+ const vector<string>& gnms = sample_total_count_table[j].gene_short_names;
+
+ LocusCount locus_count(locus_id, count, num_transcripts, gids, gnms);
scaled_total_counts.push_back(locus_count);
}
rg_props->common_scale_total_counts(scaled_total_counts);
@@ -1066,7 +1135,7 @@ void normalize_counts(vector<shared_ptr<ReadGroupProperties> > & all_read_groups
avg_total_common_scaled_count += (1.0/all_read_groups.size()) * total_common;
}
- BOOST_FOREACH(shared_ptr<ReadGroupProperties> rg, all_read_groups)
+ BOOST_FOREACH(boost::shared_ptr<ReadGroupProperties> rg, all_read_groups)
{
rg->normalized_map_mass(avg_total_common_scaled_count);
}
diff --git a/src/replicates.h b/src/replicates.h
index f1dceb7..7e02947 100644
--- a/src/replicates.h
+++ b/src/replicates.h
@@ -116,11 +116,13 @@ private:
struct LocusCountList
{
- LocusCountList(std::string ld, int num_reps, int nt) :
- locus_desc(ld), counts(std::vector<double>(num_reps, 0)), num_transcripts(nt) {}
+ LocusCountList(std::string ld, int num_reps, int nt, const std::vector<std::string>& gids, const std::vector<std::string>& gnms) :
+ locus_desc(ld), counts(std::vector<double>(num_reps, 0)), num_transcripts(nt), gene_ids(gids), gene_short_names(gnms) {}
std::string locus_desc;
std::vector<double> counts;
int num_transcripts;
+ vector<std::string> gene_ids;
+ vector<std::string> gene_short_names;
};
void transform_counts_to_common_scale(const vector<double>& scale_factors,
@@ -151,7 +153,7 @@ void calculate_count_means_and_vars(const vector<LocusCountList>& sample_compati
class ReplicatedBundleFactory
{
public:
- ReplicatedBundleFactory(const std::vector<shared_ptr<BundleFactory> >& factories,
+ ReplicatedBundleFactory(const std::vector<boost::shared_ptr<BundleFactory> >& factories,
const string& condition_name)
: _factories(factories), _condition_name(condition_name) {}
@@ -191,21 +193,35 @@ public:
}
}
- if (non_empty_bundle == false)
+ int locus_id = -1;
+ for (size_t i = 0; i < bundles.size(); ++i)
{
- BOOST_FOREACH (HitBundle* in_bundle, bundles)
+ if (locus_id == -1)
+ locus_id = bundles[i]->id();
+ if (locus_id != bundles[i]->id())
{
- in_bundle->ref_scaffolds().clear();
- in_bundle->clear_hits();
- delete in_bundle;
+ fprintf(stderr, "Error: locus id mismatch!\n");
+ exit(1);
}
- return false;
}
+
+// if (non_empty_bundle == false)
+// {
+// bundle_out.id(locus_id);
+// BOOST_FOREACH (HitBundle* in_bundle, bundles)
+// {
+// in_bundle->ref_scaffolds().clear();
+// in_bundle->clear_hits();
+// delete in_bundle;
+// }
+// return false;
+// }
+
for (size_t i = 1; i < bundles.size(); ++i)
{
- const vector<shared_ptr<Scaffold> >& s1 = bundles[i]->ref_scaffolds();
- const vector<shared_ptr<Scaffold> >& s2 = bundles[i-1]->ref_scaffolds();
+ const vector<boost::shared_ptr<Scaffold> >& s1 = bundles[i]->ref_scaffolds();
+ const vector<boost::shared_ptr<Scaffold> >& s2 = bundles[i-1]->ref_scaffolds();
assert (s1.size() == s2.size());
for (size_t j = 0; j < s1.size(); ++j)
{
@@ -213,16 +229,30 @@ public:
}
}
+ double total_compatible_mass = 0.0;
+ double total_raw_mass = 0.0;
+
+ for (size_t i = 0; i < bundles.size(); ++i)
+ {
+ total_compatible_mass += bundles[i]->compatible_mass();
+ total_raw_mass += bundles[i]->raw_mass();
+ }
+
// Merge the replicates into a combined bundle of hits.
HitBundle::combine(bundles, bundle_out);
+ bundle_out.compatible_mass(total_compatible_mass);
+ bundle_out.add_raw_mass(total_raw_mass);
+
+ bundle_out.id(locus_id);
+
BOOST_FOREACH (HitBundle* in_bundle, bundles)
{
in_bundle->ref_scaffolds().clear();
in_bundle->clear_hits();
delete in_bundle;
}
- return true;
+ return non_empty_bundle;
}
void reset()
@@ -230,7 +260,7 @@ public:
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_rep_factory_lock);
#endif
- BOOST_FOREACH (shared_ptr<BundleFactory> fac, _factories)
+ BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, _factories)
{
fac->reset();
}
@@ -245,14 +275,14 @@ public:
for (size_t fac_idx = 0; fac_idx < _factories.size(); ++fac_idx)
{
- shared_ptr<BundleFactory> fac = _factories[fac_idx];
+ boost::shared_ptr<BundleFactory> fac = _factories[fac_idx];
BadIntronTable bad_introns;
vector<LocusCount> compatible_count_table;
vector<LocusCount> total_count_table;
- inspect_map(*fac, NULL, compatible_count_table, total_count_table, false, false);
+ inspect_map(fac, NULL, compatible_count_table, total_count_table, false, false);
- shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
assert (compatible_count_table.size() == total_count_table.size());
@@ -263,7 +293,7 @@ public:
if (i >= sample_compatible_count_table.size())
{
- LocusCountList locus_count(c.locus_desc, _factories.size(), c.num_transcripts);
+ LocusCountList locus_count(c.locus_desc, _factories.size(), c.num_transcripts, c.gene_ids, c.gene_short_names);
sample_compatible_count_table.push_back(locus_count);
sample_compatible_count_table.back().counts[0] = raw_count;
sample_total_count_table.push_back(locus_count);
@@ -296,7 +326,7 @@ public:
for (size_t i = 0; i < scale_factors.size(); ++i)
{
- shared_ptr<ReadGroupProperties> rg_props = _factories[i]->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = _factories[i]->read_group_properties();
assert (scale_factors[i] != 0);
rg_props->internal_scale_factor(scale_factors[i]);
}
@@ -306,7 +336,7 @@ public:
for (size_t fac_idx = 0; fac_idx < _factories.size(); ++fac_idx)
{
- shared_ptr<ReadGroupProperties> rg_props = _factories[fac_idx]->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = _factories[fac_idx]->read_group_properties();
assert (scale_factors[fac_idx] != 0);
vector<LocusCount> common_scaled_compatible_counts;
vector<LocusCount> common_scaled_total_counts;
@@ -335,9 +365,9 @@ public:
for (size_t fac_idx = 0; fac_idx < _factories.size(); ++fac_idx)
{
- shared_ptr<BundleFactory> fac = _factories[fac_idx];
+ boost::shared_ptr<BundleFactory> fac = _factories[fac_idx];
- shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
const vector<LocusCount>& compatible_count_table = rg_props->common_scale_compatible_counts();
const vector<LocusCount>& total_count_table = rg_props->common_scale_total_counts();
@@ -351,7 +381,7 @@ public:
if (i >= sample_compatible_count_table.size())
{
- LocusCountList locus_count(c.locus_desc, _factories.size(), c.num_transcripts);
+ LocusCountList locus_count(c.locus_desc, _factories.size(), c.num_transcripts, c.gene_ids, c.gene_short_names);
sample_compatible_count_table.push_back(locus_count);
sample_compatible_count_table.back().counts[0] = common_scale_compatible_count;
sample_total_count_table.push_back(locus_count);
@@ -371,7 +401,7 @@ public:
scale_factors.push_back(rg_props->internal_scale_factor());
}
- shared_ptr<MassDispersionModel> disperser;
+ boost::shared_ptr<MassDispersionModel> disperser;
disperser = ::fit_dispersion_model(_condition_name, scale_factors, sample_compatible_count_table);
vector<pair<double, double> > compatible_means_and_vars;
@@ -401,9 +431,9 @@ public:
}
- BOOST_FOREACH (shared_ptr<BundleFactory> fac, _factories)
+ BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, _factories)
{
- shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
+ boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
rg_props->mass_dispersion_model(disperser);
}
}
@@ -411,23 +441,23 @@ public:
// This function NEEDS to deep copy the ref_mRNAs, otherwise cuffdiff'd
// samples will clobber each other
- void set_ref_rnas(const vector<shared_ptr<Scaffold> >& mRNAs)
+ void set_ref_rnas(const vector<boost::shared_ptr<Scaffold> >& mRNAs)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_rep_factory_lock);
#endif
- BOOST_FOREACH(shared_ptr<BundleFactory> fac, _factories)
+ BOOST_FOREACH(boost::shared_ptr<BundleFactory> fac, _factories)
{
fac->set_ref_rnas(mRNAs);
}
}
- void set_mask_rnas(const vector<shared_ptr<Scaffold> >& mRNAs)
+ void set_mask_rnas(const vector<boost::shared_ptr<Scaffold> >& mRNAs)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_rep_factory_lock);
#endif
- BOOST_FOREACH(shared_ptr<BundleFactory> fac, _factories)
+ BOOST_FOREACH(boost::shared_ptr<BundleFactory> fac, _factories)
{
fac->set_mask_rnas(mRNAs);
}
@@ -435,40 +465,40 @@ public:
int num_replicates() const { return _factories.size(); }
- void mass_dispersion_model(shared_ptr<MassDispersionModel const> disperser)
+ void mass_dispersion_model(boost::shared_ptr<MassDispersionModel const> disperser)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_rep_factory_lock);
#endif
- BOOST_FOREACH(shared_ptr<BundleFactory>& fac, _factories)
+ BOOST_FOREACH(boost::shared_ptr<BundleFactory>& fac, _factories)
{
fac->read_group_properties()->mass_dispersion_model(disperser);
}
}
- shared_ptr<MassDispersionModel const> mass_dispersion_model() const
+ boost::shared_ptr<MassDispersionModel const> mass_dispersion_model() const
{
return _factories.front()->read_group_properties()->mass_dispersion_model();
}
- void mle_error_model(shared_ptr<MleErrorModel const> mle_model)
+ void mle_error_model(boost::shared_ptr<MleErrorModel const> mle_model)
{
#if ENABLE_THREADS
boost::mutex::scoped_lock lock(_rep_factory_lock);
#endif
- BOOST_FOREACH(shared_ptr<BundleFactory>& fac, _factories)
+ BOOST_FOREACH(boost::shared_ptr<BundleFactory>& fac, _factories)
{
fac->read_group_properties()->mle_error_model(mle_model);
}
}
- shared_ptr<MleErrorModel const> mle_error_model() const
+ boost::shared_ptr<MleErrorModel const> mle_error_model() const
{
return _factories.front()->read_group_properties()->mle_error_model();
}
private:
- vector<shared_ptr<BundleFactory> > _factories;
+ vector<boost::shared_ptr<BundleFactory> > _factories;
#if ENABLE_THREADS
boost::mutex _rep_factory_lock;
#endif
diff --git a/src/rounding.h b/src/rounding.h
index ca92912..0228fca 100644
--- a/src/rounding.h
+++ b/src/rounding.h
@@ -163,30 +163,7 @@ namespace rounding
// (Either symmetric half-up or half-down will do0
return roundhalfup0( value );
}
-
- //--------------------------------------------------------------------------
- // round alternate
- // Bias: none for sequential calls
- bool _is_up = false;
- template <typename FloatType>
- FloatType roundalternate( const FloatType& value, int& is_up = _is_up )
- {
- if ((is_up != is_up))
- return roundhalfup( value );
- return roundhalfdown( value );
- }
-
- //--------------------------------------------------------------------------
- // symmetric round alternate
- // Bias: none for sequential calls
- template <typename FloatType>
- FloatType roundalternate0( const FloatType& value, int& is_up = _is_up )
- {
- if ((is_up != is_up))
- return roundhalfup0( value );
- return roundhalfdown0( value );
- }
-
+
//--------------------------------------------------------------------------
// round random
// Bias: generator's bias
diff --git a/src/scaffolds.cpp b/src/scaffolds.cpp
index c20c7a6..c6ba69b 100644
--- a/src/scaffolds.cpp
+++ b/src/scaffolds.cpp
@@ -1478,12 +1478,12 @@ bool Scaffold::map_frag(const MateHit& hit, int& start, int& end, int& frag_len)
}
else if (hit.read_group_props()->mate_strand_mapping()==FF)
{
- shared_ptr<const EmpDist> frag_len_dist = hit.read_group_props()->frag_len_dist();
+ boost::shared_ptr<const EmpDist> frag_len_dist = hit.read_group_props()->frag_len_dist();
frag_len = min(frag_len_dist->mode(), trans_len);
}
else
{
- shared_ptr<const EmpDist> frag_len_dist = hit.read_group_props()->frag_len_dist();
+ boost::shared_ptr<const EmpDist> frag_len_dist = hit.read_group_props()->frag_len_dist();
if ((hit.left_alignment()->antisense_align() && strand() != CUFF_REV)
|| (!hit.left_alignment()->antisense_align() && strand() == CUFF_REV))
@@ -1688,7 +1688,7 @@ double Scaffold::internal_exon_coverage() const
return percent_covered;
}
-bool Scaffold::has_strand_support(vector<shared_ptr<Scaffold> >* ref_scaffs) const
+bool Scaffold::has_strand_support(vector<boost::shared_ptr<Scaffold> >* ref_scaffs) const
{
if (strand() == CUFF_STRAND_UNKNOWN)
return false;
@@ -1707,7 +1707,7 @@ bool Scaffold::has_strand_support(vector<shared_ptr<Scaffold> >* ref_scaffs) con
if (ref_scaffs == NULL)
return false;
- BOOST_FOREACH (shared_ptr<Scaffold const> ref_scaff, *ref_scaffs)
+ BOOST_FOREACH (boost::shared_ptr<Scaffold const> ref_scaff, *ref_scaffs)
{
if (ref_scaff->strand() == strand() && exons_overlap(*this, *ref_scaff))
return true;
@@ -1821,17 +1821,17 @@ bool scaff_lt_rt_oplt(const Scaffold& lhs, const Scaffold& rhs)
return false;
}
-bool scaff_lt_sp(shared_ptr<Scaffold> lhs, shared_ptr<Scaffold> rhs)
+bool scaff_lt_sp(boost::shared_ptr<Scaffold> lhs, boost::shared_ptr<Scaffold> rhs)
{
return scaff_lt(*lhs,*rhs);
}
-bool scaff_lt_rt_sp(shared_ptr<Scaffold> lhs, shared_ptr<Scaffold> rhs)
+bool scaff_lt_rt_sp(boost::shared_ptr<Scaffold> lhs, boost::shared_ptr<Scaffold> rhs)
{
return scaff_lt_rt(*lhs,*rhs);
}
-bool scaff_lt_rt_oplt_sp(shared_ptr<Scaffold> lhs, shared_ptr<Scaffold> rhs)
+bool scaff_lt_rt_oplt_sp(boost::shared_ptr<Scaffold> lhs, boost::shared_ptr<Scaffold> rhs)
{
return scaff_lt_rt_oplt(*lhs,*rhs);
}
diff --git a/src/scaffolds.h b/src/scaffolds.h
index 729554e..f78f52a 100644
--- a/src/scaffolds.h
+++ b/src/scaffolds.h
@@ -406,7 +406,7 @@ public:
// returns true if the scaffold strand is supported with reads or exon overlap with
// a reference scaffold of known strand (since the scaffold may have been created with faux reads)
- bool has_strand_support(vector<shared_ptr<Scaffold> >* ref_scaffs = NULL) const;
+ bool has_strand_support(vector<boost::shared_ptr<Scaffold> >* ref_scaffs = NULL) const;
// returns true if all introns are supported with splice reads, false ow
bool hits_support_introns() const;
@@ -689,16 +689,16 @@ private:
bool scaff_lt(const Scaffold& lhs, const Scaffold& rhs);
bool scaff_lt_rt(const Scaffold& lhs, const Scaffold& rhs);
bool scaff_lt_rt_oplt(const Scaffold& lhs, const Scaffold& rhs);
-bool scaff_lt_sp(shared_ptr<Scaffold> lhs, shared_ptr<Scaffold> rhs);
-bool scaff_lt_rt_sp(shared_ptr<Scaffold> lhs, shared_ptr<Scaffold> rhs);
-bool scaff_lt_rt_oplt_sp(shared_ptr<Scaffold> lhs, shared_ptr<Scaffold> rhs);
+bool scaff_lt_sp(boost::shared_ptr<Scaffold> lhs, boost::shared_ptr<Scaffold> rhs);
+bool scaff_lt_rt_sp(boost::shared_ptr<Scaffold> lhs, boost::shared_ptr<Scaffold> rhs);
+bool scaff_lt_rt_oplt_sp(boost::shared_ptr<Scaffold> lhs, boost::shared_ptr<Scaffold> rhs);
bool overlap_in_genome(int ll, int lr, int rl, int rr);
struct StructurallyEqualScaffolds
{
- bool operator()(shared_ptr<Scaffold> lhs, shared_ptr<Scaffold> rhs)
+ bool operator()(boost::shared_ptr<Scaffold> lhs, boost::shared_ptr<Scaffold> rhs)
{
return lhs->ref_id() == rhs->ref_id() &&
lhs->augmented_ops() == rhs->augmented_ops();
diff --git a/src/tracking.cpp b/src/tracking.cpp
new file mode 100644
index 0000000..8663221
--- /dev/null
+++ b/src/tracking.cpp
@@ -0,0 +1,99 @@
+#include "tracking.h"
+
+void add_to_tracking_table(size_t sample_index,
+ Abundance& ab,
+ FPKMTrackingTable& track)
+
+{
+ pair<FPKMTrackingTable::iterator,bool> inserted;
+ pair<string, FPKMTracking > p;
+ p = make_pair(ab.description(), FPKMTracking());
+ inserted = track.insert(p);
+
+ FPKMTracking& fpkm_track = inserted.first->second;
+
+ set<string> tss = ab.tss_id();
+ set<string> gene_ids = ab.gene_id();
+ set<string> genes = ab.gene_name();
+ set<string> proteins = ab.protein_id();
+
+ fpkm_track.tss_ids.insert(tss.begin(), tss.end());
+ fpkm_track.gene_ids.insert(gene_ids.begin(), gene_ids.end());
+ fpkm_track.gene_names.insert(genes.begin(), genes.end());
+ fpkm_track.protein_ids.insert(proteins.begin(), proteins.end());
+
+ if (inserted.second)
+ {
+ fpkm_track.locus_tag = ab.locus_tag();
+ fpkm_track.description = ab.description();
+ boost::shared_ptr<Scaffold> transfrag = ab.transfrag();
+ if (transfrag && transfrag->nearest_ref_id() != "")
+ {
+ fpkm_track.classcode = transfrag->nearest_ref_classcode();
+ fpkm_track.ref_match = transfrag->nearest_ref_id();
+ }
+ else
+ {
+ fpkm_track.classcode = 0;
+ fpkm_track.ref_match = "-";
+ }
+ if (transfrag)
+ {
+ fpkm_track.length = transfrag->length();
+ }
+ else
+ {
+ fpkm_track.length = 0;
+ }
+ }
+
+ FPKMContext r1 = FPKMContext(ab.num_fragments(),
+ ab.num_fragment_var(),
+ ab.num_fragment_uncertainty_var(),
+ ab.mass_variance(),
+ ab.num_fragments_by_replicate(),
+ ab.FPKM(),
+ ab.FPKM_by_replicate(),
+ ab.FPKM_variance(),
+ ab.FPKM_conf().low,
+ ab.FPKM_conf().high,
+ ab.status(),
+ ab.status_by_replicate(),
+ ab.fpkm_samples(),
+ ab.gamma());
+
+
+
+ vector<FPKMContext>& fpkms = inserted.first->second.fpkm_series;
+ if (sample_index < fpkms.size())
+ {
+ // if the fpkm series already has an entry matching this description
+ // for this sample index, then we are dealing with a group of transcripts
+ // that occupies multiple (genomically disjoint) bundles. We need
+ // to add this bundle's contribution to the FPKM, fragments, and variance
+ // to whatever's already there.
+
+ // NOTE: we can simply sum the FKPM_variances, because we are currently
+ // assuming that transcripts in disjoint bundles share no alignments and
+ // thus have FPKM covariance == 0; This assumption will no longer be
+ // true if we decide to do multireads the right way.
+
+ FPKMContext& existing = fpkms[sample_index];
+ existing.FPKM += r1.FPKM;
+ existing.count_mean += r1.count_mean;
+ existing.FPKM_variance += r1.FPKM_variance;
+ if (existing.status == NUMERIC_FAIL || r1.status == NUMERIC_FAIL)
+ {
+ existing.status = NUMERIC_FAIL;
+ }
+ else
+ {
+ existing.status = NUMERIC_OK;
+ }
+
+ }
+ else
+ {
+ fpkms.push_back(r1);
+ }
+}
diff --git a/src/tracking.h b/src/tracking.h
new file mode 100644
index 0000000..27d67e4
--- /dev/null
+++ b/src/tracking.h
@@ -0,0 +1,118 @@
+#include "abundances.h"
+
+struct TrackingInfoPerRep
+{
+ boost::shared_ptr<const ReadGroupProperties> rg_props;
+ double fpkm;
+ double count;
+ AbundanceStatus status;
+};
+
+struct FPKMContext
+{
+ FPKMContext(double cm,
+ double cv,
+ double cuv,
+ double cdv,
+ const CountPerReplicateTable& cpr,
+ double r,
+ const FPKMPerReplicateTable& fpr,
+ double v,
+ double fcl,
+ double fch,
+ AbundanceStatus s,
+ const StatusPerReplicateTable& spr,
+ const vector<double>& fs,
+ double g)
+ : count_mean(cm),
+ count_var(cv),
+ count_uncertainty_var(cuv),
+ count_dispersion_var(cdv),
+
+ FPKM(r),
+ FPKM_variance(v),
+ FPKM_conf_lo(fcl),
+ FPKM_conf_hi(fch),
+ status(s),
+
+ fpkm_samples(fs),
+ gamma(g)
+ {
+ assert (fpr.size() == cpr.size());
+ assert (fpr.size() == spr.size());
+ assert (cpr.size() == spr.size());
+
+ // TODO: should check for proper alignment of these tables...
+ for (CountPerReplicateTable::const_iterator itr = cpr.begin(); itr != cpr.end(); ++itr)
+ {
+ TrackingInfoPerRep info;
+
+ info.rg_props = itr->first;
+ info.count = itr->second;
+
+ FPKMPerReplicateTable::const_iterator f_itr = fpr.find(itr->first);
+ if (f_itr != fpr.end())
+ info.fpkm = f_itr->second;
+
+ StatusPerReplicateTable::const_iterator s_itr = spr.find(itr->first);
+ if (s_itr != spr.end())
+ info.status = s_itr->second;
+
+ tracking_info_per_rep.push_back(info);
+ }
+
+ vector<TrackingInfoPerRep>(tracking_info_per_rep).swap(tracking_info_per_rep);
+ }
+
+ double count_mean;
+ double count_var;
+ double count_uncertainty_var;
+ double count_dispersion_var;
+ vector<TrackingInfoPerRep> tracking_info_per_rep;
+ double FPKM;
+ double FPKM_variance;
+ double FPKM_conf_lo;
+ double FPKM_conf_hi;
+ AbundanceStatus status;
+ vector<double> fpkm_samples;
+ double gamma;
+};
+
+struct FPKMTracking
+{
+ string locus_tag;
+ char classcode;
+ set<string> tss_ids; // for individual isoforms only
+ set<string> gene_ids;
+ set<string> gene_names;
+ set<string> protein_ids;
+ string description; // isoforms or tss groups (e.g.) involved in this test
+ string ref_match;
+ int length;
+
+ vector<vector<boost::shared_ptr<const ReadGroupProperties> > > rg_props;
+
+ vector<FPKMContext> fpkm_series;
+};
+
+typedef map<string, FPKMTracking> FPKMTrackingTable;
+
+struct Tracking
+{
+ FPKMTrackingTable isoform_fpkm_tracking;
+ FPKMTrackingTable tss_group_fpkm_tracking;
+ FPKMTrackingTable gene_fpkm_tracking;
+ FPKMTrackingTable cds_fpkm_tracking;
+
+ void clear()
+ {
+ isoform_fpkm_tracking.clear();
+ tss_group_fpkm_tracking.clear();
+ gene_fpkm_tracking.clear();
+ cds_fpkm_tracking.clear();
+ }
+};
+
+void add_to_tracking_table(size_t sample_index,
+ Abundance& ab,
+ FPKMTrackingTable& track);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/cufflinks.git
More information about the debian-med-commit
mailing list