[med-svn] [gmap] 04/08: Imported Upstream version 2014-12-23
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Fri Mar 13 09:51:49 UTC 2015
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository gmap.
commit 88e58216f154a6dd5ce0d6059ea50dbd1731ed1a
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Thu Mar 12 16:58:25 2015 +0100
Imported Upstream version 2014-12-23
---
ChangeLog | 191 ++++
Makefile.in | 2 +-
TODO | 3 -
VERSION | 2 +-
config/ax_ext.m4 | 11 +-
configure | 200 ++++-
configure.ac | 14 +-
src/ChangeLog | 0
src/Makefile.in | 2 +-
src/bitpack64-read.h | 4 +
src/chimera.c | 59 +-
src/chimera.h | 6 +-
src/compress.h | 5 +-
src/config.h.in | 6 +-
src/doublelist.c | 34 +-
src/doublelist.h | 6 +-
src/dynprog.h | 5 +-
src/except.h | 6 +-
src/genome128_hr.c | 12 +-
src/genomicpos.h | 6 +-
src/gmap.c | 160 +++-
src/gsnap.c | 241 +++---
src/iit-read.h | 7 +-
src/indexdb-write.h | 5 +-
src/indexdb.h | 6 +-
src/indexdbdef.h | 5 +-
src/oligoindex_hr.c | 15 +-
src/oligoindex_hr.h | 9 +-
src/outbuffer.c | 31 +-
src/pair.c | 153 +++-
src/pair.h | 10 +-
src/popcount.c | 6 +-
src/popcount.h | 5 +-
src/sam_sort.c | 21 +-
src/samheader.c | 90 +-
src/samheader.h | 6 +-
src/samprint.c | 328 +++++--
src/samprint.h | 8 +-
src/samread.c | 138 ++-
src/sarray-read.c | 341 ++++++--
src/sequence.h | 7 +-
src/splice.c | 763 ++++++++++------
src/stage1.c | 23 +-
src/stage1hr.c | 2012 +++++++++++++++++++++++++++++--------------
src/stage1hr.h | 15 +-
src/stage2.c | 19 +-
src/stage3.c | 571 +++++-------
src/stage3.h | 15 +-
src/stage3hr.c | 507 +++++++----
src/stage3hr.h | 24 +-
src/substring.c | 186 ++--
src/substring.h | 5 +-
src/uniqscan.c | 13 +-
util/gff3_genes.pl.in | 5 +-
util/gff3_introns.pl.in | 5 +-
util/gff3_splicesites.pl.in | 5 +-
56 files changed, 4335 insertions(+), 1999 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index d56db72..0863155 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,194 @@
+2015-03-06 twu
+
+ * stage3hr.c: Fixed bampair_sort_cmp to eliminate duplicates that were not
+ being removed
+
+ * stage3hr.c: Fixes made to computation of overlap
+
+2015-03-03 twu
+
+ * VERSION: Updated version number
+
+ * ax_ext.m4: Checking for tzcnt to run successfully
+
+ * genome128_hr.c: Changed from HAVE_BMI1 to HAVE_TZCNT
+
+ * stage3.c: Turned off branch that can lead to bad CIGAR strings
+
+2015-02-12 twu
+
+ * VERSION, chimera.c, pair.c, pair.h, public-2014-12-17, src: Applied patch
+ 158533 from trunk for GMAP chimeras. Adding a pre-extension slop in
+ finding paths to pair, but not when finding a breakpoint between the final
+ paths.
+
+2015-02-10 twu
+
+ * stage1.c: Merging revision 158350 from trunk to limit number of results in
+ find_range to 100 to avoid getting bogged down in repeats
+
+ * gmap.c, gsnap.c, pair.c, pair.h, uniqscan.c: Merging revision 158352 from
+ trunk to always add a separator line for gff3 output. Added
+ --gff3-add-separators flag to GMAP.
+
+ * gff3_genes.pl.in, gff3_introns.pl.in, gff3_splicesites.pl.in, util: Merged
+ revision 158348 from trunk to always read chr from line for gff3 files
+ without a gene name
+
+2015-02-04 twu
+
+ * VERSION, public-2014-12-17, src: Updated version number
+
+ * pair.c: Applied patch 158025 to add merge slop in Pair_pathscores to allow
+ better identification of merges
+
+2015-02-03 twu
+
+ * VERSION, gmap.c, pair.c, pair.h, public-2014-12-17, src, stage1hr.c,
+ stage3.c, stage3.h: Merged revisions 157718 to 157789 from trunk to
+ compute goodness for Stage3_T objects every time pairarray is generated
+
+2015-01-29 twu
+
+ * outbuffer.c: Moved mutex of locks outside of loops to clear out backlog in
+ Outbuffer_thread_anyorder and Outbuffer_thread_ordered
+
+ * stage3.c: Applied revision 157718 from trunk to use npairs and matches as
+ secondary criteria beyond goodness in Stage3_cmp
+
+ * stage3.c: Fixed wrong variable name in call to score_introns
+
+2015-01-22 twu
+
+ * memchk.c, popcount.c: Added include of config.h
+
+ * VERSION, bitpack64-read.h, bitpack64-serial-read.h, compress.h, dynprog.h,
+ except.h, genomicpos.h, iit-read.h, indexdb-write.h, indexdb.h,
+ indexdbdef.h, oligoindex_hr.h, popcount.h, public-2014-12-17, samprint.h,
+ sequence.h, sortinfo.h, src: Merged revisions 157224 and 157225 from trunk
+ to remove and add config.h from header files
+
+2015-01-16 twu
+
+ * gmap.c, pair.c, public-2014-12-17, src, stage2.c, stage3.c: Merged
+ revision 156845 from trunk to make better decisions for last exons having
+ partial alignments
+
+ * public-2014-12-17: Created release branch from public-2014-12-16
+
+2015-01-15 twu
+
+ * oligoindex_hr.c: Applied patch 156816 from trunk to allow all diagonals in
+ Oligoindex_get_mappings
+
+ * chimera.c, chimera.h, gmap.c: Applied patch 156811 from trunk to fix
+ non-exon-exon breakpoint and dinucleotides
+
+2015-01-14 twu
+
+ * stage2.c: Applied patch 156104 from trunk to fix uninitialized variable
+ for firstactive
+
+ * stage3hr.c: In anomalous_splice_p procedures, checking for samechr_splice
+ hittypes
+
+ * stage1hr.c: Not running GMAP on samechr_splice hittypes
+
+ * stage1hr.c: Applied patch 156105 from trunk to use correct typecast of
+ ambcoords to (Uint8list_T) NULL for large genomes
+
+2014-12-16 twu
+
+ * sarray-read.c: Applied patch 155495 to fix typo of spliceends_antisense to
+ spliceends_sense
+
+ * samheader.c: Applied patch 155409 to not print tabs if there are no
+ headers
+
+ * doublelist.c: Applied patch 155291 to fix type
+
+ * public-2014-12-16: Created release version from revision 154791
+
+2014-12-06 twu
+
+ * stage1hr.c: Fixed typo in assigning probs_acceptor
+
+ * VERSION, config.site.rescomp.prd, doublelist.c, doublelist.h, gsnap.c,
+ samprint.c, sarray-read.c, splice.c, src, stage1hr.c, stage1hr.h,
+ stage3hr.c, stage3hr.h, trunk, uniqscan.c: Merged revisions 154673 through
+ 154777 from branches/2014-12-04-stage1-ambig to compute ambiguous splicing
+ better in suffix array, stage1, and combining splices. Fixed memory leak
+ and changed criteria for comparing across hits
+
+2014-12-05 twu
+
+ * samprint.c, stage3hr.c, stage3hr.h: Merged revisions 154673 through 154678
+ from branches/2014-12-04-stage1-ambig to change XA field
+
+ * index.html: Updated for latest version
+
+ * configure.ac: Added more detailed messages about our own loading of
+ config.site files to counteract the warning message from the standard
+ autoconf loading
+
+2014-12-04 twu
+
+ * uniqscan.c: Using new interface to Substring_setup
+
+ * gsnap.c: Replaced --terminal-output-minlength with --reject-trimlength
+
+ * stage1hr.c, stage1hr.h: Calling Sarray_search_greedy with nmisses_allowed
+ being cutoff_level, and not querylength. Using reject_trimlength instead
+ of terminal_output_minlength.
+
+ * stage3hr.c, stage3hr.h: Replaced Stage3_filter_terminals with
+ Stage3_reject_trimlengths
+
+ * substring.c, substring.h: Implemented new logic based on
+ reject_trimlength. True terminals from the GSNAP algorithm are allowed at
+ this point (but taken care of now by Stage3end_reject_trimlengths).
+
+ * sarray-read.c: Improved debugging statements
+
+2014-12-03 twu
+
+ * stage3hr.c: No longer trying to clip overlaps when the two ends are not in
+ a concordant orientation
+
+ * outbuffer.c: Using new interface to SAM_print_nomapping
+
+ * samprint.c, samprint.h: Allowing for non-zero npaths to be printed in
+ SAM_print_nomapping as an NH field, which can occur with the
+ --quiet-if-excessive feature
+
+ * samread.c: Allowing for the possibility that XO is the first field in a
+ SAM line
+
+ * stage3hr.c: Fixed problem with --merge-distant-samechr feature giving the
+ wrong chrpos on SAM output on distant splices, since this was being
+ treated the same as a translocation (chrnum == 0)
+
+ * samread.c: Terminating parse_XO procedures for either '\0' or '\n'
+
+ * gmap.c, gsnap.c: Including default variables in --help statement
+
+2014-12-02 twu
+
+ * stage3hr.c: Calculating common_shift to get more even splits between the
+ two paired ends, by accounting for the common shared point between
+ common_right and common_left.
+
+ * sarray-read.c: Fixed typo in a for loop
+
+ * sam_sort.c: Made --no-sam-headers option work correctly
+
+ * sam_sort.c, samheader.c, samheader.h: For --split-output function, writing
+ SAM header files to each output file
+
+2014-11-27 twu
+
+ * archive.html, index.html: Updated for latest version
+
2014-11-26 twu
* README: Added comment about sam_sort and --split-output
diff --git a/Makefile.in b/Makefile.in
index 02f38ca..d6f6e00 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -37,7 +37,7 @@ target_triplet = @target@
subdir = .
DIST_COMMON = README $(am__configure_deps) $(srcdir)/Makefile.am \
$(srcdir)/Makefile.in $(top_srcdir)/configure AUTHORS COPYING \
- ChangeLog INSTALL NEWS TODO config/compile config/config.guess \
+ ChangeLog INSTALL NEWS config/compile config/config.guess \
config/config.sub config/depcomp config/install-sh \
config/ltmain.sh config/missing
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
diff --git a/TODO b/TODO
deleted file mode 100644
index c5a7bd7..0000000
--- a/TODO
+++ /dev/null
@@ -1,3 +0,0 @@
-
-Add flag that allows for splitting afterwards.
-
diff --git a/VERSION b/VERSION
index 7cc4453..8e2517b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2014-11-25
\ No newline at end of file
+2014-12-23
\ No newline at end of file
diff --git a/config/ax_ext.m4 b/config/ax_ext.m4
index 5a0988b..1065623 100644
--- a/config/ax_ext.m4
+++ b/config/ax_ext.m4
@@ -435,8 +435,15 @@ AC_DEFUN([AX_EXT],
fi
if test "$ax_cv_cpu_have_bmi1_ext" = yes; then
- SIMD_CFLAGS="$SIMD_CFLAGS -mbmi"
- AC_DEFINE(HAVE_BMI1,1,[Define to 1 if you support BMI1 (Bit Manipulation Instruction set 1)])
+ CFLAGS=-mbmi
+ AC_RUN_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[return (_tzcnt_u32(0xffffffffu) == 32) ? 0 : 9;]])],
+ [ax_cv_run_tzcnt_ext=yes])
+ if test x"$ax_cv_run_tzcnt_ext" = x"yes"; then
+ SIMD_CFLAGS="$SIMD_CFLAGS -mbmi"
+ AC_DEFINE(HAVE_TZCNT,1,[Define to 1 if you support Intel intrinsic _tzcnt instruction])
+ fi
fi
if test "$ax_cv_cpu_have_bmi2_ext" = yes; then
diff --git a/configure b/configure
index ace2d5f..28a4d62 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.63 for gmap 2014-11-25.
+# Generated by GNU Autoconf 2.63 for gmap 2014-12-23.
#
# Report bugs to <Thomas Wu <twu at gene.com>>.
#
@@ -745,8 +745,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='gmap'
PACKAGE_TARNAME='gmap'
-PACKAGE_VERSION='2014-11-25'
-PACKAGE_STRING='gmap 2014-11-25'
+PACKAGE_VERSION='2014-12-23'
+PACKAGE_STRING='gmap 2014-12-23'
PACKAGE_BUGREPORT='Thomas Wu <twu at gene.com>'
ac_unique_file="src/gmap.c"
@@ -1512,7 +1512,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures gmap 2014-11-25 to adapt to many kinds of systems.
+\`configure' configures gmap 2014-12-23 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1583,7 +1583,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of gmap 2014-11-25:";;
+ short | recursive ) echo "Configuration of gmap 2014-12-23:";;
esac
cat <<\_ACEOF
@@ -1718,7 +1718,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-gmap configure 2014-11-25
+gmap configure 2014-12-23
generated by GNU Autoconf 2.63
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1732,7 +1732,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by gmap $as_me 2014-11-25, which was
+It was created by gmap $as_me 2014-12-23, which was
generated by GNU Autoconf 2.63. Invocation command line was
$ $0 $@
@@ -2102,13 +2102,13 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
{ $as_echo "$as_me:$LINENO: checking package version" >&5
$as_echo_n "checking package version... " >&6; }
-{ $as_echo "$as_me:$LINENO: result: 2014-11-25" >&5
-$as_echo "2014-11-25" >&6; }
+{ $as_echo "$as_me:$LINENO: result: 2014-12-23" >&5
+$as_echo "2014-12-23" >&6; }
### Read defaults
-# Handle CONFIG_SITE
+# Handle CONFIG_SITE better than Autoconf does
for ac_site_file in $CONFIG_SITE; do
if { (eval echo $ac_site_file | grep '^\./' >/dev/null 2>&1)
ac_status=$?
@@ -2116,20 +2116,28 @@ for ac_site_file in $CONFIG_SITE; do
# ac_site_file starts with ./
if test -r "$ac_site_file"; then
ac_site_file="$ac_site_file"
- echo "loading site script $ac_site_file"
+ echo "really loading site script $ac_site_file: file was found"
. "$ac_site_file"
ax_user_site_file_loaded=yes
+ else
+ { { $as_echo "$as_me:$LINENO: error: cannot find $ac_site_file" >&5
+$as_echo "$as_me: error: cannot find $ac_site_file" >&2;}
+ { (exit 1); exit 1; }; }
fi
else
# ac_site_file does not start with ./
if test -r "./$ac_site_file"; then
- echo "loading site script ./$ac_site_file"
+ echo "really loading site script ./$ac_site_file: file was found"
. "./$ac_site_file"
ax_user_site_file_loaded=yes
elif test -r "$ac_site_file"; then
- echo "loading site script $ac_site_file"
- . "$ac_site_file"
+ echo "really loading site script $ac_site_file: file was found"
+ . "$ac_site_file" 2>/dev/null
ax_user_site_file_loaded=yes
+ else
+ { { $as_echo "$as_me:$LINENO: error: cannot find $ac_site_file" >&5
+$as_echo "$as_me: error: cannot find $ac_site_file" >&2;}
+ { (exit 1); exit 1; }; }
fi
fi
done
@@ -4154,7 +4162,7 @@ fi
# Define the identity of the package.
PACKAGE=gmap
- VERSION=2014-11-25
+ VERSION=2014-12-23
cat >>confdefs.h <<_ACEOF
@@ -6250,13 +6258,13 @@ if test "${lt_cv_nm_interface+set}" = set; then
else
lt_cv_nm_interface="BSD nm"
echo "int some_variable = 0;" > conftest.$ac_ext
- (eval echo "\"\$as_me:6253: $ac_compile\"" >&5)
+ (eval echo "\"\$as_me:6261: $ac_compile\"" >&5)
(eval "$ac_compile" 2>conftest.err)
cat conftest.err >&5
- (eval echo "\"\$as_me:6256: $NM \\\"conftest.$ac_objext\\\"\"" >&5)
+ (eval echo "\"\$as_me:6264: $NM \\\"conftest.$ac_objext\\\"\"" >&5)
(eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out)
cat conftest.err >&5
- (eval echo "\"\$as_me:6259: output\"" >&5)
+ (eval echo "\"\$as_me:6267: output\"" >&5)
cat conftest.out >&5
if $GREP 'External.*some_variable' conftest.out > /dev/null; then
lt_cv_nm_interface="MS dumpbin"
@@ -7461,7 +7469,7 @@ ia64-*-hpux*)
;;
*-*-irix6*)
# Find out which ABI we are using.
- echo '#line 7464 "configure"' > conftest.$ac_ext
+ echo '#line 7472 "configure"' > conftest.$ac_ext
if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
(eval $ac_compile) 2>&5
ac_status=$?
@@ -9318,11 +9326,11 @@ else
-e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-e 's:$: $lt_compiler_flag:'`
- (eval echo "\"\$as_me:9321: $lt_compile\"" >&5)
+ (eval echo "\"\$as_me:9329: $lt_compile\"" >&5)
(eval "$lt_compile" 2>conftest.err)
ac_status=$?
cat conftest.err >&5
- echo "$as_me:9325: \$? = $ac_status" >&5
+ echo "$as_me:9333: \$? = $ac_status" >&5
if (exit $ac_status) && test -s "$ac_outfile"; then
# The compiler can only warn and ignore the option if not recognized
# So say no if there are warnings other than the usual output.
@@ -9657,11 +9665,11 @@ else
-e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-e 's:$: $lt_compiler_flag:'`
- (eval echo "\"\$as_me:9660: $lt_compile\"" >&5)
+ (eval echo "\"\$as_me:9668: $lt_compile\"" >&5)
(eval "$lt_compile" 2>conftest.err)
ac_status=$?
cat conftest.err >&5
- echo "$as_me:9664: \$? = $ac_status" >&5
+ echo "$as_me:9672: \$? = $ac_status" >&5
if (exit $ac_status) && test -s "$ac_outfile"; then
# The compiler can only warn and ignore the option if not recognized
# So say no if there are warnings other than the usual output.
@@ -9762,11 +9770,11 @@ else
-e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-e 's:$: $lt_compiler_flag:'`
- (eval echo "\"\$as_me:9765: $lt_compile\"" >&5)
+ (eval echo "\"\$as_me:9773: $lt_compile\"" >&5)
(eval "$lt_compile" 2>out/conftest.err)
ac_status=$?
cat out/conftest.err >&5
- echo "$as_me:9769: \$? = $ac_status" >&5
+ echo "$as_me:9777: \$? = $ac_status" >&5
if (exit $ac_status) && test -s out/conftest2.$ac_objext
then
# The compiler can only warn and ignore the option if not recognized
@@ -9817,11 +9825,11 @@ else
-e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-e 's:$: $lt_compiler_flag:'`
- (eval echo "\"\$as_me:9820: $lt_compile\"" >&5)
+ (eval echo "\"\$as_me:9828: $lt_compile\"" >&5)
(eval "$lt_compile" 2>out/conftest.err)
ac_status=$?
cat out/conftest.err >&5
- echo "$as_me:9824: \$? = $ac_status" >&5
+ echo "$as_me:9832: \$? = $ac_status" >&5
if (exit $ac_status) && test -s out/conftest2.$ac_objext
then
# The compiler can only warn and ignore the option if not recognized
@@ -12620,7 +12628,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 12623 "configure"
+#line 12631 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -12716,7 +12724,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 12719 "configure"
+#line 12727 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -22360,12 +22368,73 @@ _ACEOF
fi
if test "$ax_cv_cpu_have_bmi1_ext" = yes; then
- SIMD_CFLAGS="$SIMD_CFLAGS -mbmi"
+ CFLAGS=-mbmi
+ if test "$cross_compiling" = yes; then
+ { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot run test program while cross compiling
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot run test program while cross compiling
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <immintrin.h>
+int
+main ()
+{
+return (_tzcnt_u32(0xffffffffu) == 32) ? 0 : 9;
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ ax_cv_run_tzcnt_ext=yes
+else
+ $as_echo "$as_me: program exited with status $ac_status" >&5
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+fi
+rm -rf conftest.dSYM
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+
+
+ if test x"$ax_cv_run_tzcnt_ext" = x"yes"; then
+ SIMD_CFLAGS="$SIMD_CFLAGS -mbmi"
cat >>confdefs.h <<\_ACEOF
-#define HAVE_BMI1 1
+#define HAVE_TZCNT 1
_ACEOF
+ fi
fi
if test "$ax_cv_cpu_have_bmi2_ext" = yes; then
@@ -24524,12 +24593,73 @@ _ACEOF
fi
if test "$ax_cv_cpu_have_bmi1_ext" = yes; then
- SIMD_CFLAGS="$SIMD_CFLAGS -mbmi"
+ CFLAGS=-mbmi
+ if test "$cross_compiling" = yes; then
+ { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot run test program while cross compiling
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot run test program while cross compiling
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <immintrin.h>
+int
+main ()
+{
+return (_tzcnt_u32(0xffffffffu) == 32) ? 0 : 9;
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ ax_cv_run_tzcnt_ext=yes
+else
+ $as_echo "$as_me: program exited with status $ac_status" >&5
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+fi
+rm -rf conftest.dSYM
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+
+
+ if test x"$ax_cv_run_tzcnt_ext" = x"yes"; then
+ SIMD_CFLAGS="$SIMD_CFLAGS -mbmi"
cat >>confdefs.h <<\_ACEOF
-#define HAVE_BMI1 1
+#define HAVE_TZCNT 1
_ACEOF
+ fi
fi
if test "$ax_cv_cpu_have_bmi2_ext" = yes; then
@@ -26350,7 +26480,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by gmap $as_me 2014-11-25, which was
+This file was extended by gmap $as_me 2014-12-23, which was
generated by GNU Autoconf 2.63. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -26413,7 +26543,7 @@ Report bugs to <bug-autoconf at gnu.org>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_version="\\
-gmap config.status 2014-11-25
+gmap config.status 2014-12-23
configured by $0, generated by GNU Autoconf 2.63,
with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
diff --git a/configure.ac b/configure.ac
index 3ac6615..9a5b0ed 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@ AC_MSG_RESULT(PKG_VERSION)
### Read defaults
-# Handle CONFIG_SITE
+# Handle CONFIG_SITE better than Autoconf does
for ac_site_file in $CONFIG_SITE; do
if { (eval echo $ac_site_file | grep '^\./' >/dev/null 2>&1)
ac_status=$?
@@ -31,20 +31,24 @@ for ac_site_file in $CONFIG_SITE; do
# ac_site_file starts with ./
if test -r "$ac_site_file"; then
ac_site_file="$ac_site_file"
- echo "loading site script $ac_site_file"
+ echo "really loading site script $ac_site_file: file was found"
. "$ac_site_file"
ax_user_site_file_loaded=yes
+ else
+ AC_MSG_ERROR([cannot find $ac_site_file])
fi
else
# ac_site_file does not start with ./
if test -r "./$ac_site_file"; then
- echo "loading site script ./$ac_site_file"
+ echo "really loading site script ./$ac_site_file: file was found"
. "./$ac_site_file"
ax_user_site_file_loaded=yes
elif test -r "$ac_site_file"; then
- echo "loading site script $ac_site_file"
- . "$ac_site_file"
+ echo "really loading site script $ac_site_file: file was found"
+ . "$ac_site_file" 2>/dev/null
ax_user_site_file_loaded=yes
+ else
+ AC_MSG_ERROR([cannot find $ac_site_file])
fi
fi
done
diff --git a/src/ChangeLog b/src/ChangeLog
deleted file mode 100644
index e69de29..0000000
diff --git a/src/Makefile.in b/src/Makefile.in
index ba57020..7e4b800 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -42,7 +42,7 @@ bin_PROGRAMS = gmap$(EXEEXT) gmapl$(EXEEXT) get-genome$(EXEEXT) \
cmetindex$(EXEEXT) atoiindex$(EXEEXT) sam_sort$(EXEEXT)
subdir = src
DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
- $(srcdir)/config.h.in ChangeLog
+ $(srcdir)/config.h.in
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/config/libtool.m4 \
$(top_srcdir)/config/ltoptions.m4 \
diff --git a/src/bitpack64-read.h b/src/bitpack64-read.h
index c892bbd..f42f9eb 100644
--- a/src/bitpack64-read.h
+++ b/src/bitpack64-read.h
@@ -1,5 +1,9 @@
#ifndef BITPACK64_READ_INCLUDED
#define BITPACK64_READ_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_64_BIT */
+#endif
+
#include "types.h"
/* For reading differential-coded bitstreams */
diff --git a/src/chimera.c b/src/chimera.c
index bed8cb3..8bea549 100644
--- a/src/chimera.c
+++ b/src/chimera.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: chimera.c 149319 2014-09-30 02:15:42Z twu $";
+static char rcsid[] = "$Id: chimera.c 158535 2015-02-12 21:33:37Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -14,6 +14,7 @@ static char rcsid[] = "$Id: chimera.c 149319 2014-09-30 02:15:42Z twu $";
#include "maxent.h"
#include "intron.h"
#include "comp.h"
+#include "complement.h"
#define GBUFFERLEN 1024
@@ -208,7 +209,7 @@ Chimera_free (T *old) {
void
Chimera_print (FILE *fp, T this) {
- if (this->exonexonpos >= 0) {
+ if (this->exonexonpos > 0) {
fprintf(fp," *** Possible chimera with exon-exon boundary");
if (this->cdna_direction > 0) {
fprintf(fp," (sense)");
@@ -468,6 +469,7 @@ Chimera_distant_join_p (Stage3_T from, Stage3_T to, int chimera_slop) {
#define NEG_INFINITY -1000000
+#define PRE_EXTENSION_SLOP 6
bool
Chimera_bestpath (int *five_score, int *three_score, int *chimerapos, int *chimeraequivpos, int *bestfrom, int *bestto,
@@ -491,9 +493,11 @@ Chimera_bestpath (int *five_score, int *three_score, int *chimerapos, int *chime
debug4(printf(" %p",stage3array_sub1[i]));
matrix_sub1[i] = (int *) CALLOC(queryntlength,sizeof(int));
gapp_sub1[i] = (bool *) CALLOC(queryntlength,sizeof(bool));
+ debug4(Pair_dump_array(Stage3_pairarray(stage3array_sub1[i]),Stage3_npairs(stage3array_sub1[i]),true));
+ /* Allow pre_extension_slop, in case the parts need extensions to merge */
Pair_pathscores(gapp_sub1[i],matrix_sub1[i],Stage3_pairarray(stage3array_sub1[i]),
Stage3_npairs(stage3array_sub1[i]),Stage3_cdna_direction(stage3array_sub1[i]),
- queryntlength,FIVE);
+ queryntlength,FIVE,PRE_EXTENSION_SLOP);
}
debug4(printf("\n"));
@@ -504,9 +508,11 @@ Chimera_bestpath (int *five_score, int *three_score, int *chimerapos, int *chime
debug4(printf(" %p",stage3array_sub2[i]));
matrix_sub2[i] = (int *) CALLOC(queryntlength,sizeof(int));
gapp_sub2[i] = (bool *) CALLOC(queryntlength,sizeof(bool));
+ debug4(Pair_dump_array(Stage3_pairarray(stage3array_sub2[i]),Stage3_npairs(stage3array_sub2[i]),true));
+ /* Allow pre_extension_slop, in case the parts need extensions to merge */
Pair_pathscores(gapp_sub2[i],matrix_sub2[i],Stage3_pairarray(stage3array_sub2[i]),
Stage3_npairs(stage3array_sub2[i]),Stage3_cdna_direction(stage3array_sub2[i]),
- queryntlength,THREE);
+ queryntlength,THREE,PRE_EXTENSION_SLOP);
}
debug4(printf("\n"));
@@ -611,24 +617,30 @@ Chimera_bestpath (int *five_score, int *three_score, int *chimerapos, int *chime
return foundp;
}
+static char *complCode = COMPLEMENT_UC;
/* Modeled after Chimera_bestpath */
+/* Called if Chimera_find_exonexon fails */
int
-Chimera_find_breakpoint (int *chimeraequivpos, Stage3_T left_part, Stage3_T right_part,
- int queryntlength) {
- int chimerapos = 0;
+Chimera_find_breakpoint (int *chimeraequivpos, char *donor1, char *donor2, char *acceptor2, char *acceptor1,
+ Stage3_T left_part, Stage3_T right_part, int queryntlength, Genome_T genome) {
+ int chimerapos = 0, breakpoint;
int *matrix_sub1, *matrix_sub2, pos, score, bestscore;
bool *gapp_sub1, *gapp_sub2;
+ Univcoord_T left;
+ /* Don't allow pre_extension_slop here, because the ends have already been extended */
matrix_sub1 = (int *) CALLOC(queryntlength,sizeof(int));
gapp_sub1 = (bool *) CALLOC(queryntlength,sizeof(bool));
+ debug4(Pair_dump_array(Stage3_pairarray(left_part),Stage3_npairs(left_part),true));
Pair_pathscores(gapp_sub1,matrix_sub1,Stage3_pairarray(left_part),Stage3_npairs(left_part),
- Stage3_cdna_direction(left_part),queryntlength,FIVE);
+ Stage3_cdna_direction(left_part),queryntlength,FIVE,/*pre_extension_slop*/0);
matrix_sub2 = (int *) CALLOC(queryntlength,sizeof(int));
gapp_sub2 = (bool *) CALLOC(queryntlength,sizeof(bool));
+ debug4(Pair_dump_array(Stage3_pairarray(right_part),Stage3_npairs(right_part),true));
Pair_pathscores(gapp_sub2,matrix_sub2,Stage3_pairarray(right_part),Stage3_npairs(right_part),
- Stage3_cdna_direction(right_part),queryntlength,THREE);
+ Stage3_cdna_direction(right_part),queryntlength,THREE,/*pre_extension_slop*/0);
bestscore = -100000;
@@ -650,7 +662,12 @@ Chimera_find_breakpoint (int *chimeraequivpos, Stage3_T left_part, Stage3_T righ
if (gapp_sub1[pos] == false) {
if (gapp_sub2[pos+1] == false) {
/* Check for the same stage3 object on both lists */
+#if 0
+ /* ? Old formula for use before Pair_pathscores had cdnaend argument */
score = matrix_sub2[queryntlength-1] - matrix_sub2[pos] + matrix_sub1[pos] /* - 0 */;
+#else
+ score = matrix_sub1[pos] + matrix_sub2[pos+1];
+#endif
if (score > bestscore) {
bestscore = score;
@@ -660,7 +677,7 @@ Chimera_find_breakpoint (int *chimeraequivpos, Stage3_T left_part, Stage3_T righ
}
debug(
- printf("%d",score);
+ printf("%d = %d + %d",score,matrix_sub1[pos],matrix_sub2[pos+1]);
if (pos >= chimerapos && pos <= *chimeraequivpos) {
printf(" ** ");
}
@@ -682,6 +699,28 @@ Chimera_find_breakpoint (int *chimeraequivpos, Stage3_T left_part, Stage3_T righ
FREE(gapp_sub1);
FREE(matrix_sub1);
+ breakpoint = (chimerapos + (*chimeraequivpos))/2;
+
+ if (Stage3_watsonp(left_part) == true) {
+ left = Stage3_genomicpos(left_part,breakpoint,/*headp*/false);
+ *donor1 = Genome_get_char(genome,left+1);
+ *donor2 = Genome_get_char(genome,left+2);
+ } else {
+ left = Stage3_genomicpos(left_part,breakpoint,/*headp*/false);
+ *donor1 = complCode[(int) Genome_get_char(genome,left-1)];
+ *donor2 = complCode[(int) Genome_get_char(genome,left-2)];
+ }
+
+ if (Stage3_watsonp(right_part) == true) {
+ left = Stage3_genomicpos(right_part,breakpoint+1,/*headp*/true);
+ *acceptor2 = Genome_get_char(genome,left-2);
+ *acceptor1 = Genome_get_char(genome,left-1);
+ } else {
+ left = Stage3_genomicpos(right_part,breakpoint+1,/*headp*/true);
+ *acceptor2 = complCode[(int) Genome_get_char(genome,left+2)];
+ *acceptor1 = complCode[(int) Genome_get_char(genome,left+1)];
+ }
+
return chimerapos;
}
diff --git a/src/chimera.h b/src/chimera.h
index 287b35f..8f6116a 100644
--- a/src/chimera.h
+++ b/src/chimera.h
@@ -1,4 +1,4 @@
-/* $Id: chimera.h 149319 2014-09-30 02:15:42Z twu $ */
+/* $Id: chimera.h 156812 2015-01-15 20:55:07Z twu $ */
#ifndef CHIMERA_INCLUDED
#define CHIMERA_INCLUDED
@@ -52,8 +52,8 @@ Chimera_bestpath (int *five_score, int *three_score, int *chimerapos, int *chime
Stage3_T *stage3array_sub1, int npaths_sub1, Stage3_T *stage3array_sub2, int npaths_sub2,
int queryntlength, int chimera_slop, bool localp);
extern int
-Chimera_find_breakpoint (int *chimeraequivpos, Stage3_T left_part, Stage3_T right_part,
- int queryntlength);
+Chimera_find_breakpoint (int *chimeraequivpos, char *donor1, char *donor2, char *acceptor2, char *acceptor1,
+ Stage3_T left_part, Stage3_T right_part, int queryntlength, Genome_T genome);
#if 0
extern void
diff --git a/src/compress.h b/src/compress.h
index dab0777..3a60527 100644
--- a/src/compress.h
+++ b/src/compress.h
@@ -1,6 +1,9 @@
-/* $Id: compress.h 134887 2014-05-01 23:30:10Z twu $ */
+/* $Id: compress.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef COMPRESS_INCLUDED
#define COMPRESS_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_SSE2 */
+#endif
#include <stdio.h>
#include "bool.h"
diff --git a/src/config.h.in b/src/config.h.in
index 9f7c832..fae4677 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -35,9 +35,6 @@
*/
#undef HAVE_AVX2
-/* Define to 1 if you support BMI1 (Bit Manipulation Instruction set 1) */
-#undef HAVE_BMI1
-
/* Define to 1 if you support BMI2 (Bit Manipulation Instruction set 2) */
#undef HAVE_BMI2
@@ -236,6 +233,9 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
+/* Define to 1 if you support Intel intrinsic _tzcnt instruction */
+#undef HAVE_TZCNT
+
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
diff --git a/src/doublelist.c b/src/doublelist.c
index bdd41b0..c95afa1 100644
--- a/src/doublelist.c
+++ b/src/doublelist.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: doublelist.c 145990 2014-08-25 21:47:32Z twu $";
+static char rcsid[] = "$Id: doublelist.c 155502 2014-12-16 22:22:35Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -104,6 +104,24 @@ Doublelist_to_array (int *n, T list) {
}
}
+double *
+Doublelist_to_array_out (int *n, T list) {
+ double *array;
+ int i;
+
+ *n = Doublelist_length(list);
+ if (*n == 0) {
+ return NULL;
+ } else {
+ array = (double *) CALLOC_OUT(*n,sizeof(double));
+ for (i = 0; i < *n; i++) {
+ array[i] = list->first;
+ list = list->rest;
+ }
+ return array;
+ }
+}
+
void
Doublelist_fill_array (double *array, T list) {
int i = 0;
@@ -133,6 +151,20 @@ Doublelist_from_string (char *string) {
return doublelist;
}
+T
+Doublelist_from_array (double *array, int n) {
+ T list = NULL, p;
+
+ while (--n >= 0) {
+ p = (T) MALLOC(sizeof(*p));
+ p->first = array[n];
+ p->rest = list;
+ list = p;
+ }
+
+ return list;
+}
+
double
Doublelist_max (T this) {
T p;
diff --git a/src/doublelist.h b/src/doublelist.h
index 0dcf7aa..752e902 100644
--- a/src/doublelist.h
+++ b/src/doublelist.h
@@ -1,4 +1,4 @@
-/* $Id: doublelist.h 145990 2014-08-25 21:47:32Z twu $ */
+/* $Id: doublelist.h 154778 2014-12-06 03:32:33Z twu $ */
#ifndef DOUBLELIST_INCLUDED
#define DOUBLELIST_INCLUDED
@@ -14,9 +14,13 @@ extern T Doublelist_reverse (T list);
extern int Doublelist_length (T list);
extern double *
Doublelist_to_array (int *n, T list);
+extern double *
+Doublelist_to_array_out (int *n, T list);
extern void
Doublelist_fill_array (double *array, T list);
extern T Doublelist_from_string (char *string);
+extern T
+Doublelist_from_array (double *array, int n);
extern double
Doublelist_max (T this);
extern double
diff --git a/src/dynprog.h b/src/dynprog.h
index 4077f0a..ab4cdae 100644
--- a/src/dynprog.h
+++ b/src/dynprog.h
@@ -1,6 +1,9 @@
-/* $Id: dynprog.h 141804 2014-07-17 02:20:36Z twu $ */
+/* $Id: dynprog.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef DYNPROG_INCLUDED
#define DYNPROG_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_SSE2, HAVE_SSE4_1 */
+#endif
/* BEST_LOCAL is a local alignment, whereas QUERYEND_INDELS and
QUERYEND_NOGAPS are global. QUERYEND_GAP allows an intron at the
diff --git a/src/except.h b/src/except.h
index 198e9b8..af2680b 100644
--- a/src/except.h
+++ b/src/except.h
@@ -1,6 +1,10 @@
-/* $Id: except.h 40271 2011-05-28 02:29:18Z twu $ */
+/* $Id: except.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef EXCEPT_INCLUDED
#define EXCEPT_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_PTHREAD */
+#endif
+
#include <setjmp.h>
#define T Except_T
diff --git a/src/genome128_hr.c b/src/genome128_hr.c
index d455af1..d77f6e0 100644
--- a/src/genome128_hr.c
+++ b/src/genome128_hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: genome128_hr.c 151045 2014-10-16 19:08:17Z twu $";
+static char rcsid[] = "$Id: genome128_hr.c 160005 2015-03-03 02:08:47Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -36,7 +36,7 @@ static char rcsid[] = "$Id: genome128_hr.c 151045 2014-10-16 19:08:17Z twu $";
#include <nmmintrin.h>
#endif
-#if defined(HAVE_LZCNT) || defined(HAVE_BMI1)
+#if defined(HAVE_LZCNT) || defined(HAVE_TZCNT)
#include <immintrin.h>
#endif
@@ -18444,7 +18444,7 @@ count_trailing_zeroes (__m128i _diff) {
#ifdef HAVE_SSE4_1
UINT8 x;
-#ifdef HAVE_BMI1
+#ifdef HAVE_TZCNT
if ((x = _mm_extract_epi64(_diff,0)) != 0) {
return (int) _tzcnt_u64(x);
} else {
@@ -18462,7 +18462,7 @@ count_trailing_zeroes (__m128i _diff) {
#else
UINT4 x;
-#ifdef HAVE_BMI1
+#ifdef HAVE_TZCNT
if ((x = (_mm_extract_epi16(_diff,1) << 16) | (_mm_extract_epi16(_diff,0) & 0x0000FFFF)) != 0) {
debug4(printf("word 0 is non-empty, so returning %d\n",_tzcnt_u32(x)));
return _tzcnt_u32(x);
@@ -18652,7 +18652,7 @@ print_diff_leading_zeroes (__m128i _diff, int offset) {
#define count_leading_zeroes(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
#endif
-#ifdef HAVE_BMI1
+#ifdef HAVE_TZCNT
#define count_trailing_zeroes(diff) _tzcnt_u32(diff)
#elif defined(HAVE_BUILTIN_CTZ)
#define count_trailing_zeroes(diff) __builtin_ctz(diff)
@@ -18717,7 +18717,7 @@ print_diff_leading_zeroes (UINT4 diff, int offset) {
#define count_leading_zeroes_32(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
#endif
-#ifdef HAVE_BMI1
+#ifdef HAVE_TZCNT
#define count_trailing_zeroes_32(diff) _tzcnt_u32(diff)
#elif defined(HAVE_BUILTIN_CTZ)
#define count_trailing_zeroes_32(diff) __builtin_ctz(diff)
diff --git a/src/genomicpos.h b/src/genomicpos.h
index 590b01b..74c4ca7 100644
--- a/src/genomicpos.h
+++ b/src/genomicpos.h
@@ -1,6 +1,10 @@
-/* $Id: genomicpos.h 145990 2014-08-25 21:47:32Z twu $ */
+/* $Id: genomicpos.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef GENOMICPOS_INCLUDED
#define GENOMICPOS_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_64_BIT */
+#endif
+
#include <stdlib.h>
#include "types.h"
diff --git a/src/gmap.c b/src/gmap.c
index 03aa19f..d668a1c 100644
--- a/src/gmap.c
+++ b/src/gmap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gmap.c 153947 2014-11-24 17:46:05Z twu $";
+static char rcsid[] = "$Id: gmap.c 158355 2015-02-10 19:08:45Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -336,6 +336,9 @@ static int suboptimal_score = 1000000;
static bool require_splicedir_p = false;
+/* GFF3 */
+static bool gff3_separators_p = true;
+
/* SAM */
#ifndef PMAP
static bool sam_paired_p = false;
@@ -511,6 +514,8 @@ static struct option long_options[] = {
{"suboptimal-score", required_argument, 0, 0}, /* suboptimal_score */
{"require-splicedir", no_argument, 0, 0}, /* require_splicedir_p */
+ {"gff3-add-separators", required_argument, 0, 0}, /* gff3_separators_p */
+
#ifndef PMAP
{"quality-protocol", required_argument, 0, 0}, /* quality_shift */
{"quality-print-shift", required_argument, 0, 'j'}, /* quality_shift */
@@ -849,7 +854,8 @@ stage3array_from_list (int *npaths, int *first_absmq, int *second_absmq, List_T
int threshold_score;
debug2(printf("Entering stage3array_from_list\n"));
- Stage3_recompute_goodness(stage3list); /* Is this necessary? */
+ /* Stage3_recompute_goodness(stage3list); -- No longer necessary */
+ Stage3_compute_mapq(stage3list);
if ((norig = List_length(stage3list)) == 0) {
*first_absmq = 0;
@@ -909,6 +915,8 @@ stage3array_from_list (int *npaths, int *first_absmq, int *second_absmq, List_T
qsort(array0,norig,sizeof(Stage3_T),Stage3_cmp);
for (i = 0; i < norig; i++) {
x = array0[i];
+ debug(printf("%d: chr %d:%u..%u, goodness %d, matches %d, npairs %d\n",
+ i,Stage3_chrnum(x),Stage3_chrstart(x),Stage3_chrend(x),Stage3_goodness(x),Stage3_matches(x),Stage3_npairs(x)));
for (j = i+1; j < norig; j++) {
y = array0[j];
if (Stage3_overlap(x,y)) {
@@ -982,6 +990,7 @@ update_stage3list (List_T stage3list, bool lowidentityp, Sequence_T queryseq,
struct Pair_T *pairarray;
List_T pairs;
+ int goodness;
int npairs, cdna_direction, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
ncanonical, nsemicanonical, nnoncanonical;
int sensedir;
@@ -1043,7 +1052,7 @@ update_stage3list (List_T stage3list, bool lowidentityp, Sequence_T queryseq,
#ifdef PMAP
subseq_offset = Sequence_subseq_offset(queryseq); /* in nucleotides */
#endif
- pairarray = Stage3_compute(&pairs,&npairs,&cdna_direction,&sensedir,
+ pairarray = Stage3_compute(&pairs,&npairs,&goodness,&cdna_direction,&sensedir,
&matches,&nmatches_posttrim,&max_match_length,
&ambig_end_length_5,&ambig_end_length_3,
&ambig_splicetype_5,&ambig_splicetype_3,
@@ -1079,7 +1088,7 @@ update_stage3list (List_T stage3list, bool lowidentityp, Sequence_T queryseq,
/* Skip */
} else if (matches < min_matches) {
FREE_OUT(pairarray);
- } else if ((stage3 = Stage3_new(pairarray,pairs,npairs,cdna_direction,sensedir,
+ } else if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
stage2_source,stage2_indexsize,matches,unknowns,mismatches,
qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
chrnum,chroffset,chrhigh,chrlength,watsonp,
@@ -2272,7 +2281,10 @@ find_breakpoint (int *cdna_direction, int *chimerapos, int *chimeraequivpos, int
debug2(printf("Exon-exon boundary found at %d, which is breakpoint. Comp = %c\n",
*exonexonpos,comp));
} else {
- *chimerapos = Chimera_find_breakpoint(&(*chimeraequivpos),from,to,queryntlength);
+ *chimerapos = Chimera_find_breakpoint(&(*chimeraequivpos),&(*donor1),&(*donor2),&(*acceptor2),&(*acceptor1),
+ from,to,queryntlength,genome);
+ *donor_prob = *acceptor_prob = 0.0;
+
debug2(printf("Chimera_find_breakpoint returns boundary at %d..%d (switch can occur at %d..%d)\n",
*chimerapos,*chimeraequivpos,(*chimerapos)-1,*chimeraequivpos));
@@ -2336,7 +2348,7 @@ check_for_local (bool *mergedp, List_T stage3list, int effective_start, int effe
}
#endif
- Stage3_recompute_goodness(stage3list);
+ /* Stage3_recompute_goodness(stage3list); */
max_single_goodness = 0;
for (p = stage3list; p != NULL; p = List_next(p)) {
stage3 = (Stage3_T) List_head(p);
@@ -2480,8 +2492,8 @@ check_for_local (bool *mergedp, List_T stage3list, int effective_start, int effe
if ((querysubseq = Sequence_subsequence(queryseq,effective_end-extension,queryntlength)) != NULL) {
if ((querysubuc = Sequence_subsequence(queryuc,effective_end-extension,queryntlength)) != NULL) {
debug2(printf("5 margin <= 3 margin. "));
- debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d)\n",
- effective_end,effective_end-extension,queryntlength));
+ debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d) (extension %d)\n",
+ effective_end,effective_end-extension,queryntlength,extension));
debug2(Sequence_print(stdout,querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
@@ -2522,8 +2534,8 @@ check_for_local (bool *mergedp, List_T stage3list, int effective_start, int effe
if ((querysubseq = Sequence_subsequence(queryseq,0,effective_end)) != NULL) {
if ((querysubuc = Sequence_subsequence(queryuc,0,effective_end)) != NULL) {
debug2(printf("Recomputing on original part. "));
- debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d)\n",
- effective_end,0,effective_end));
+ debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d), extension %d\n",
+ effective_end,0,effective_end,extension));
debug2(Sequence_print(stdout,querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
@@ -2698,7 +2710,7 @@ check_for_chimera (bool *mergedp, Chimera_T *chimera, List_T stage3list, int eff
}
#endif
- Stage3_recompute_goodness(stage3list);
+ /* Stage3_recompute_goodness(stage3list); */
max_single_goodness = 0;
for (p = stage3list; p != NULL; p = List_next(p)) {
stage3 = (Stage3_T) List_head(p);
@@ -3682,7 +3694,7 @@ apply_stage3 (bool *mergedp, Chimera_T *chimera, List_T gregions, bool lowidenti
debug2(printf("\n\n*** Testing for local on %d Stage3_T objects, iter %d ***\n",
List_length(stage3list),iter));
- Stage3_recompute_goodness(stage3list);
+ /* Stage3_recompute_goodness(stage3list); */
stage3list = stage3list_remove_duplicates(stage3list);
stage3list = stage3list_sort(stage3list);
@@ -3695,26 +3707,34 @@ apply_stage3 (bool *mergedp, Chimera_T *chimera, List_T gregions, bool lowidenti
nonchimericbest = (Stage3_T) List_head(stage3list);
debug2(printf("nonchimericbest is %p\n",nonchimericbest));
+#if 0
+ if (List_length(stage3list) <= 1) {
+ debug2(printf("Only 0 or 1 alignments, so won't look for local\n"));
+ testlocalp = false;
+ }
+ else
+#endif
+
if (Stage3_domain(nonchimericbest) < chimera_margin) {
- debug2(printf("Existing alignment is too short, so won't look for chimera\n"));
+ debug2(printf("Existing alignment is too short, so won't look for local\n"));
testlocalp = false;
#if 0
} else if (Stage3_fracidentity(nonchimericbest) < CHIMERA_IDENTITY &&
Chimera_alignment_break(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq),CHIMERA_FVALUE) >= chimera_margin
) {
- debug2(printf("Break in alignment quality at %d..%d detected, so will look for chimera\n",
+ debug2(printf("Break in alignment quality at %d..%d detected, so will look for local\n",
effective_start,effective_end));
testlocalp = true;
#endif
} else if (Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)) >= chimera_margin) {
- debug2(printf("Large margin at %d..%d detected (%d >= %d), so will look for chimera\n",
+ debug2(printf("Large margin at %d..%d detected (%d >= %d), so will look for local\n",
effective_start,effective_end,Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)),chimera_margin));
testlocalp = true;
} else {
- debug2(printf("Good alignment already with identity %f, so won't look for chimera\n",
+ debug2(printf("Good alignment already with identity %f, so won't look for local\n",
Stage3_fracidentity(nonchimericbest)));
testlocalp = false;
}
@@ -3757,7 +3777,7 @@ apply_stage3 (bool *mergedp, Chimera_T *chimera, List_T gregions, bool lowidenti
debug2(printf("\n\n*** Testing for chimera on %d Stage3_T objects, iter %d ***\n",
List_length(stage3list),iter));
- Stage3_recompute_goodness(stage3list);
+ /* Stage3_recompute_goodness(stage3list); */
stage3list = stage3list_remove_duplicates(stage3list);
stage3list = stage3list_sort(stage3list);
@@ -3845,7 +3865,7 @@ apply_stage3 (bool *mergedp, Chimera_T *chimera, List_T gregions, bool lowidenti
debug2(printf("apply_stage3 returning list of length %d\n",List_length(stage3list)));
/* Needed after call to stage3_from_gregions */
- Stage3_recompute_goodness(stage3list);
+ /* Stage3_recompute_goodness(stage3list); */
/* Final call, so do both filtering and sorting */
Stage3_recompute_coverage(stage3list,queryseq);
@@ -4853,6 +4873,15 @@ main (int argc, char *argv[]) {
sevenway_root = optarg;
} else if (!strcmp(long_name,"append-output")) {
appendp = true;
+ } else if (!strcmp(long_name,"gff3-add-separators")) {
+ if (!strcmp(optarg,"1")) {
+ gff3_separators_p = true;
+ } else if (!strcmp(optarg,"0")) {
+ gff3_separators_p = false;
+ } else {
+ fprintf(stderr,"--gff3-add-separators flag must be 0 or 1\n");
+ exit(9);
+ }
#ifndef PMAP
} else if (!strcmp(long_name,"no-sam-headers")) {
sam_headers_p = false;
@@ -5936,7 +5965,7 @@ main (int argc, char *argv[]) {
donor_typeint,acceptor_typeint);
Dynprog_end_setup(splicesites,splicetypes,splicedists,nsplicesites,
trieoffsets_obs,triecontents_obs,trieoffsets_max,triecontents_max);
- Pair_setup(trim_mismatch_score,trim_indel_score,sam_insert_0M_p,
+ Pair_setup(trim_mismatch_score,trim_indel_score,gff3_separators_p,sam_insert_0M_p,
force_xs_direction_p,md_lowercase_variant_p,
/*snps_p*/genomecomp_alt ? true : false,genomelength,cigar_action);
Stage3_setup(/*splicingp*/novelsplicingp == true || knownsplicingp == true,novelsplicingp,
@@ -6162,16 +6191,21 @@ Usage: gmap [OPTIONS...] <FASTA files...>, or\n\
cat <FASTA files...> | gmap [OPTIONS...]\n\
");
#endif
+ fprintf(stdout,"\n");
- fprintf(stdout,"\n\
-Input options (must include -d or -g)\n\
- -D, --dir=directory Genome directory\n\
+ fprintf(stdout,"Input options (must include -d or -g)\n");
+ fprintf(stdout,"\
+ -D, --dir=directory Genome directory. Default (as specified by --with-gmapdb to the configure program) is\n \
+ %s\n\
+",GMAPDB);
+ fprintf(stdout,"\
-d, --db=STRING Genome database. If argument is '?' (with\n\
the quotes), this command lists available databases.\n\
");
+ fprintf(stdout,"\n");
#ifdef PMAP
- fprintf(stdout,"\n\
+ fprintf(stdout,"\
-a, --alphabet=STRING Alphabet to use in PMAP genome database\n\
(allowed values in order of preference: 20, 15a, 12a).\n\
If not specified, the program will find the first available\n\
@@ -6179,7 +6213,7 @@ Input options (must include -d or -g)\n\
");
#endif
- fprintf(stdout,"\n\
+ fprintf(stdout,"\
-k, --kmer=INT kmer size to use in genome database (allowed values: 16 or less).\n\
If not specified, the program will find the highest available\n\
kmer size in the genome database\n\
@@ -6199,10 +6233,12 @@ Input options (must include -d or -g)\n\
-q, --part=INT/INT Process only the i-th out of every n sequences\n\
e.g., 0/100 or 99/100 (useful for distributing jobs\n\
to a computer farm).\n\
- --input-buffer-size=INT Size of input buffer (program reads this many sequences\n\
- at a time for efficiency) (default 1000)\n\
-\n\
");
+ fprintf(stdout,"\
+ --input-buffer-size=INT Size of input buffer (program reads this many sequences\n\
+ at a time for efficiency) (default %d)\n\
+",inbuffer_nspaces);
+ fprintf(stdout,"\n");
fprintf(stdout,"Computation options\n");
#ifdef HAVE_MMAP
@@ -6239,17 +6275,29 @@ Input options (must include -d or -g)\n\
fprintf(stdout,"\
--nosplicing Turns off splicing (useful for aligning genomic sequences\n\
onto a genome)\n\
- --min-intronlength=INT Min length for one internal intron (default 9). Below this size,\n\
+");
+ fprintf(stdout,"\
+ --min-intronlength=INT Min length for one internal intron (default %d). Below this size,\n\
a genomic gap will be considered a deletion rather than an intron.\n\
- -K, --intronlength=INT Max length for one internal intron (default 1000000)\n\
+",min_intronlength);
+ fprintf(stdout,"\
+ -K, --intronlength=INT Max length for one internal intron (default %d)\n\
+",maxintronlen_bound);
+ fprintf(stdout,"\
-w, --localsplicedist=INT Max length for known splice sites at ends of sequence\n\
- (default 2,000,000)\n\
- -L, --totallength=INT Max total intron length (default 2400000)\n\
+ (default %d)\n\
+",shortsplicedist);
+ fprintf(stdout,"\
+ -L, --totallength=INT Max total intron length (default %d)\n\
+",maxtotallen_bound);
+ fprintf(stdout,"\
-x, --chimera-margin=INT Amount of unaligned sequence that triggers\n\
- search for the remaining sequence (default 30).\n\
+ search for the remaining sequence (default %d).\n\
Enables alignment of chimeric reads, and may help\n\
with some non-chimeric reads. To turn off, set to\n\
zero.\n\
+",chimera_margin);
+ fprintf(stdout,"\
--no-chimeras Turns off finding of chimeras. Same effect as --chimera-margin=0\n\
");
@@ -6272,8 +6320,12 @@ Input options (must include -d or -g)\n\
-c, --chrsubset=string Limit search to given chromosome\n\
-z, --direction=STRING cDNA direction (sense_force, antisense_force,\n\
sense_filter, antisense_filter,or auto (default))\n\
+");
+ fprintf(stdout,"\
-H, --trimendexons=INT Trim end exons with fewer than given number of matches\n\
- (in nt, default 12)\n\
+ (in nt, default %d)\n\
+",minendexon);
+ fprintf(stdout,"\
--canonical-mode=INT Reward for canonical and semi-canonical introns\n\
0=low reward, 1=high reward (default), 2=low reward for\n\
high-identity sequences and high reward otherwise\n\
@@ -6281,9 +6333,11 @@ Input options (must include -d or -g)\n\
for cross-species alignments and other difficult cases\n\
--allow-close-indels=INT Allow an insertion and deletion close to each other\n\
(0=no, 1=yes (default), 2=only for high-quality alignments)\n\
- --microexon-spliceprob=FLOAT Allow microexons only if one of the splice site probabilities is\n\
- greater than this value (default 0.90)\n\
");
+ fprintf(stdout,"\
+ --microexon-spliceprob=FLOAT Allow microexons only if one of the splice site probabilities is\n\
+ greater than this value (default %.2f)\n\
+",microexon_spliceprob);
#if 0
fprintf(stdout,"\
@@ -6318,8 +6372,8 @@ Input options (must include -d or -g)\n\
2=repetitive seqs, 3=poor and repetitive\n\
");
#endif
-
fprintf(stdout,"\n");
+
fprintf(stdout,"\
Output types\n\
-S, --summary Show summary of alignments only\n\
@@ -6376,10 +6430,12 @@ Output types\n\
fprintf(stdout,"\
Output options\n\
- -n, --npaths=INT Maximum number of paths to show (default 5). If set to 1, GMAP\n\
+ -n, --npaths=INT Maximum number of paths to show (default %d). If set to 1, GMAP\n\
will not report chimeric alignments, since those imply\n\
two paths. If you want a single alignment plus chimeric\n\
alignments, then set this to be 0.\n\
+",maxpaths);
+ fprintf(stdout,"\
--suboptimal-score=INT Report only paths whose score is within this value of the\n\
best path. By default, if this option is not provided,\n\
the program prints all paths found.\n\
@@ -6404,14 +6460,16 @@ Output options\n\
is generated in addition to the output in the .nomapping file.\n\
--append-output When --split-output or --failedinput is given, this flag will append output\n\
to the existing files. Otherwise, the default is to create new files.\n\
- --output-buffer-size=INT Buffer size, in queries, for output thread (default 1000). When the number\n\
+");
+ fprintf(stdout,"\
+ --output-buffer-size=INT Buffer size, in queries, for output thread (default %d). When the number\n\
of results to be printed exceeds this size, the worker threads are halted\n\
until the backlog is cleared\n\
-");
+",output_buffer_size);
#ifdef PMAP
- fprintf(stdout,"\
+ fprintf(stdout,"\
-Y, --tolerant Translates genome with corrections for frameshifts\n\
");
#else
@@ -6424,7 +6482,14 @@ Output options\n\
");
#endif
- fprintf(stdout,"\n");
+ fprintf(stdout,"\n");
+
+ fprintf(stdout,"Options for GFF3 output\n");
+ fprintf(stdout,"\
+ --gff3-add-separators=INT Whether to add a ### separator after each query sequence\n\
+ Values: 0 (no), 1 (yes, default)\n\
+");
+ fprintf(stdout,"\n");
#ifndef PMAP
fprintf(stdout,"Options for SAM output\n");
@@ -6484,9 +6549,16 @@ Alignment output options\n\
0=Don't invert the cDNA (default)\n\
1=Invert cDNA and print genomic (-) strand\n\
2=Invert cDNA and print genomic (+) strand\n\
- -i, --introngap=INT Nucleotides to show on each end of intron (default=3)\n\
- -l, --wraplength=INT Wrap length for alignment (default=50)\n\
-\n\
+");
+ fprintf(stdout,"\
+ -i, --introngap=INT Nucleotides to show on each end of intron (default %d)\n\
+",ngap);
+ fprintf(stdout,"\
+ -l, --wraplength=INT Wrap length for alignment (default %d)\n\
+",wraplength);
+ fprintf(stdout,"\n");
+
+ fprintf(stdout,"\
Filtering output options\n\
--min-trimmed-coverage=FLOAT Do not print alignments with trimmed coverage less\n\
this value (default=0.0, which means no filtering)\n\
diff --git a/src/gsnap.c b/src/gsnap.c
index ef74b93..a9d3690 100644
--- a/src/gsnap.c
+++ b/src/gsnap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gsnap.c 153947 2014-11-24 17:46:05Z twu $";
+static char rcsid[] = "$Id: gsnap.c 158355 2015-02-10 19:08:45Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -251,8 +251,8 @@ static int subopt_levels = 0;
static double user_maxlevel_float = -1.0;
static int terminal_threshold = 2;
-static int terminal_output_minlength = 25;
-static bool user_terminal_output_minlength_p = false;
+static int reject_trimlength = 1000;
+static bool user_reject_trimlength_p = false;
/* Really have only one indel penalty */
static int indel_penalty_middle = 2;
@@ -468,7 +468,7 @@ static struct option long_options[] = {
{"max-mismatches", required_argument, 0, 'm'}, /* user_maxlevel_float */
{"terminal-threshold", required_argument, 0, 0}, /* terminal_threshold */
- {"terminal-output-minlength", required_argument, 0, 0}, /* terminal_output_minlength, user_terminal_output_minlength_p */
+ {"reject-trimlength", required_argument, 0, 0}, /* reject_trimlength, user_reject_trimlength_p */
#if 0
{"indel-penalty-middle", required_argument, 0, 'i'}, /* indel_penalty_middle */
@@ -824,7 +824,7 @@ process_request (Request_T request, Floors_T *floors_array,
if (queryseq2 == NULL) {
stage3array = Stage1_single_read(&npaths,&first_absmq,&second_absmq,
queryseq1,indexdb,indexdb2,indexdb_size_threshold,
- genomecomp,floors_array,user_maxlevel_float,subopt_levels,
+ genomecomp,floors_array,user_maxlevel_float,
indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
@@ -839,7 +839,7 @@ process_request (Request_T request, Floors_T *floors_array,
&stage3array5,&npaths5,&first_absmq5,&second_absmq5,
&stage3array3,&npaths3,&first_absmq3,&second_absmq3,
queryseq1,queryseq2,indexdb,indexdb2,indexdb_size_threshold,
- genomecomp,floors_array,user_maxlevel_float,subopt_levels,
+ genomecomp,floors_array,user_maxlevel_float,
indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
@@ -874,7 +874,7 @@ process_request (Request_T request, Floors_T *floors_array,
&stage3array5,&npaths5,&first_absmq5,&second_absmq5,
&stage3array3,&npaths3,&first_absmq3,&second_absmq3,
queryseq1,queryseq2,indexdb,indexdb2,indexdb_size_threshold,
- genomecomp,floors_array,user_maxlevel_float,subopt_levels,
+ genomecomp,floors_array,user_maxlevel_float,
indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
@@ -1762,9 +1762,9 @@ main (int argc, char *argv[]) {
} else if (!strcmp(long_name,"terminal-threshold")) {
terminal_threshold = atoi(check_valid_int(optarg));
- } else if (!strcmp(long_name,"terminal-output-minlength")) {
- terminal_output_minlength = atoi(check_valid_int(optarg));
- user_terminal_output_minlength_p = true;
+ } else if (!strcmp(long_name,"reject-trimlength")) {
+ reject_trimlength = atoi(check_valid_int(optarg));
+ user_reject_trimlength_p = true;
} else if (!strcmp(long_name,"antistranded-penalty")) {
antistranded_penalty = atoi(check_valid_int(optarg));
@@ -2150,37 +2150,16 @@ main (int argc, char *argv[]) {
fprintf(stderr,"Novel splicing (-N) and known splicing (-s) both turned on => assume reads are RNA-Seq\n");
pairmax = pairmax_rna;
shortsplicedist_known = shortsplicedist;
-#if 0
- if ((mode == CMET_STRANDED || mode == CMET_NONSTRANDED) && user_terminal_output_minlength_p == false) {
- /* terminal alignments don't work well with bisulfite reads */
- fprintf(stderr,"--terminal-output-minlength not specified, so setting to MAX_READLENGTH (%d) (meaning off) for RNA-Seq bisulfite reads\n",MAX_READLENGTH);
- terminal_output_minlength = MAX_READLENGTH;
- }
-#endif
} else if (knownsplicingp == true) {
fprintf(stderr,"Known splicing (-s) turned on => assume reads are RNA-Seq\n");
pairmax = pairmax_rna;
shortsplicedist_known = shortsplicedist;
-#if 0
- if ((mode == CMET_STRANDED || mode == CMET_NONSTRANDED) && user_terminal_output_minlength_p == false) {
- /* terminal alignments don't work well with bisulfite reads */
- fprintf(stderr,"--terminal-output-minlength not specified, so setting to MAX_READLENGTH (%d) (meaning off) for RNA-Seq bisulfite reads\n",MAX_READLENGTH);
- terminal_output_minlength = MAX_READLENGTH;
- }
-#endif
} else if (novelsplicingp == true) {
fprintf(stderr,"Novel splicing (-N) turned on => assume reads are RNA-Seq\n");
pairmax = pairmax_rna;
shortsplicedist_known = 0;
-#if 0
- if ((mode == CMET_STRANDED || mode == CMET_NONSTRANDED) && user_terminal_output_minlength_p == false) {
- /* terminal alignments don't work well with bisulfite reads */
- fprintf(stderr,"--terminal-output-minlength not specified, so setting to MAX_READLENGTH (%d) (meaning off) for RNA-Seq bisulfite reads\n",MAX_READLENGTH);
- terminal_output_minlength = MAX_READLENGTH;
- }
-#endif
} else {
/* Appears to be DNA-Seq */
@@ -2188,14 +2167,6 @@ main (int argc, char *argv[]) {
pairmax = pairmax_dna;
shortsplicedist = shortsplicedist_known = 0U;
shortsplicedist_novelend = 0U;
- /* terminal alignments still needed for GMAP alignments, so don't touch terminal_threshold, but do set output minlength */
-#if 0
- if (user_terminal_output_minlength_p == false) {
- fprintf(stderr,"--terminal-output-minlength not specified, so setting to MAX_READLENGTH (%d) (meaning off) for DNA-Seq reads\n",
- MAX_READLENGTH);
- terminal_output_minlength = MAX_READLENGTH;
- }
-#endif
}
if (shortsplicedist_novelend > shortsplicedist) {
@@ -2992,7 +2963,7 @@ main (int argc, char *argv[]) {
Stage2_setup(/*splicingp*/novelsplicingp == true || knownsplicingp == true,/*cross_species_p*/false,
suboptimal_score_start,suboptimal_score_end,
mode,/*snps_p*/snps_iit ? true : false);
- Pair_setup(trim_mismatch_score,trim_indel_score,sam_insert_0M_p,
+ Pair_setup(trim_mismatch_score,trim_indel_score,/*gff3_separators_p*/false,sam_insert_0M_p,
force_xs_direction_p,md_lowercase_variant_p,
/*snps_p*/snps_iit ? true : false,
Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/false),
@@ -3010,10 +2981,10 @@ main (int argc, char *argv[]) {
Splice_setup(min_shortend);
Indel_setup(min_indel_end_matches,indel_penalty_middle);
Stage1hr_setup(use_sarray_p,use_only_sarray_p,index1part,index1interval,spansize,chromosome_iit,nchromosomes,
- genomecomp_alt,mode,maxpaths_search,terminal_threshold,terminal_output_minlength,
+ genomecomp_alt,mode,maxpaths_search,terminal_threshold,reject_trimlength,
splicesites,splicetypes,splicedists,nsplicesites,
novelsplicingp,knownsplicingp,distances_observed_p,
- max_middle_insertions,max_middle_deletions,
+ subopt_levels,max_middle_insertions,max_middle_deletions,
shortsplicedist,shortsplicedist_known,shortsplicedist_novelend,min_intronlength,
min_distantsplicing_end_matches,min_distantsplicing_identity,
nullgap,maxpeelback,maxpeelback_distalmedial,
@@ -3026,10 +2997,11 @@ main (int argc, char *argv[]) {
splicing_iit,splicing_divint_crosstable,
donor_typeint,acceptor_typeint,trim_mismatch_score,
novelsplicingp,knownsplicingp,output_sam_p,mode,
- Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/false));
+ Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/false),
+ reject_trimlength);
Stage3hr_setup(invert_first_p,invert_second_p,genes_iit,genes_divint_crosstable,
tally_iit,tally_divint_crosstable,runlength_iit,runlength_divint_crosstable,
- terminal_output_minlength,distances_observed_p,pairmax,
+ reject_trimlength,distances_observed_p,pairmax,
expected_pairlength,pairlength_deviation,
localsplicing_penalty,indel_penalty_middle,antistranded_penalty,
favor_multiexon_p,gmap_min_nconsecutive,index1part,index1interval,novelsplicingp,
@@ -3237,7 +3209,10 @@ Usage: gsnap [OPTIONS...] <FASTA file>, or\n\
/* Input options */
fprintf(stdout,"Input options (must include -d)\n");
fprintf(stdout,"\
- -D, --dir=directory Genome directory\n\
+ -D, --dir=directory Genome directory. Default (as specified by --with-gmapdb to the configure program) is\n\
+ %s\n\
+",GMAPDB);
+ fprintf(stdout,"\
-d, --db=STRING Genome database\n\
--use-sarray=INT Whether to use a suffix array, which will give increased speed.\n\
Allowed values: 0 (no), 1 (yes, plus GSNAP/GMAP algorithm, default),\n\
@@ -3253,10 +3228,16 @@ Usage: gsnap [OPTIONS...] <FASTA file>, or\n\
-q, --part=INT/INT Process only the i-th out of every n sequences\n\
e.g., 0/100 or 99/100 (useful for distributing jobs\n\
to a computer farm).\n\
+");
+ fprintf(stdout,"\
--input-buffer-size=INT Size of input buffer (program reads this many sequences\n\
- at a time for efficiency) (default 1000)\n\
+ at a time for efficiency) (default %d)\n\
+",inbuffer_nspaces);
+ fprintf(stdout,"\
--barcode-length=INT Amount of barcode to remove from start of read\n\
- (default 0)\n\
+ (default %d)\n\
+",barcode_length);
+ fprintf(stdout,"\
-o, --orientation=STRING Orientation of paired-end reads\n\
Allowed values: FR (fwd-rev, or typical Illumina; default),\n\
RF (rev-fwd, for circularized inserts), or FF (fwd-fwd, same strand)\n\
@@ -3348,27 +3329,30 @@ is still designed to be fast.\n\
(0=no (default), 1=yes)\n\
--genome-unk-mismatch=INT Whether to count unknown (N) characters in the genome as a mismatch\n\
(0=no, 1=yes (default))\n\
- --maxsearch=INT Maximum number of alignments to find (default 1000).\n\
+");
+ fprintf(stdout,"\
+ --maxsearch=INT Maximum number of alignments to find (default %d).\n\
Must be larger than --npaths, which is the number to report.\n\
Keeping this number large will allow for random selection among multiple alignments.\n\
Reducing this number can speed up the program.\n\
-");
+",maxpaths_search);
#if 0
fprintf(stdout,"\
- -i, --indel-penalty-middle=INT Penalty for an indel in middle of read (default 1).\n\
+ -i, --indel-penalty-middle=INT Penalty for an indel in middle of read (default %d).\n\
Counts against mismatches allowed. To find indels, make\n\
indel-penalty less than or equal to max-mismatches\n\
- -I, --indel-penalty-end=INT Penalty for an indel at end of read (default 2).\n\
+",indel_penalty_middle);
+ fprintf(stdout,"\
+ -I, --indel-penalty-end=INT Penalty for an indel at end of read (default %d).\n\
Counts against mismatches allowed. To find indels, make\n\
indel-penalty less than or equal to max-mismatches\n\
-");
-#else
+",indel_penalty_end);
+#endif
+
fprintf(stdout,"\
--terminal-threshold=INT Threshold for computing a terminal alignment (from one end of the\n\
- read to the best possible position at the other end) (default 2\n\
- for standard, atoi-stranded, and atoi-nonstranded mode;\n\
- default 1000 for cmet-stranded and cmet-nonstranded mode).\n\
+ read to the best possible position at the other end) (default %d)\n\
For example, if this value is 2, then if GSNAP finds an exact or\n\
1-mismatch alignment, it will not try to find a terminal alignment.\n\
To turn off the computation of terminal alignments, set this to a\n\
@@ -3377,18 +3361,19 @@ is still designed to be fast.\n\
find some alignments. Therefore, to avoid getting terminal alignments\n\
in the output, you should generally set --terminal-output-minlength\n\
instead of this parameter.\n\
- --terminal-output-minlength=INT\n\
- Threshold alignment length in bp for a terminal alignment result to be printed\n\
- (in bp) (default 25 for RNA-Seq standard, atoi-stranded, and atoi-nonstranded modes;\n\
- default MAX_READLENGTH for other RNA-Seq modes and for DNA-Seq in all modes).\n\
- Setting this parameter to a value of MAX_READLENGTH or more will prevent\n\
- all terminal alignments from being printed.\n\
- -i, --indel-penalty=INT Penalty for an indel (default 2).\n\
+",terminal_threshold);
+ fprintf(stdout,"\
+ --reject-trimlength=INT\n\
+ Do not print alignments where amount trimmed on both ends totals more than\n\
+ this amount (default %d). Note that ambiguous splicing does not count\n\
+ as a trim.\n\
+",reject_trimlength);
+ fprintf(stdout,"\
+ -i, --indel-penalty=INT Penalty for an indel (default %d).\n\
Counts against mismatches allowed. To find indels, make\n\
indel-penalty less than or equal to max-mismatches.\n\
A value < 2 can lead to false positives at read ends\n\
-");
-#endif
+",indel_penalty_middle);
#if 0
/* No longer used */
@@ -3406,22 +3391,40 @@ is still designed to be fast.\n\
#endif
fprintf(stdout,"\
- --indel-endlength=INT Minimum length at end required for indel alignments (default 4)\n\
- -y, --max-middle-insertions=INT Maximum number of middle insertions allowed (default 9)\n\
- -z, --max-middle-deletions=INT Maximum number of middle deletions allowed (default 30)\n\
- -Y, --max-end-insertions=INT Maximum number of end insertions allowed (default 3)\n\
- -Z, --max-end-deletions=INT Maximum number of end deletions allowed (default 6)\n\
- -M, --suboptimal-levels=INT Report suboptimal hits beyond best hit (default 0)\n\
+ --indel-endlength=INT Minimum length at end required for indel alignments (default %d)\n\
+",min_indel_end_matches);
+ fprintf(stdout,"\
+ -y, --max-middle-insertions=INT Maximum number of middle insertions allowed (default %d)\n\
+",max_middle_insertions);
+ fprintf(stdout,"\
+ -z, --max-middle-deletions=INT Maximum number of middle deletions allowed (default %d)\n\
+",max_middle_deletions);
+ fprintf(stdout,"\
+ -Y, --max-end-insertions=INT Maximum number of end insertions allowed (default %d)\n\
+",max_end_insertions);
+ fprintf(stdout,"\
+ -Z, --max-end-deletions=INT Maximum number of end deletions allowed (default %d)\n\
+",max_end_deletions);
+ fprintf(stdout,"\
+ -M, --suboptimal-levels=INT Report suboptimal hits beyond best hit (default %d)\n\
All hits with best score plus suboptimal-levels are reported\n\
+",subopt_levels);
+ fprintf(stdout,"\
-a, --adapter-strip=STRING Method for removing adapters from reads. Currently allowed values: off, paired.\n\
Default is \"off\". To turn on, specify \"paired\", which removes adapters\n\
from paired-end reads if they appear to be present.\n\
- --trim-mismatch-score=INT Score to use for mismatches when trimming at ends (default is -3;\n\
+");
+ fprintf(stdout,"\
+ --trim-mismatch-score=INT Score to use for mismatches when trimming at ends (default is %d;\n\
to turn off trimming, specify 0). Warning: turning trimming off\n\
will give false positive mismatches at the ends of reads\n\
- --trim-indel-score=INT Score to use for indels when trimming at ends (default is -4;\n\
+",trim_mismatch_score);
+ fprintf(stdout,"\
+ --trim-indel-score=INT Score to use for indels when trimming at ends (default is %d;\n\
to turn off trimming, specify 0). Warning: turning trimming off\n\
will give false positive indels at the ends of reads\n\
+",trim_indel_score);
+ fprintf(stdout,"\
-V, --snpsdir=STRING Directory for SNPs index files (created using snpindex) (default is\n\
location of genome index files specified using -D and -d)\n \
-v, --use-snps=STRING Use database containing known SNPs (in <STRING>.iit, built\n\
@@ -3468,19 +3471,33 @@ is still designed to be fast.\n\
Allowed values: none, all, pairsearch, indel_knownsplice, terminal, improve\n\
(or multiple values, separated by commas).\n\
Default: all, i.e., pairsearch,indel_knownsplice,terminal,improve\n\
+");
+ fprintf(stdout,"\
--trigger-score-for-gmap=INT Try GMAP pairsearch on nearby genomic regions if best score (the total\n\
- of both ends if paired-end) exceeds this value (default 5)\n\
- --gmap-min-match-length=INT Keep GMAP hit only if it has this many consecutive matches (default 20)\n\
- --gmap-allowance=INT Extra mismatch/indel score allowed for GMAP alignments (default 3)\n\
+ of both ends if paired-end) exceeds this value (default %d)\n\
+",trigger_score_for_gmap);
+ fprintf(stdout,"\
+ --gmap-min-match-length=INT Keep GMAP hit only if it has this many consecutive matches (default %d)\n\
+",gmap_min_nconsecutive);
+ fprintf(stdout,"\
+ --gmap-allowance=INT Extra mismatch/indel score allowed for GMAP alignments (default %d)\n\
+",gmap_allowance);
+ fprintf(stdout,"\
--max-gmap-pairsearch=INT Perform GMAP pairsearch on nearby genomic regions up to this many\n\
- many candidate ends (default 10). Requires pairsearch in --gmap-mode\n\
+ many candidate ends (default %d). Requires pairsearch in --gmap-mode\n\
+",max_gmap_pairsearch);
+ fprintf(stdout,"\
--max-gmap-terminal=INT Perform GMAP terminal on nearby genomic regions up to this many\n\
- candidate ends (default 5). Requires terminal in --gmap-mode\n\
+ candidate ends (default %d). Requires terminal in --gmap-mode\n\
+",max_gmap_terminal);
+ fprintf(stdout,"\
--max-gmap-improvement=INT Perform GMAP improvement on nearby genomic regions up to this many\n\
- candidate ends (default 5). Requires improve in --gmap-mode\n\
+ candidate ends (default %d). Requires improve in --gmap-mode\n\
+",max_gmap_improvement);
+ fprintf(stdout,"\
--microexon-spliceprob=FLOAT Allow microexons only if one of the splice site probabilities is\n\
- greater than this value (default 0.90)\n\
-");
+ greater than this value (default %.2f)\n\
+",microexon_spliceprob);
fprintf(stdout,"\n");
@@ -3514,20 +3531,36 @@ is still designed to be fast.\n\
splice site, but extend instead into the intron. This flag makes\n\
sense only if you provide the --use-splicing flag, and you are trying\n\
to eliminate all soft clipping with --trim-mismatch-score=0\n\
- -w, --localsplicedist=INT Definition of local novel splicing event (default 200000)\n\
- --novelend-splicedist=INT Distance to look for novel splices at the ends of reads (default 50000)\n\
- -e, --local-splice-penalty=INT Penalty for a local splice (default 0). Counts against mismatches allowed\n\
- -E, --distant-splice-penalty=INT Penalty for a distant splice (default 1). A distant splice is one where\n\
+");
+ fprintf(stdout,"\
+ -w, --localsplicedist=INT Definition of local novel splicing event (default %d)\n\
+",shortsplicedist);
+ fprintf(stdout,"\
+ --novelend-splicedist=INT Distance to look for novel splices at the ends of reads (default %d)\n\
+",shortsplicedist_novelend);
+ fprintf(stdout,"\
+ -e, --local-splice-penalty=INT Penalty for a local splice (default %d). Counts against mismatches allowed\n\
+",localsplicing_penalty);
+ fprintf(stdout,"\
+ -E, --distant-splice-penalty=INT Penalty for a distant splice (default %d). A distant splice is one where\n\
the intron length exceeds the value of -w, or --localsplicedist, or is an\n\
inversion, scramble, or translocation between two different chromosomes\n\
Counts against mismatches allowed\n\
- -K, --distant-splice-endlength=INT Minimum length at end required for distant spliced alignments (default 20, min\n\
+",distantsplicing_penalty);
+ fprintf(stdout,"\
+ -K, --distant-splice-endlength=INT Minimum length at end required for distant spliced alignments (default %d, min\n\
allowed is the value of -k, or kmer size)\n\
- -l, --shortend-splice-endlength=INT Minimum length at end required for short-end spliced alignments (default 2,\n\
+",min_distantsplicing_end_matches);
+ fprintf(stdout,"\
+ -l, --shortend-splice-endlength=INT Minimum length at end required for short-end spliced alignments (default %d,\n\
but unless known splice sites are provided with the -s flag, GSNAP may still\n\
need the end length to be the value of -k, or kmer size to find a given splice\n\
- --distant-splice-identity=FLOAT Minimum identity at end required for distant spliced alignments (default 0.95)\n\
- --antistranded-penalty=INT (Not currently implemented)\n\
+",min_shortend);
+ fprintf(stdout,"\
+ --distant-splice-identity=FLOAT Minimum identity at end required for distant spliced alignments (default %.2f)\n\
+",min_distantsplicing_identity);
+ fprintf(stdout,"\
+ --antistranded-penalty=INT (Not currently implemented, since it leads to poor results)\n\
Penalty for antistranded splicing when using stranded RNA-Seq protocols.\n\
A positive value, such as 1, expects antisense on the first read\n\
and sense on the second read. Default is 0, which treats sense and antisense\n\
@@ -3543,15 +3576,21 @@ is still designed to be fast.\n\
fprintf(stdout,"Options for paired-end reads\n");
fprintf(stdout,"\
--pairmax-dna=INT Max total genomic length for DNA-Seq paired reads, or other reads\n\
- without splicing (default 1000). Used if -N or -s is not specified.\n\
+ without splicing (default %d). Used if -N or -s is not specified.\n\
+",pairmax_dna);
+ fprintf(stdout,"\
--pairmax-rna=INT Max total genomic length for RNA-Seq paired reads, or other reads\n\
- that could have a splice (default 200000). Used if -N or -s is specified.\n\
+ that could have a splice (default %d). Used if -N or -s is specified.\n\
Should probably match the value for -w, --localsplicedist.\n\
- --pairexpect=INT Expected paired-end length, used for calling splices in medial part of\n\
- paired-end reads (default 200)\n\
- --pairdev=INT Allowable deviation from expected paired-end length, used for\n\
- calling splices in medial part of paired-end reads (default 100)\n\
-");
+",pairmax_rna);
+ fprintf(stdout,"\
+ --pairexpect=INT Expected paired-end length, previously used for calling splices in medial part\n\
+ of paired-end reads (default %d). Currently not used.\n\
+",expected_pairlength);
+ fprintf(stdout,"\
+ --pairdev=INT Allowable deviation from expected paired-end length, previously used for\n\
+ calling splices in medial part of paired-end reads (default %d). Currently not used.\n\
+",pairlength_deviation);
fprintf(stdout,"\n");
@@ -3575,7 +3614,9 @@ is still designed to be fast.\n\
/* Output options */
fprintf(stdout,"Output options\n");
fprintf(stdout,"\
- -n, --npaths=INT Maximum number of paths to print (default 100).\n\
+ -n, --npaths=INT Maximum number of paths to print (default %d).\n\
+",maxpaths_report);
+ fprintf(stdout,"\
-Q, --quiet-if-excessive If more than maximum number of paths are found,\n\
then nothing is printed.\n\
-O, --ordered Print output in same order as input (relevant\n\
@@ -3617,10 +3658,12 @@ is still designed to be fast.\n\
to the existing files. Otherwise, the default is to create new files.\n\
--order-among-best=STRING Among alignments tied with the best score, order those alignments in this order.\n\
Allowed values: genomic, random (default)\n\
- --output-buffer-size=INT Buffer size, in queries, for output thread (default 1000). When the number\n\
+");
+ fprintf(stdout,"\
+ --output-buffer-size=INT Buffer size, in queries, for output thread (default %d). When the number\n\
of results to be printed exceeds this size, the worker threads are halted\n\
until the backlog is cleared\n\
-");
+",output_buffer_size);
fprintf(stdout,"\n");
/* SAM options */
diff --git a/src/iit-read.h b/src/iit-read.h
index da00fe7..e2a622b 100644
--- a/src/iit-read.h
+++ b/src/iit-read.h
@@ -1,6 +1,11 @@
-/* $Id: iit-read.h 132144 2014-04-02 16:02:28Z twu $ */
+/* $Id: iit-read.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef IIT_READ_INCLUDED
#define IIT_READ_INCLUDED
+
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_64_BIT */
+#endif
+
#include <stdio.h>
#include "bool.h"
#include "uintlist.h"
diff --git a/src/indexdb-write.h b/src/indexdb-write.h
index 13ad3e8..a031b3c 100644
--- a/src/indexdb-write.h
+++ b/src/indexdb-write.h
@@ -1,6 +1,9 @@
-/* $Id: indexdb-write.h 132144 2014-04-02 16:02:28Z twu $ */
+/* $Id: indexdb-write.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef INDEXDB_WRITE_INCLUDED
#define INDEXDB_WRITE_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_64_BIT */
+#endif
#include "bool.h"
#include "types.h" /* For Oligospace_T */
diff --git a/src/indexdb.h b/src/indexdb.h
index ffc0584..9ee1f6b 100644
--- a/src/indexdb.h
+++ b/src/indexdb.h
@@ -1,6 +1,10 @@
-/* $Id: indexdb.h 132144 2014-04-02 16:02:28Z twu $ */
+/* $Id: indexdb.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef INDEXDB_INCLUDED
#define INDEXDB_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_64_BIT */
+#endif
+
#include <stdio.h>
#include "access.h"
#include "types.h"
diff --git a/src/indexdbdef.h b/src/indexdbdef.h
index e922b0f..cef7911 100644
--- a/src/indexdbdef.h
+++ b/src/indexdbdef.h
@@ -1,6 +1,9 @@
-/* $Id: indexdbdef.h 132144 2014-04-02 16:02:28Z twu $ */
+/* $Id: indexdbdef.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef INDEXDBDEF_INCLUDED
#define INDEXDBDEF_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_PTHREAD */
+#endif
#include "genomicpos.h"
#include "access.h"
diff --git a/src/oligoindex_hr.c b/src/oligoindex_hr.c
index 414420a..d5e452f 100644
--- a/src/oligoindex_hr.c
+++ b/src/oligoindex_hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: oligoindex_hr.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: oligoindex_hr.c 156817 2015-01-15 21:55:11Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -18717,6 +18717,7 @@ Oligoindex_get_mappings (List_T diagonals, bool *coveredp, Chrpos_T **mappings,
good_genomicdiags = List_pop(good_genomicdiags,&item);
ptr = (Genomicdiag_T) item;
+ /* Unclear what relationship should be between ptr->i and querylength */
if (ptr->i >= querylength) {
#ifdef USE_DIAGPOOL
diagonals = Diagpool_push(diagonals,diagpool,/*diagonal*/(ptr->i - querylength),
@@ -18727,8 +18728,18 @@ Oligoindex_get_mappings (List_T diagonals, bool *coveredp, Chrpos_T **mappings,
ptr->best_consecutive_start,ptr->best_consecutive_end,
ptr->best_nconsecutive+1));
#endif
+ } else {
+ /* But eliminating this branch misses good alignments */
+#ifdef USE_DIAGPOOL
+ diagonals = Diagpool_push(diagonals,diagpool,/*diagonal*/(querylength - ptr->i),
+ ptr->best_consecutive_start,ptr->best_consecutive_end,
+ ptr->best_nconsecutive+1);
+#else
+ diagonals = List_push(diagonals,(void *) Diag_new(/*diagonal*/(querylength - ptr->i),
+ ptr->best_consecutive_start,ptr->best_consecutive_end,
+ ptr->best_nconsecutive+1));
+#endif
}
-
}
if (querylength + genomiclength > array->max_querylength + array->max_genomiclength) {
diff --git a/src/oligoindex_hr.h b/src/oligoindex_hr.h
index 4cc9184..b7f22f1 100644
--- a/src/oligoindex_hr.h
+++ b/src/oligoindex_hr.h
@@ -1,11 +1,7 @@
-/* $Id: oligoindex_hr.h 132475 2014-04-06 04:14:11Z twu $ */
+/* $Id: oligoindex_hr.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef OLIGOINDEX_HR_INCLUDED
#define OLIGOINDEX_HR_INCLUDED
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
#include "bool.h"
#include "types.h"
#include "mode.h"
@@ -13,9 +9,6 @@
#include "list.h"
#include "diagpool.h"
-#ifdef HAVE_SSE2
-#include <emmintrin.h>
-#endif
#define OVERABUNDANCE_CHECK 50
#define OVERABUNDANCE_PCT 0.97
diff --git a/src/outbuffer.c b/src/outbuffer.c
index 4c6dc81..8116462 100644
--- a/src/outbuffer.c
+++ b/src/outbuffer.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: outbuffer.c 154088 2014-11-25 21:02:46Z twu $";
+static char rcsid[] = "$Id: outbuffer.c 157721 2015-01-29 22:22:28Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1654,7 +1654,7 @@ print_result_sam (T this, Result_T result, Request_T request) {
SAM_print_nomapping(this->fp_nomapping,ABBREV_NOMAPPING_1,
queryseq1,/*mate*/NULL,/*acc1*/Shortread_accession(queryseq1),
/*acc2*/NULL,this->chromosome_iit,resulttype,
- /*first_read_p*/true,/*nhits_mate*/0,/*mate_chrpos*/0U,
+ /*first_read_p*/true,/*npaths*/0,/*npaths_mate*/0,/*mate_chrpos*/0U,
this->quality_shift,this->sam_read_group_id,this->invert_first_p,this->invert_second_p);
if (this->failedinput_root != NULL) {
if (this->fastq_format_p == true) {
@@ -1707,7 +1707,7 @@ print_result_sam (T this, Result_T result, Request_T request) {
SAM_print_nomapping(this->fp_unpaired_transloc,ABBREV_UNPAIRED_TRANSLOC,
queryseq1,/*mate*/NULL,/*acc1*/Shortread_accession(queryseq1),
/*acc2*/NULL,this->chromosome_iit,resulttype,
- /*first_read_p*/true,/*nhits_mate*/0,/*mate_chrpos*/0U,
+ /*first_read_p*/true,npaths,/*npaths_mate*/0,/*mate_chrpos*/0U,
this->quality_shift,this->sam_read_group_id,this->invert_first_p,this->invert_second_p);
} else {
@@ -1744,7 +1744,7 @@ print_result_sam (T this, Result_T result, Request_T request) {
SAM_print_nomapping(this->fp_unpaired_mult_xs_1,ABBREV_UNPAIRED_MULT_XS,
queryseq1,/*mate*/NULL,/*acc1*/Shortread_accession(queryseq1),
/*acc2*/NULL,this->chromosome_iit,resulttype,
- /*first_read_p*/true,/*nhits_mate*/0,/*mate_chrpos*/0U,
+ /*first_read_p*/true,npaths,/*npaths_mate*/0,/*mate_chrpos*/0U,
this->quality_shift,this->sam_read_group_id,this->invert_first_p,this->invert_second_p);
} else {
@@ -2587,11 +2587,11 @@ Outbuffer_thread_anyorder (void *data) {
Request_free(&request);
noutput++;
- if (this->head && this->nprocessed - noutput > output_buffer_size) {
- /* Clear out backlog */
#ifdef HAVE_PTHREAD
- pthread_mutex_lock(&this->lock);
+ pthread_mutex_lock(&this->lock);
#endif
+ if (this->head && this->nprocessed - noutput > output_buffer_size) {
+ /* Clear out backlog */
while (this->head && this->nprocessed - noutput > output_buffer_size) {
this->head = RRlist_pop(this->head,&request,&result);
debug1(RRlist_dump(this->head,this->tail));
@@ -2613,11 +2613,10 @@ Outbuffer_thread_anyorder (void *data) {
Request_free(&request);
noutput++;
}
-
+ }
#ifdef HAVE_PTHREAD
- pthread_mutex_unlock(&this->lock);
+ pthread_mutex_unlock(&this->lock);
#endif
- }
}
@@ -2728,12 +2727,11 @@ Outbuffer_thread_ordered (void *data) {
}
}
- if (this->head && this->nprocessed - nqueued - noutput > output_buffer_size) {
- /* Clear out backlog */
#ifdef HAVE_PTHREAD
- pthread_mutex_lock(&this->lock);
+ pthread_mutex_lock(&this->lock);
#endif
-
+ if (this->head && this->nprocessed - nqueued - noutput > output_buffer_size) {
+ /* Clear out backlog */
while (this->head && this->nprocessed - nqueued - noutput > output_buffer_size) {
this->head = RRlist_pop(this->head,&request,&result);
if ((id = Result_id(result)) != (int) noutput) {
@@ -2781,11 +2779,10 @@ Outbuffer_thread_ordered (void *data) {
}
}
}
-
+ }
#ifdef HAVE_PTHREAD
- pthread_mutex_unlock(&this->lock);
+ pthread_mutex_unlock(&this->lock);
#endif
- }
}
diff --git a/src/pair.c b/src/pair.c
index 5f26c5f..8120dc3 100644
--- a/src/pair.c
+++ b/src/pair.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: pair.c 154023 2014-11-25 03:45:18Z twu $";
+static char rcsid[] = "$Id: pair.c 158535 2015-02-12 21:33:37Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -124,6 +124,7 @@ static char rcsid[] = "$Id: pair.c 154023 2014-11-25 03:45:18Z twu $";
#define TRIM_MATCH_SCORE 1
static int trim_mismatch_score;
static int trim_indel_score;
+static bool gff3_separators_p;
static bool sam_insert_0M_p = false;
static bool force_xs_direction_p;
static bool md_lowercase_variant_p;
@@ -134,11 +135,12 @@ static Cigar_action_T cigar_action;
void
Pair_setup (int trim_mismatch_score_in, int trim_indel_score_in,
- bool sam_insert_0M_p_in, bool force_xs_direction_p_in,
+ bool gff3_separators_p_in, bool sam_insert_0M_p_in, bool force_xs_direction_p_in,
bool md_lowercase_variant_p_in, bool snps_p_in, Univcoord_T genomelength_in,
Cigar_action_T cigar_action_in) {
trim_mismatch_score = trim_mismatch_score_in;
trim_indel_score = trim_indel_score_in;
+ gff3_separators_p = gff3_separators_p_in;
sam_insert_0M_p = sam_insert_0M_p_in;
force_xs_direction_p = force_xs_direction_p_in;
md_lowercase_variant_p = md_lowercase_variant_p_in;
@@ -3042,7 +3044,6 @@ Pair_print_gff3 (FILE *fp, struct T *pairs, int npairs, int pathnum, char *acces
}
}
- fprintf(fp,"###\n"); /* Terminates gene format */
} else {
print_gff3_exons_forward(fp,pairs,npairs,pathnum,start,end,sourcename,accession,chrstring,
querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
@@ -3050,6 +3051,10 @@ Pair_print_gff3 (FILE *fp, struct T *pairs, int npairs, int pathnum, char *acces
gff_estmatch_format_p);
}
+ if (gff3_separators_p == true) {
+ fprintf(fp,"###\n"); /* Terminates alignment */
+ }
+
if (chrnum != 0) {
FREE(chrstring);
}
@@ -6984,22 +6989,35 @@ Pair_fracidentity (int *matches, int *unknowns, int *mismatches, int *qopens, in
if (cdna_direction > 0) {
if (this->comp == FWD_CANONICAL_INTRON_COMP) {
(*ncanonical)++;
+ in_intron = true;
} else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
(*nsemicanonical)++;
+ in_intron = true;
+ } else if (this->genomejump - this->queryjump < 50) {
+ (*topens)++;
+ (*tindels) += this->genomejump - this->queryjump;
+ /* in_intron = false */
} else if (this->comp == NONINTRON_COMP) {
(*nnoncanonical)++;
+ in_intron = true;
}
} else if (cdna_direction < 0) {
if (this->comp == REV_CANONICAL_INTRON_COMP) {
(*ncanonical)++;
+ in_intron = true;
} else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
(*nsemicanonical)++;
+ in_intron = true;
+ } else if (this->genomejump - this->queryjump < 50) {
+ (*topens)++;
+ (*tindels) += this->genomejump - this->queryjump;
+ /* in_intron = false */
} else if (this->comp == NONINTRON_COMP) {
(*nnoncanonical)++;
+ in_intron = true;
}
}
- in_intron = true;
}
} else {
if (in_intron) {
@@ -7041,6 +7059,103 @@ Pair_fracidentity (int *matches, int *unknowns, int *mismatches, int *qopens, in
}
+int
+Pair_fracidentity_array (int *matches, int *unknowns, int *mismatches, int *qopens, int *qindels,
+ int *topens, int *tindels, int *ncanonical, int *nsemicanonical, int *nnoncanonical,
+ double *min_splice_prob, struct T *ptr, int npairs, int cdna_direction) {
+ bool in_intron = false;
+ int i;
+ T this, prev = NULL;
+
+ *matches = *unknowns = *mismatches = *qopens = *qindels = *topens = *tindels =
+ *ncanonical = *nsemicanonical = *nnoncanonical = 0;
+ *min_splice_prob = 1.0;
+
+ for (i = 0; i < npairs; i++) {
+ this = ptr++;
+ if (this->gapp) {
+ if (this->donor_prob < *min_splice_prob) {
+ *min_splice_prob = this->donor_prob;
+ }
+ if (this->acceptor_prob < *min_splice_prob) {
+ *min_splice_prob = this->acceptor_prob;
+ }
+ if (!in_intron) {
+ if (cdna_direction > 0) {
+ if (this->comp == FWD_CANONICAL_INTRON_COMP) {
+ (*ncanonical)++;
+ in_intron = true;
+ } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
+ (*nsemicanonical)++;
+ in_intron = true;
+ } else if (this->genomejump - this->queryjump < 50) {
+ (*topens)++;
+ (*tindels) += this->genomejump - this->queryjump;
+ /* in_intron = false */
+ } else if (this->comp == NONINTRON_COMP) {
+ (*nnoncanonical)++;
+ in_intron = true;
+ }
+
+ } else if (cdna_direction < 0) {
+ if (this->comp == REV_CANONICAL_INTRON_COMP) {
+ (*ncanonical)++;
+ in_intron = true;
+ } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
+ (*nsemicanonical)++;
+ in_intron = true;
+ } else if (this->genomejump - this->queryjump < 50) {
+ (*topens)++;
+ (*tindels) += this->genomejump - this->queryjump;
+ /* in_intron = false */
+ } else if (this->comp == NONINTRON_COMP) {
+ (*nnoncanonical)++;
+ in_intron = true;
+ }
+ }
+ }
+ } else {
+ if (in_intron) {
+ in_intron = false;
+ }
+ if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
+ if (this->cdna == ' ') {
+ (*tindels)++; /* If genome has extra char, count it as a genome skip */
+ if (prev && prev->cdna != ' ') {
+ (*topens)++;
+ }
+ } else if (this->genome == ' ') {
+ (*qindels)++;
+ if (prev && prev->genome != ' ') {
+ (*qopens)++;
+ }
+ } else {
+ fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
+ this->comp,this->cdna,this->genome);
+ abort();
+ }
+#ifndef PMAP
+ } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
+ (*unknowns)++;
+#endif
+ } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
+ (*matches)++;
+ } else if (this->comp == MISMATCH_COMP) {
+ (*mismatches)++;
+ } else {
+ fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
+ abort();
+ }
+ }
+ prev = this;
+ }
+
+ return (*matches) + MISMATCH*(*mismatches)
+ + QOPEN*(*qopens) + QINDEL*(*qindels) + TOPEN*(*topens) + TINDEL*(*tindels)
+ - CANONICAL_POINTS*(*nnoncanonical);
+}
+
+
#if 0
/* Called on first and last exons during distal/medial calculation */
/* Procedure seems to give random results */
@@ -7357,7 +7472,7 @@ Pair_matchscores_list (int *nmatches, int *ntotal, int *length, List_T pairs) {
void
Pair_pathscores (bool *gapp, int *pathscores, struct T *ptr, int npairs,
- int cdna_direction, int querylength, cDNAEnd_T cdnaend) {
+ int cdna_direction, int querylength, cDNAEnd_T cdnaend, int pre_extension_slop) {
int querypos, querystart, queryend;
int basescore;
bool in_intron = false;
@@ -7371,14 +7486,26 @@ Pair_pathscores (bool *gapp, int *pathscores, struct T *ptr, int npairs,
queryend = this->querypos;
/* printf("Entered Pair_pathscores with querystart %d and queryend %d\n",querystart,queryend); */
- /* Disallow transitions outside of the alignments. Previously
- allowed slop, but (1) not necessary, and (2) gave missed
- merges. */
- for (querypos = 0; querypos < querystart; querypos++) {
- gapp[querypos] = true;
- }
- for (querypos = queryend + 1; querypos < querylength; querypos++) {
- gapp[querypos] = true;
+ /* Allow transitions slightly outside of the ends
+ (pre_extension_slop) when finding non-extended paths to pair, but
+ not when finding the breakpoint for the final pair, which has
+ been extended */
+ if (cdnaend == FIVE) {
+ /* left part of chimera */
+ for (querypos = 0; querypos < querystart; querypos++) {
+ gapp[querypos] = true;
+ }
+ for (querypos = queryend + pre_extension_slop; querypos < querylength; querypos++) {
+ gapp[querypos] = true;
+ }
+ } else {
+ /* right part of chimera */
+ for (querypos = 0; querypos < querystart - pre_extension_slop; querypos++) {
+ gapp[querypos] = true;
+ }
+ for (querypos = queryend; querypos < querylength; querypos++) {
+ gapp[querypos] = true;
+ }
}
/* Initialize to cover the ends that aren't aligned */
diff --git a/src/pair.h b/src/pair.h
index 1a3e93f..be36a93 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -1,4 +1,4 @@
-/* $Id: pair.h 154023 2014-11-25 03:45:18Z twu $ */
+/* $Id: pair.h 158535 2015-02-12 21:33:37Z twu $ */
#ifndef PAIR_INCLUDED
#define PAIR_INCLUDED
@@ -31,7 +31,7 @@ typedef enum {CIGAR_ACTION_IGNORE, CIGAR_ACTION_WARNING, CIGAR_ACTION_ABORT} Cig
extern void
Pair_setup (int trim_mismatch_score_in, int trim_indel_score_in,
- bool sam_insert_0M_p_in, bool force_xs_direction_p_in,
+ bool gff3_separators_p_in, bool sam_insert_0M_p_in, bool force_xs_direction_p_in,
bool md_lowercase_variant_p_in, bool snps_p_in, Univcoord_T genomelength_in,
Cigar_action_T cigar_action_in);
extern int
@@ -278,6 +278,10 @@ Pair_fracidentity (int *matches, int *unknowns, int *mismatches,
int *ncanonical, int *nsemicanonical, int *nnoncanonical,
double *min_splice_prob, List_T pairs, int cdna_direction);
extern int
+Pair_fracidentity_array (int *matches, int *unknowns, int *mismatches, int *qopens, int *qindels,
+ int *topens, int *tindels, int *ncanonical, int *nsemicanonical, int *nnoncanonical,
+ double *min_splice_prob, struct T *ptr, int npairs, int cdna_direction);
+extern int
Pair_fracidentity_score (List_T pairs, int cdna_direction);
extern double
@@ -294,7 +298,7 @@ Pair_matchscores (int *matchscores, struct T *ptr, int npairs, int querylength);
extern void
Pair_pathscores (bool *gapp, int *pathscores, struct T *ptr, int npairs,
- int cdna_direction, int querylength, cDNAEnd_T cdnaend);
+ int cdna_direction, int querylength, cDNAEnd_T cdnaend, int pre_extension_slop);
extern int
Pair_cdna_direction (List_T pairs);
diff --git a/src/popcount.c b/src/popcount.c
index 75e0ae0..780aa46 100644
--- a/src/popcount.c
+++ b/src/popcount.c
@@ -1,4 +1,8 @@
-static char rcsid[] = "$Id: popcount.c 116713 2013-11-27 19:46:01Z twu $";
+static char rcsid[] = "$Id: popcount.c 157234 2015-01-22 18:59:19Z twu $";
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
#ifndef HAVE_BUILTIN_CTZ
const int mod_37_bit_position[] =
diff --git a/src/popcount.h b/src/popcount.h
index fba82fd..88d2244 100644
--- a/src/popcount.h
+++ b/src/popcount.h
@@ -1,6 +1,9 @@
-/* $Id: popcount.h 116713 2013-11-27 19:46:01Z twu $ */
+/* $Id: popcount.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef POPCOUNT_INCLUDED
#define POPCOUNT_INCLUDED
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_BUILTIN_CTZ, HAVE_BUILTIN_POPCOUNT, HAVE_BUILTIN_CLZ */
+#endif
#ifndef HAVE_BUILTIN_CTZ
extern const int mod_37_bit_position[];
diff --git a/src/sam_sort.c b/src/sam_sort.c
index acd252d..97c9795 100644
--- a/src/sam_sort.c
+++ b/src/sam_sort.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: sam_sort.c 154089 2014-11-25 21:03:16Z twu $";
+static char rcsid[] = "$Id: sam_sort.c 154454 2014-12-02 19:30:27Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1207,6 +1207,9 @@ process_with_dups (FILE *fp_sam, int headerlen, Intlist_T linelengths, int ncell
}
+/* output 0 is stdout */
+#define N_SPLIT_OUTPUTS 22
+
static void
split_output_open (char *sevenway_root, bool appendp) {
char *filename;
@@ -1218,7 +1221,7 @@ split_output_open (char *sevenway_root, bool appendp) {
write_mode = "w";
}
- outputs = (FILE **) MALLOC(23 * sizeof(FILE *));
+ outputs = (FILE **) MALLOC((1+N_SPLIT_OUTPUTS) * sizeof(FILE *));
outputs[OUTPUT_NONE] = stdout;
filename = (char *) CALLOC(strlen(sevenway_root)+strlen(".nomapping")+1,sizeof(char));
@@ -1407,7 +1410,7 @@ static void
split_output_close ( ) {
int i;
- for (i = 1; i <= 23; i++) {
+ for (i = 1; i <= N_SPLIT_OUTPUTS; i++) {
fclose(outputs[i]);
}
return;
@@ -1643,9 +1646,19 @@ main (int argc, char *argv[]) {
if (ncells == 0) {
/* Exit without printing header */
+
+ } else if (sam_headers_p == false) {
+ /* Don't print SAM headers */
+
+ } else if (sevenway_root == NULL) {
+ /* Print SAM headers to stdout */
+ moveto(fp_sam,0);
+ SAM_header_change_HD_tosorted_stdout(fp_sam,headerlen);
+
} else {
+ /* Print SAM headers to each output */
moveto(fp_sam,0);
- SAM_header_change_HD_tosorted(fp_sam,headerlen);
+ SAM_header_change_HD_tosorted_split(fp_sam,headerlen,outputs,N_SPLIT_OUTPUTS);
}
linelengths = Intlist_reverse(linelengths);
diff --git a/src/samheader.c b/src/samheader.c
index 9736fe2..3e279ae 100644
--- a/src/samheader.c
+++ b/src/samheader.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: samheader.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: samheader.c 155503 2014-12-16 22:22:55Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -9,25 +9,28 @@ static char rcsid[] = "$Id: samheader.c 153955 2014-11-24 17:54:45Z twu $";
#define CHUNK 1024
void
-SAM_header_change_HD_tosorted (FILE *fp, int headerlen) {
+SAM_header_change_HD_tosorted_stdout (FILE *fp, int headerlen) {
char buffer[CHUNK], c, c0, c1, c2;
-
/* @HD */
while (headerlen > 0 && (c = fgetc(fp)) != '\t') {
putchar(c);
headerlen--;
}
- putchar('\t');
- headerlen--;
+ if (headerlen > 0) {
+ putchar('\t');
+ headerlen--;
+ }
/* VN */
while (headerlen > 0 && (c = fgetc(fp)) != '\t') {
putchar(c);
headerlen--;
}
- putchar('\t');
- headerlen--;
+ if (headerlen > 0) {
+ putchar('\t');
+ headerlen--;
+ }
if (headerlen > 3) {
/* SO: */
@@ -62,6 +65,79 @@ SAM_header_change_HD_tosorted (FILE *fp, int headerlen) {
+/* Copies the first headerlen bytes of the SAM header read from fp to each of
+   outputs[1..noutputs]; outputs[0] is not written.  The first two
+   tab-terminated fields are copied unchanged.  If the next three characters
+   are "SO:", the value that follows (up to the newline) is replaced by
+   "coordinate".  The remaining header bytes are copied verbatim.  Copying
+   stops early if fp reaches end-of-file before headerlen bytes were read. */
void
+SAM_header_change_HD_tosorted_split (FILE *fp, int headerlen, FILE **outputs, int noutputs) {
+  char buffer[CHUNK];
+  int c = '\0', c0, c1, c2;	/* int, so that EOF is distinguishable from data bytes */
+  size_t nwanted, nread;
+  int i;
+
+  /* @HD */
+  while (headerlen > 0 && (c = fgetc(fp)) != EOF && c != '\t') {
+    for (i = 1; i <= noutputs; i++) {
+      putc(c,outputs[i]);
+    }
+    headerlen--;
+  }
+  if (c == EOF) {
+    return;
+  }
+  if (headerlen > 0) {
+    /* Loop above stopped on the tab, which has been consumed but not yet written */
+    for (i = 1; i <= noutputs; i++) {
+      putc('\t',outputs[i]);
+    }
+    headerlen--;
+  }
+
+  /* VN */
+  while (headerlen > 0 && (c = fgetc(fp)) != EOF && c != '\t') {
+    for (i = 1; i <= noutputs; i++) {
+      putc(c,outputs[i]);
+    }
+    headerlen--;
+  }
+  if (c == EOF) {
+    return;
+  }
+  if (headerlen > 0) {
+    for (i = 1; i <= noutputs; i++) {
+      putc('\t',outputs[i]);
+    }
+    headerlen--;
+  }
+
+  if (headerlen > 3) {
+    /* SO: */
+    c0 = fgetc(fp);
+    c1 = fgetc(fp);
+    c2 = fgetc(fp);
+    if (c0 == EOF || c1 == EOF || c2 == EOF) {
+      return;
+    }
+    for (i = 1; i <= noutputs; i++) {
+      fprintf(outputs[i],"%c%c%c",c0,c1,c2);
+    }
+    headerlen -= 3;
+
+    if (c0 == 'S' && c1 == 'O' && c2 == ':') {
+      for (i = 1; i <= noutputs; i++) {
+        fprintf(outputs[i],"coordinate\n");
+      }
+      while (headerlen > 0 && (c = fgetc(fp)) != EOF && c != '\n') {
+        /* Skip given SO value */
+        headerlen--;
+      }
+      if (c == EOF) {
+        return;
+      }
+      if (headerlen > 0) {
+        /* Account for the newline consumed above; it was already written after "coordinate" */
+        headerlen--;
+      }
+    }
+  }
+
+  /* Rest of header: copy in pieces of at most CHUNK bytes, writing only what was actually read */
+  while (headerlen > 0) {
+    nwanted = (headerlen > CHUNK) ? (size_t) CHUNK : (size_t) headerlen;
+    if ((nread = fread(buffer,sizeof(char),nwanted,fp)) == 0) {
+      break;
+    }
+    for (i = 1; i <= noutputs; i++) {
+      fwrite(buffer,sizeof(char),nread,outputs[i]);
+    }
+    headerlen -= (int) nread;
+  }
+
+  return;
+}
+
+
+void
SAM_header_print_HD (FILE *fp, int nworkers, bool orderedp) {
fprintf(fp,"@HD");
diff --git a/src/samheader.h b/src/samheader.h
index 2e4aed1..b7cbd50 100644
--- a/src/samheader.h
+++ b/src/samheader.h
@@ -1,4 +1,4 @@
-/* $Id: samheader.h 149320 2014-09-30 02:16:01Z twu $ */
+/* $Id: samheader.h 154452 2014-12-02 19:28:04Z twu $ */
#ifndef SAMHEADER_INCLUDED
#define SAMHEADER_INCLUDED
@@ -6,7 +6,9 @@
#include "bool.h"
extern void
-SAM_header_change_HD_tosorted (FILE *fp, int headerlen);
+SAM_header_change_HD_tosorted_stdout (FILE *fp, int headerlen);
+extern void
+SAM_header_change_HD_tosorted_split (FILE *fp, int headerlen, FILE **outputs, int noutputs);
extern void
SAM_header_print_HD (FILE *fp, int nworkers, bool orderedp);
extern void
diff --git a/src/samprint.c b/src/samprint.c
index f88f885..2ff9a24 100644
--- a/src/samprint.c
+++ b/src/samprint.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: samprint.c 154023 2014-11-25 03:45:18Z twu $";
+static char rcsid[] = "$Id: samprint.c 154778 2014-12-06 03:32:33Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -17,6 +17,7 @@ static char rcsid[] = "$Id: samprint.c 154023 2014-11-25 03:45:18Z twu $";
#define SANGER_ILLUMINA_DIFF 31
+/* #define PRINT_AMBIG_COORDS 1 */
/* BAM appears to truncate the H information on the ends of a cigar */
/* Also, this provides the information needed for getting term information */
@@ -850,10 +851,11 @@ make_complement_buffered (char *complement, char *sequence, unsigned int length)
+/* npaths could be non-zero, if user selected --quiet-if-excessive */
void
SAM_print_nomapping (FILE *fp, char *abbrev, Shortread_T queryseq, Stage3end_T mate, char *acc1, char *acc2,
Univ_IIT_T chromosome_iit, Resulttype_T resulttype, bool first_read_p,
- int npaths_mate, Chrpos_T mate_chrpos, int quality_shift,
+ int npaths, int npaths_mate, Chrpos_T mate_chrpos, int quality_shift,
char *sam_read_group_id, bool invertp, bool invert_mate_p) {
unsigned int flag;
@@ -914,6 +916,11 @@ SAM_print_nomapping (FILE *fp, char *abbrev, Shortread_T queryseq, Stage3end_T m
fprintf(fp,"\tRG:Z:%s",sam_read_group_id);
}
+ /* 12. TAGS: NH */
+ if (npaths > 0) {
+ fprintf(fp,"\tNH:i:%d",npaths);
+ }
+
/* 12. TAGS: XB */
Shortread_print_barcode(fp,queryseq);
@@ -2938,6 +2945,11 @@ print_halfdonor (FILE *fp, char *abbrev, Substring_T donor, Stage3end_T this, St
bool plusp, printp;
bool start_ambig, end_ambig;
int amb_length_start, amb_length_end;
+ int n, i;
+ Univcoord_T *start_ambcoords, *end_ambcoords, splicecoord;
+#ifdef PRINT_AMBIG_COORDS
+ Univcoord_T chroffset;
+#endif
querylength = Shortread_fulllength(queryseq);
@@ -3337,31 +3349,83 @@ print_halfdonor (FILE *fp, char *abbrev, Substring_T donor, Stage3end_T this, St
/* 12. TAGS: XA */
if ((start_ambig = Stage3end_start_ambiguous_p(this)) == true ||
(end_ambig = Stage3end_end_ambiguous_p(this)) == true) {
-#if 1
- amb_length_start = Stage3end_amb_length_start(this);
- amb_length_end = Stage3end_amb_length_end(this);
+ fprintf(fp,"\tXA:Z:");
+
if (plusp == true) {
- fprintf(fp,"\tXA:Z:%d,%d",amb_length_start,amb_length_end);
- } else {
- fprintf(fp,"\tXA:Z:%d,%d",amb_length_end,amb_length_start);
- }
+ if ((n = Stage3end_start_nambcoords(this)) > 0) {
+ assert(sensep == false);
+ start_ambcoords = Stage3end_start_ambcoords(this);
+ splicecoord = Substring_alignstart(donor);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(donor);
+ fprintf(fp,"%u",start_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - chroffset + 1U);
+ }
#else
- if (start_ambig == true && end_ambig == true) {
- fprintf(fp,"\tXA:Z:T,T");
- } else if (plusp == true) {
- if (start_ambig == true) {
- fprintf(fp,"\tXA:Z:T,F");
- } else {
- fprintf(fp,"\tXA:Z:F,T");
+ splicecoord = Substring_alignstart(donor);
+ fprintf(fp,"%u",splicecoord - start_ambcoords[0]);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",splicecoord - start_ambcoords[i]);
+ }
+#endif
}
+ fprintf(fp,"|");
+ if ((n = Stage3end_end_nambcoords(this)) > 0) {
+ assert(sensep == true);
+ end_ambcoords = Stage3end_end_ambcoords(this);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(donor);
+ fprintf(fp,"%u",end_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignend(donor);
+ fprintf(fp,"%u",end_ambcoords[0] - splicecoord);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - splicecoord);
+ }
+#endif
+ }
+
} else {
- if (start_ambig == true) {
- fprintf(fp,"\tXA:Z:F,T");
- } else {
- fprintf(fp,"\tXA:Z:T,F");
+ if ((n = Stage3end_end_nambcoords(this)) > 0) {
+ assert(sensep == true);
+ end_ambcoords = Stage3end_end_ambcoords(this);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(donor);
+ fprintf(fp,"%u",end_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignend(donor);
+ fprintf(fp,"%u",splicecoord - end_ambcoords[0]);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",splicecoord - end_ambcoords[i]);
+ }
+#endif
}
- }
+ fprintf(fp,"|");
+ if ((n = Stage3end_start_nambcoords(this)) > 0) {
+ assert(sensep == false);
+ start_ambcoords = Stage3end_start_ambcoords(this);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(donor);
+ fprintf(fp,"%u",start_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignstart(donor);
+ fprintf(fp,"%u",start_ambcoords[0] - splicecoord);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - splicecoord);
+ }
#endif
+ }
+ }
}
/* 12. TAGS: XT */
@@ -3557,6 +3621,11 @@ print_halfacceptor (FILE *fp, char *abbrev, Substring_T acceptor, Stage3end_T th
bool plusp, printp;
bool start_ambig, end_ambig;
int amb_length_start, amb_length_end;
+ int n, i;
+ Univcoord_T *start_ambcoords, *end_ambcoords, splicecoord;
+#ifdef PRINT_AMBIG_COORDS
+ Univcoord_T chroffset;
+#endif
querylength = Shortread_fulllength(queryseq);
@@ -3953,31 +4022,82 @@ print_halfacceptor (FILE *fp, char *abbrev, Substring_T acceptor, Stage3end_T th
/* 12. TAGS: XA */
if ((start_ambig = Stage3end_start_ambiguous_p(this)) == true ||
(end_ambig = Stage3end_end_ambiguous_p(this)) == true) {
-#if 1
- amb_length_start = Stage3end_amb_length_start(this);
- amb_length_end = Stage3end_amb_length_end(this);
+ fprintf(fp,"\tXA:Z:");
+
if (plusp == true) {
- fprintf(fp,"\tXA:Z:%d,%d",amb_length_start,amb_length_end);
- } else {
- fprintf(fp,"\tXA:Z:%d,%d",amb_length_end,amb_length_start);
- }
+ if ((n = Stage3end_start_nambcoords(this)) > 0) {
+ assert(sensep == true);
+ start_ambcoords = Stage3end_start_ambcoords(this);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(acceptor);
+ fprintf(fp,"%u",start_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - chroffset + 1U);
+ }
#else
- if (start_ambig == true && end_ambig == true) {
- fprintf(fp,"\tXA:Z:T,T");
- } else if (plusp == true) {
- if (start_ambig == true) {
- fprintf(fp,"\tXA:Z:T,F");
- } else {
- fprintf(fp,"\tXA:Z:F,T");
+ splicecoord = Substring_alignstart(acceptor);
+ fprintf(fp,"%u",splicecoord - start_ambcoords[0]);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",splicecoord - start_ambcoords[i]);
+ }
+#endif
+ }
+ fprintf(fp,"|");
+ if ((n = Stage3end_end_nambcoords(this)) > 0) {
+ assert(sensep == false);
+ end_ambcoords = Stage3end_end_ambcoords(this);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(acceptor);
+ fprintf(fp,"%u",end_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignend(acceptor);
+ fprintf(fp,"%u",end_ambcoords[0] - splicecoord);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - splicecoord);
+ }
+#endif
}
+
} else {
- if (start_ambig == true) {
- fprintf(fp,"\tXA:Z:F,T");
- } else {
- fprintf(fp,"\tXA:Z:T,F");
+ if ((n = Stage3end_end_nambcoords(this)) > 0) {
+ assert(sensep == false);
+ end_ambcoords = Stage3end_end_ambcoords(this);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(acceptor);
+ fprintf(fp,"%u",end_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignend(acceptor);
+ fprintf(fp,"%u",splicecoord - end_ambcoords[0]);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",splicecoord - end_ambcoords[i]);
+ }
+#endif
}
- }
+ fprintf(fp,"|");
+ if ((n = Stage3end_start_nambcoords(this)) > 0) {
+ assert(sensep == true);
+ start_ambcoords = Stage3end_start_ambcoords(this);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(acceptor);
+ fprintf(fp,"%u",start_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignstart(acceptor);
+ fprintf(fp,"%u",start_ambcoords[0] - splicecoord);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - splicecoord);
+ }
#endif
+ }
+ }
}
/* 12. TAGS: XT */
@@ -4641,8 +4761,13 @@ print_shortexon (FILE *fp, char *abbrev, Stage3end_T shortexon, Stage3end_T mate
bool plusp, printp;
bool start_ambig, end_ambig;
int amb_length_start, amb_length_end;
+ int n, i;
+ Univcoord_T *start_ambcoords, *end_ambcoords, splicecoord;
+#ifdef PRINT_AMBIG_COORDS
+ Univcoord_T chroffset;
+#endif
-
+
querylength = Shortread_fulllength(queryseq);
plusp = Stage3end_plusp(shortexon);
@@ -5100,31 +5225,78 @@ print_shortexon (FILE *fp, char *abbrev, Stage3end_T shortexon, Stage3end_T mate
/* 12. TAGS: XA */
if ((start_ambig = Stage3end_start_ambiguous_p(shortexon)) == true ||
(end_ambig = Stage3end_end_ambiguous_p(shortexon)) == true) {
-#if 1
- amb_length_start = Stage3end_amb_length_start(shortexon);
- amb_length_end = Stage3end_amb_length_end(shortexon);
+ fprintf(fp,"\tXA:Z:");
+
if (plusp == true) {
- fprintf(fp,"\tXA:Z:%d,%d",amb_length_start,amb_length_end);
- } else {
- fprintf(fp,"\tXA:Z:%d,%d",amb_length_end,amb_length_start);
- }
+ if ((n = Stage3end_start_nambcoords(shortexon)) > 0) {
+ start_ambcoords = Stage3end_start_ambcoords(shortexon);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(substringM);
+ fprintf(fp,"%u",start_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - chroffset + 1U);
+ }
#else
- if (start_ambig == true && end_ambig == true) {
- fprintf(fp,"\tXA:Z:T,T");
- } else if (plusp == true) {
- if (start_ambig == true) {
- fprintf(fp,"\tXA:Z:T,F");
- } else {
- fprintf(fp,"\tXA:Z:F,T");
+ splicecoord = Substring_alignstart(substringM);
+ fprintf(fp,"%u",splicecoord - start_ambcoords[0]);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",splicecoord - start_ambcoords[i]);
+ }
+#endif
}
+ fprintf(fp,"|");
+ if ((n = Stage3end_end_nambcoords(shortexon)) > 0) {
+ end_ambcoords = Stage3end_end_ambcoords(shortexon);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(substringM);
+ fprintf(fp,"%u",end_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignend(substringM);
+ fprintf(fp,"%u",end_ambcoords[0] - splicecoord);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - splicecoord);
+ }
+#endif
+ }
+
} else {
- if (start_ambig == true) {
- fprintf(fp,"\tXA:Z:F,T");
- } else {
- fprintf(fp,"\tXA:Z:T,F");
+ if ((n = Stage3end_end_nambcoords(shortexon)) > 0) {
+ end_ambcoords = Stage3end_end_ambcoords(shortexon);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(substringM);
+ fprintf(fp,"%u",end_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",end_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignend(substringM);
+ fprintf(fp,"%u",splicecoord - end_ambcoords[0]);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",splicecoord - end_ambcoords[i]);
+ }
+#endif
}
- }
+ fprintf(fp,"|");
+ if ((n = Stage3end_start_nambcoords(shortexon)) > 0) {
+ start_ambcoords = Stage3end_start_ambcoords(shortexon);
+#ifdef PRINT_AMBIG_COORDS
+ chroffset = Substring_chroffset(substringM);
+ fprintf(fp,"%u",start_ambcoords[0] - chroffset + 1U);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - chroffset + 1U);
+ }
+#else
+ splicecoord = Substring_alignstart(substringM);
+ fprintf(fp,"%u",start_ambcoords[0] - splicecoord);
+ for (i = 1; i < n; i++) {
+ fprintf(fp,",%u",start_ambcoords[i] - splicecoord);
+ }
#endif
+ }
+ }
}
/* 12. TAGS: XC */
@@ -5567,9 +5739,12 @@ SAM_print (FILE *fp, char *abbrev, Stage3end_T this, Stage3end_T mate,
hittype = Stage3end_hittype(this);
/* printf("hittype %s, chrpos %u\n",Stage3end_hittype_string(this),chrpos); */
- if (npaths == 0) { /* was chrpos == 0, but we can actually align to chrpos 0 */
+
+ /* Test for nomapping was chrpos == 0, but we can actually align to chrpos 0 */
+ /* Also, can use this test here because --quiet-if-excessive cases go directly to SAM_print_nomapping */
+ if (npaths == 0) {
SAM_print_nomapping(fp,abbrev,queryseq,mate,acc1,acc2,chromosome_iit,resulttype,first_read_p,
- npaths_mate,mate_chrpos,quality_shift,
+ /*npaths*/0,npaths_mate,mate_chrpos,quality_shift,
sam_read_group_id,invertp,invert_mate_p);
if (failedinput_root != NULL) {
@@ -5879,6 +6054,7 @@ SAM_print (FILE *fp, char *abbrev, Stage3end_T this, Stage3end_T mate,
}
}
}
+
if (normalp == true) {
querylength = Shortread_fulllength(queryseq);
if ((circularpos = Stage3end_circularpos(this)) > 0 &&
@@ -6083,12 +6259,12 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
} else
SAM_print_nomapping(fp_nomapping,ABBREV_NOMAPPING_1,queryseq1,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/true,/*npaths_mate*/0,
+ /*first_read_p*/true,/*npaths*/0,/*npaths_mate*/0,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_first_p,invert_second_p);
SAM_print_nomapping(fp_nomapping,ABBREV_NOMAPPING_2,queryseq2,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/false,/*npaths_mate*/0,
+ /*first_read_p*/false,/*npaths*/0,/*npaths_mate*/0,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_second_p,invert_first_p);
if (failedinput_root != NULL) {
@@ -6232,13 +6408,13 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
SAM_print_nomapping(fp_concordant_transloc,ABBREV_CONCORDANT_TRANSLOC,
queryseq1,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/true,/*npaths_mate*/npaths,
+ /*first_read_p*/true,npaths,/*npaths_mate*/npaths,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_first_p,invert_second_p);
SAM_print_nomapping(fp_concordant_transloc,ABBREV_CONCORDANT_TRANSLOC,
queryseq2,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/false,/*npaths_mate*/npaths,
+ /*first_read_p*/false,npaths,/*npaths_mate*/npaths,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_second_p,invert_first_p);
}
@@ -6323,13 +6499,13 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
SAM_print_nomapping(fp_concordant_mult_xs_1,ABBREV_CONCORDANT_MULT_XS,
queryseq1,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/true,/*npaths_mate*/npaths,
+ /*first_read_p*/true,npaths,/*npaths_mate*/npaths,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_first_p,invert_second_p);
SAM_print_nomapping(fp_concordant_mult_xs_1,ABBREV_CONCORDANT_MULT_XS,
queryseq2,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/false,/*npaths_mate*/npaths,
+ /*first_read_p*/false,npaths,/*npaths_mate*/npaths,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_second_p,invert_first_p);
if (failedinput_root != NULL) {
@@ -6521,13 +6697,13 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
SAM_print_nomapping(fp_paired_mult_xs_1,ABBREV_PAIRED_MULT_XS,
queryseq1,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/true,/*npaths_mate*/npaths,
+ /*first_read_p*/true,npaths,/*npaths_mate*/npaths,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_first_p,invert_second_p);
SAM_print_nomapping(fp_paired_mult_xs_1,ABBREV_PAIRED_MULT_XS,
queryseq2,/*mate*/(Stage3end_T) NULL,
acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/false,/*npaths_mate*/npaths,
+ /*first_read_p*/false,npaths,/*npaths_mate*/npaths,
/*mate_chrpos*/0U,quality_shift,
sam_read_group_id,invert_second_p,invert_first_p);
@@ -6693,7 +6869,7 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
if (1 || failedinput_root != NULL) {
/* Just printing one end as nomapping */
SAM_print_nomapping(fp_xs,abbrev_xs,queryseq1,mate,acc1,acc2,chromosome_iit,
- resulttype,/*first_read_p*/true,/*npaths_mate*/npaths2,
+ resulttype,/*first_read_p*/true,npaths1,/*npaths_mate*/npaths2,
/*mate_chrpos*/chrpos3,
quality_shift,sam_read_group_id,invert_first_p,invert_second_p);
}
@@ -6749,7 +6925,7 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
if (1 || failedinput_root != NULL) {
/* Just printing one end as nomapping */
SAM_print_nomapping(fp_xs,abbrev_xs,queryseq2,mate,acc1,acc2,chromosome_iit,
- resulttype,/*first_read_p*/false,/*npaths_mate*/npaths1,
+ resulttype,/*first_read_p*/false,npaths2,/*npaths_mate*/npaths1,
/*mate_chrpos*/chrpos5,
quality_shift,sam_read_group_id,invert_second_p,invert_first_p);
}
@@ -6844,7 +7020,7 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
/* just printing one end as nomapping */
/* mate should be non-NULL here */
SAM_print_nomapping(fp,abbrev,queryseq1,mate,acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/true,/*npaths_mate*/npaths2,
+ /*first_read_p*/true,npaths1,/*npaths_mate*/npaths2,
/*mate_chrpos*/chrpos3,
quality_shift,sam_read_group_id,invert_first_p,invert_second_p);
}
@@ -6871,7 +7047,7 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
/* Just printing one end as nomapping */
/* mate should be NULL here */
SAM_print_nomapping(fp_xs,abbrev_xs,queryseq1,mate,acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/true,/*npaths_mate*/npaths2,
+ /*first_read_p*/true,npaths1,/*npaths_mate*/npaths2,
/*mate_chrpos*/chrpos3,
quality_shift,sam_read_group_id,invert_first_p,invert_second_p);
}
@@ -6914,7 +7090,7 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
/* Just printing one end as nomapping */
/* mate should be non-NULL here */
SAM_print_nomapping(fp,abbrev,queryseq2,mate,acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/false,/*npaths_mate*/npaths1,
+ /*first_read_p*/false,npaths2,/*npaths_mate*/npaths1,
/*mate_chrpos*/chrpos5,
quality_shift,sam_read_group_id,invert_second_p,invert_first_p);
}
@@ -6941,7 +7117,7 @@ SAM_print_paired (Result_T result, Resulttype_T resulttype,
/* Just printing one end as nomapping */
/* mate should be NULL here */
SAM_print_nomapping(fp_xs,abbrev_xs,queryseq2,mate,acc1,acc2,chromosome_iit,resulttype,
- /*first_read_p*/false,/*npaths_mate*/npaths1,
+ /*first_read_p*/false,npaths2,/*npaths_mate*/npaths1,
/*mate_chrpos*/chrpos5,
quality_shift,sam_read_group_id,invert_second_p,invert_first_p);
}
diff --git a/src/samprint.h b/src/samprint.h
index d9cb728..d25ea42 100644
--- a/src/samprint.h
+++ b/src/samprint.h
@@ -1,11 +1,7 @@
-/* $Id: samprint.h 154023 2014-11-25 03:45:18Z twu $ */
+/* $Id: samprint.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef SAMPRINT_INCLUDED
#define SAMPRINT_INCLUDED
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
#include <stdio.h>
#include "stage3hr.h"
#include "iit-read-univ.h"
@@ -60,7 +56,7 @@ SAM_compute_flag (bool plusp, Stage3end_T mate, Resulttype_T resulttype,
extern void
SAM_print_nomapping (FILE *fp, char *abbrev, Shortread_T queryseq, Stage3end_T mate, char *acc1, char *acc2,
Univ_IIT_T chromosome_iit, Resulttype_T resulttype, bool first_read_p,
- int npaths_mate, Chrpos_T mate_chrpos,
+ int npaths, int npaths_mate, Chrpos_T mate_chrpos,
int quality_shift, char *sam_read_group_id, bool invertp, bool invert_mate_p);
extern void
diff --git a/src/samread.c b/src/samread.c
index bcd6f07..440a380 100644
--- a/src/samread.c
+++ b/src/samread.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: samread.c 154089 2014-11-25 21:03:16Z twu $";
+static char rcsid[] = "$Id: samread.c 154570 2014-12-03 22:06:33Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -160,11 +160,69 @@ Samread_get_acc_fromfile (int *acclength, FILE *fp, int linelength) {
}
+/* Called just after we read in '\t', so should start at a field */
static SAM_split_output_type
parse_XO_fromfile (FILE *fp) {
char c = 1, c0, c1;
char abbrev0, abbrev1;
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == 'X' && c1 == 'O') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ abbrev0 = fgetc(fp);
+ abbrev1 = fgetc(fp);
+ switch (abbrev0) {
+ case 'N':
+ if (abbrev1 == 'M') {
+ return OUTPUT_NM;
+ } else {
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ return OUTPUT_NONE;
+ }
+ case 'C':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_CU;
+ case 'C': return OUTPUT_CC;
+ case 'T': return OUTPUT_CT;
+ case 'M': return OUTPUT_CM;
+ case 'X': return OUTPUT_CX;
+ default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ case 'H':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_HU;
+ case 'C': return OUTPUT_HC;
+ case 'T': return OUTPUT_HT;
+ case 'M': return OUTPUT_HM;
+ case 'X': return OUTPUT_HX;
+ default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ case 'U':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_UU;
+ case 'C': return OUTPUT_UC;
+ case 'T': return OUTPUT_UT;
+ case 'M': return OUTPUT_UM;
+ case 'X': return OUTPUT_UX;
+ default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ case 'P':
+ switch (abbrev1) {
+ case 'C': return OUTPUT_PC;
+ case 'I': return OUTPUT_PI;
+ case 'S': return OUTPUT_PS;
+ case 'L': return OUTPUT_PL;
+ case 'M': return OUTPUT_PM;
+ case 'X': return OUTPUT_PX;
+ default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ }
+
while (c != '\0') {
while ((c = fgetc(fp)) != '\0' && c != '\t') ;
if (c == '\0') {
@@ -235,6 +293,7 @@ parse_XO_fromfile (FILE *fp) {
#define HITI_MAXDIGITS 10
+/* Called just after we read in '\t', so should start at a field */
static SAM_split_output_type
parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
SAM_split_output_type split_output = OUTPUT_NONE;
@@ -242,6 +301,83 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
char abbrev0, abbrev1;
*hiti = MALLOC((HITI_MAXDIGITS + 1) * sizeof(char));
+
+  /* Examine the first field directly: the scanning loop that follows skips
+     ahead to the next tab before it looks at a field name. */
+  c0 = fgetc(fp);
+  c1 = fgetc(fp);
+  if (c0 == 'H' && c1 == 'I') {
+    fgetc(fp);			/* : */
+    fgetc(fp);			/* type */
+    fgetc(fp);			/* : */
+
+    /* NOTE(review): this copy is not bounded by HITI_MAXDIGITS -- confirm
+       that an HI value can never exceed the buffer allocated for *hiti */
+    p = *hiti;
+    while ((c = *p++ = fgetc(fp)) != '\0' && c != '\t') ;
+    *--p = '\0';		/* terminating char */
+
+  } else if (c0 == 'X' && c1 == 'O') {
+    fgetc(fp);			/* : */
+    fgetc(fp);			/* type */
+    fgetc(fp);			/* : */
+    abbrev0 = fgetc(fp);
+    abbrev1 = fgetc(fp);
+
+    /* Each outer case must end in break.  The breaks inside the nested
+       switches leave only the nested switch, so without these the result
+       would always fall through to the final default and be reset to
+       OUTPUT_NONE. */
+    switch (abbrev0) {
+    case 'N':
+      if (abbrev1 == 'M') {
+        split_output = OUTPUT_NM;
+      } else {
+        fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+        split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'C':
+      switch (abbrev1) {
+      case 'U': split_output = OUTPUT_CU; break;
+      case 'C': split_output = OUTPUT_CC; break;
+      case 'T': split_output = OUTPUT_CT; break;
+      case 'M': split_output = OUTPUT_CM; break;
+      case 'X': split_output = OUTPUT_CX; break;
+      default:
+        fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+        split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'H':
+      switch (abbrev1) {
+      case 'U': split_output = OUTPUT_HU; break;
+      case 'C': split_output = OUTPUT_HC; break;
+      case 'T': split_output = OUTPUT_HT; break;
+      case 'M': split_output = OUTPUT_HM; break;
+      case 'X': split_output = OUTPUT_HX; break;
+      default:
+        fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+        split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'U':
+      switch (abbrev1) {
+      case 'U': split_output = OUTPUT_UU; break;
+      case 'C': split_output = OUTPUT_UC; break;
+      case 'T': split_output = OUTPUT_UT; break;
+      case 'M': split_output = OUTPUT_UM; break;
+      case 'X': split_output = OUTPUT_UX; break;
+      default:
+        fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+        split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'P':
+      switch (abbrev1) {
+      case 'C': split_output = OUTPUT_PC; break;
+      case 'I': split_output = OUTPUT_PI; break;
+      case 'S': split_output = OUTPUT_PS; break;
+      case 'L': split_output = OUTPUT_PL; break;
+      case 'M': split_output = OUTPUT_PM; break;
+      case 'X': split_output = OUTPUT_PX; break;
+      default:
+        fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+        split_output = OUTPUT_NONE;
+      }
+      break;
+    default:
+      fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+      split_output = OUTPUT_NONE;
+    }
+  }
+
while (c != '\0') {
while ((c = fgetc(fp)) != '\0' && c != '\t') ;
if (c == '\0') {
diff --git a/src/sarray-read.c b/src/sarray-read.c
index 5cd2da2..c8d3e60 100644
--- a/src/sarray-read.c
+++ b/src/sarray-read.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: sarray-read.c 154021 2014-11-25 03:44:23Z twu $";
+static char rcsid[] = "$Id: sarray-read.c 155505 2014-12-16 22:23:18Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -44,7 +44,8 @@ static char rcsid[] = "$Id: sarray-read.c 154021 2014-11-25 03:44:23Z twu $";
/* A value of 10000 misses various splices, although they are caught by GSNAP algorithm */
#define EXCESS_SARRAY_HITS 100000
-#define LOCALSPLICING_SLOP 0.05
+#define LOCALSPLICING_NMATCHES_SLOP 1
+#define LOCALSPLICING_PROB_SLOP 0.05
#define USE_SHUFFLE_MASK 1 /* Alternative requires AVX, and that part of the code isn't called much */
@@ -252,6 +253,7 @@ static Splicetype_T *splicetypes;
static Chrpos_T *splicedists;
static int nsplicesites;
+
#if defined(HAVE_SSE2) && defined(USE_SHUFFLE_MASK)
static __m128i shuffle_mask16[16];
#endif
@@ -2760,6 +2762,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
int query_indel_pos;
#endif
+ List_T accepted_hits, rejected_hits;
List_T spliceends_sense, spliceends_antisense, lowprob;
List_T donor_hits, acceptor_hits;
int donor_length, acceptor_length;
@@ -2771,6 +2774,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
int sensedir;
Uintlist_T ambcoords;
Intlist_T amb_knowni, amb_nmismatches;
+ Doublelist_T amb_probs;
int segmenti_donor_nknown, segmentj_acceptor_nknown,
segmentj_antidonor_nknown, segmenti_antiacceptor_nknown;
@@ -2888,7 +2892,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
} else {
debug7(printf("Result: successful hit saved\n"));
- debug(printf("Reporting hit with %d mismatches\n",nmismatches));
+ debug(printf("1. Reporting hit with %d mismatches vs %d allowed\n",nmismatches,nmisses_allowed));
if ((hit = Stage3end_new_substitution(&(*found_score),nmismatches,
left,/*genomiclength*/querylength,
query_compress,plusp,genestrand,first_read_p,
@@ -3082,53 +3086,78 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
}
if (spliceends_sense != NULL) {
- /* nmismatches should be the same for all spliceends, so pick based on prob */
- hit = (Stage3end_T) List_head(spliceends_sense);
- best_nmismatches = Stage3end_nmismatches_whole(hit);
-
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
+ best_nmismatches = querylength;
best_prob = 0.0;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
+ if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
+ best_nmismatches = nmismatches;
+ }
if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP) {
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
debug7(printf("accepting distance %d, probabilities %f and %f\n",
Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- if (n_good_spliceends == 1) {
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) == best_prob) {
- debug7(printf("pushing distance %d, probabilities %f and %f\n",
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- *singlesplicing = List_push(*singlesplicing,(void *) hit);
- nhits += 1;
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_sense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_sense);
+
+ if (n_good_spliceends == 1) {
+ *singlesplicing = List_push(*singlesplicing,List_head(accepted_hits));
+ nhits += 1;
+ List_free(&accepted_hits);
} else {
/* 1. Multiple hits, sense, left1 */
debug7(printf("multiple hits with best prob, sense\n"));
donor_hits = acceptor_hits = (List_T) NULL;
if (plusp == true) {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3141,7 +3170,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
}
}
} else {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3173,6 +3202,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
for (k = i; k < j; k++) {
acceptor = Stage3end_substring_acceptor(hitarray[k]);
@@ -3183,19 +3213,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
}
nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
/*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(acceptor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donort*/NULL,amb_nmismatches,
+ /*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -3230,6 +3265,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
for (k = i; k < j; k++) {
donor = Stage3end_substring_donor(hitarray[k]);
@@ -3240,19 +3276,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
}
nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(donor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(donor),/*amb_prob*/prob,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -3269,66 +3310,87 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
List_free(&acceptor_hits);
}
- List_free(&spliceends_sense);
+ List_free(&accepted_hits);
}
}
if (spliceends_antisense != NULL) {
- /* nmismatches should be the same for all spliceends, so pick based on prob */
- hit = (Stage3end_T) List_head(spliceends_antisense);
- best_nmismatches = Stage3end_nmismatches_whole(hit);
-
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
+ best_nmismatches = querylength;
best_prob = 0.0;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), probabilities %f and %f\n",
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
+ if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
+ best_nmismatches = nmismatches;
+ }
if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP) {
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- if (n_good_spliceends == 1) {
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) == best_prob) {
- debug7(printf("pushing distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- *singlesplicing = List_push(*singlesplicing,(void *) hit);
- nhits += 1;
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_antisense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_antisense);
+
+ if (n_good_spliceends == 1) {
+ *singlesplicing = List_push(*singlesplicing,List_head(accepted_hits));
+ nhits += 1;
+ List_free(&accepted_hits);
} else {
/* 2. Multiple hits, antisense, left1 */
debug7(printf("multiple hits with best prob, antisense\n"));
donor_hits = acceptor_hits = (List_T) NULL;
if (plusp == true) {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3341,7 +3403,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
}
}
} else {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3373,6 +3435,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
for (k = i; k < j; k++) {
acceptor = Stage3end_substring_acceptor(hitarray[k]);
@@ -3383,19 +3446,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
}
nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
/*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(acceptor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donort*/NULL,amb_nmismatches,
+ /*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -3430,6 +3498,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
for (k = i; k < j; k++) {
donor = Stage3end_substring_donor(hitarray[k]);
@@ -3440,19 +3509,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
}
nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(donor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(donor),/*amb_prob*/prob,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -3469,7 +3543,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
List_free(&acceptor_hits);
}
- List_free(&spliceends_antisense);
+ List_free(&accepted_hits);
}
}
@@ -3666,53 +3740,80 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
}
if (spliceends_sense != NULL) {
- /* nmismatches should be the same for all spliceends, so pick based on prob */
- hit = (Stage3end_T) List_head(spliceends_sense);
- best_nmismatches = Stage3end_nmismatches_whole(hit);
-
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
+ best_nmismatches = querylength;
best_prob = 0.0;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
+ if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
+ best_nmismatches = nmismatches;
+ }
if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP) {
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
debug7(printf("accepting distance %d, probabilities %f and %f\n",
Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- if (n_good_spliceends == 1) {
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) == best_prob) {
- debug7(printf("pushing distance %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- *singlesplicing = List_push(*singlesplicing,(void *) hit);
- nhits += 1;
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_sense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_sense);
+
+ if (n_good_spliceends == 1) {
+ *singlesplicing = List_push(*singlesplicing,List_head(accepted_hits));
+ nhits += 1;
+ List_free(&accepted_hits);
} else {
/* 3. Multiple hits, sense, left2 */
debug7(printf("multiple hits with best prob, sense\n"));
donor_hits = acceptor_hits = (List_T) NULL;
if (plusp == true) {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3725,7 +3826,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
}
}
} else {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3757,6 +3858,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
for (k = i; k < j; k++) {
acceptor = Stage3end_substring_acceptor(hitarray[k]);
@@ -3767,19 +3869,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
}
nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
/*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(acceptor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
/*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -3815,6 +3922,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
for (k = i; k < j; k++) {
donor = Stage3end_substring_donor(hitarray[k]);
@@ -3825,19 +3933,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
}
nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(donor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(donor),/*amb_prob*/prob,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -3854,58 +3967,85 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
List_free(&acceptor_hits);
}
- List_free(&spliceends_sense);
+ List_free(&accepted_hits);
}
}
if (spliceends_antisense != NULL) {
- /* nmismatches should be the same for all spliceends, so pick based on prob */
- hit = (Stage3end_T) List_head(spliceends_antisense);
- best_nmismatches = Stage3end_nmismatches_whole(hit);
-
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
+ best_nmismatches = querylength;
best_prob = 0.0;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
+ if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
+ best_nmismatches = nmismatches;
+ }
if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP) {
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
debug7(printf("accepting distance %d, probabilities %f and %f\n",
Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- if (n_good_spliceends == 1) {
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_chimera_prob(hit) == best_prob) {
- debug7(printf("pushing distance %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- *singlesplicing = List_push(*singlesplicing,(void *) hit);
- nhits += 1;
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_antisense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_antisense);
+
+ if (n_good_spliceends == 1) {
+ *singlesplicing = List_push(*singlesplicing,List_head(accepted_hits));
+ nhits += 1;
+ List_free(&accepted_hits);
} else {
/* 4. Multiple hits, antisense, left2 */
debug7(printf("multiple hits with best prob, antisense\n"));
donor_hits = acceptor_hits = (List_T) NULL;
if (plusp == true) {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3918,7 +4058,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
}
}
} else {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
@@ -3950,8 +4090,9 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
- for (k = i; i < j; k++) {
+ for (k = i; k < j; k++) {
acceptor = Stage3end_substring_acceptor(hitarray[k]);
#ifdef LARGE_GENOMES
ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
@@ -3960,19 +4101,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
}
nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
/*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(acceptor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
/*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -4007,6 +4153,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
ambcoords = (Uintlist_T) NULL;
amb_knowni = (Intlist_T) NULL;
amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
for (k = i; k < j; k++) {
donor = Stage3end_substring_donor(hitarray[k]);
@@ -4017,19 +4164,24 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
#endif
amb_knowni = Intlist_push(amb_knowni,-1);
amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
}
nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
*ambiguous = List_push(*ambiguous,
(void *) Stage3end_new_splice(&(*found_score),
nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,/*amb_nmatches*/Substring_match_length_orig(donor),
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_nmatches*/Substring_match_length_orig(donor),/*amb_prob*/prob,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
Stage3end_sensedir(hit),/*sarrayp*/true));
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
Uintlist_free(&ambcoords); /* LARGE_GENOMES not possible with suffix array */
@@ -4046,7 +4198,7 @@ collect_elt_matches (int *found_score, List_T *subs, List_T *indels, List_T *amb
List_free(&acceptor_hits);
}
- List_free(&spliceends_antisense);
+ List_free(&accepted_hits);
}
}
@@ -4158,6 +4310,11 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am
debug(printf("Looking at plus position %u => %d mismatches\n",left,nmismatches));
}
+#if 0
+ } else if (terminal_threshold >= *found_score) {
+ debug(printf("terminal_threshold %d exceeds found_score %d, so not checking middle of read\n",terminal_threshold,*found_score));
+#endif
+
} else {
/* Try starting from middle of read */
halfwaypos = querylength/2;
@@ -4217,6 +4374,12 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am
}
debug(printf("Looking at minus position %u => %d mismatches\n",left,nmismatches));
}
+
+#if 0
+ } else if (terminal_threshold >= *found_score) {
+ debug(printf("terminal_threshold %d exceeds found_score %d, so not checking middle of read\n",terminal_threshold,*found_score));
+#endif
+
} else {
/* Try starting from middle of read */
halfwaypos = querylength/2;
@@ -4303,8 +4466,8 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am
}
}
} else {
- /* Second (right) elt is best */
- debug(printf("Second elt %p is best:\n",elt));
+ /* Second (right) plus elt is best */
+ debug(printf("Second plus elt %p is best:\n",elt));
plus_set = List_push(NULL,best_plus_elt);
best_plus_elt = elt;
best_plus_nmatches = nmatches;
@@ -4381,8 +4544,8 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am
}
}
} else {
- /* Second (right) elt is best */
- debug(printf("Second elt %p is best:\n",elt));
+ /* Second (right) minus elt is best */
+ debug(printf("Second minus elt %p is best:\n",elt));
minus_set = List_push(NULL,best_minus_elt);
best_minus_elt = elt;
best_minus_nmatches = nmatches;
@@ -4498,6 +4661,7 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am
/*plusp*/true,genestrand,first_read_p)) <= nmisses_allowed) {
chrnum = Univ_IIT_get_one(chromosome_iit,left,left);
Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
+ debug(printf("2. Reporting hit with %d mismatches vs %d allowed\n",nmismatches,nmisses_allowed));
if ((hit = Stage3end_new_substitution(&(*found_score),nmismatches,
left,/*genomiclength*/querylength,
query_compress_fwd,/*plusp*/true,genestrand,first_read_p,
@@ -4535,6 +4699,7 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am
/*plusp*/false,genestrand,first_read_p)) <= nmisses_allowed) {
chrnum = Univ_IIT_get_one(chromosome_iit,left,left);
Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
+ debug(printf("3. Reporting hit with %d mismatches vs %d allowed\n",nmismatches,nmisses_allowed));
if ((hit = Stage3end_new_substitution(&(*found_score),nmismatches,
left,/*genomiclength*/querylength,
query_compress_rev,/*plusp*/false,genestrand,first_read_p,
diff --git a/src/sequence.h b/src/sequence.h
index 222a46e..45108f0 100644
--- a/src/sequence.h
+++ b/src/sequence.h
@@ -1,6 +1,11 @@
-/* $Id: sequence.h 132731 2014-04-08 21:19:57Z twu $ */
+/* $Id: sequence.h 157232 2015-01-22 18:55:31Z twu $ */
#ifndef SEQUENCE_INCLUDED
#define SEQUENCE_INCLUDED
+
+#ifdef HAVE_CONFIG_H
+#include <config.h> /* For HAVE_ZLIB, HAVE_BZLIB */
+#endif
+
#include <stdio.h>
#include "bool.h"
diff --git a/src/splice.c b/src/splice.c
index 19ed48d..90aad69 100644
--- a/src/splice.c
+++ b/src/splice.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: splice.c 153224 2014-11-13 23:02:28Z twu $";
+static char rcsid[] = "$Id: splice.c 154778 2014-12-06 03:32:33Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -18,7 +18,9 @@ static char rcsid[] = "$Id: splice.c 153224 2014-11-13 23:02:28Z twu $";
#define LOWPROB_SUPPORT 20
-#define LOCALSPLICING_SLOP 0.05
+
+#define LOCALSPLICING_NMATCHES_SLOP 1
+#define LOCALSPLICING_PROB_SLOP 0.05
/* Splice_solve_single */
@@ -36,10 +38,10 @@ static char rcsid[] = "$Id: splice.c 153224 2014-11-13 23:02:28Z twu $";
#endif
/* Group by segmentj */
-#ifdef DEBUG9
-#define debug9(x) x
+#ifdef DEBUG7
+#define debug7(x) x
#else
-#define debug9(x)
+#define debug7(x)
#endif
@@ -416,10 +418,11 @@ Splice_solve_single_sense (int *found_score, int *nhits, List_T hits, List_T *lo
*nhits += 1;
return List_push(hits,(void *) Stage3end_new_splice(&(*found_score),best_segmenti_nmismatches,best_segmentj_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
} else if (subs_or_indels_p == true) {
@@ -434,10 +437,11 @@ Splice_solve_single_sense (int *found_score, int *nhits, List_T hits, List_T *lo
*lowprob = List_push(*lowprob,
(void *) Stage3end_new_splice(&(*found_score),best_segmenti_nmismatches,best_segmentj_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
return hits;
@@ -480,10 +484,11 @@ Splice_solve_single_sense (int *found_score, int *nhits, List_T hits, List_T *lo
*nhits += 1;
return List_push(hits,(void *) Stage3end_new_splice(&(*found_score),best_segmentj_nmismatches,best_segmenti_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
} else if (subs_or_indels_p == true) {
@@ -498,10 +503,11 @@ Splice_solve_single_sense (int *found_score, int *nhits, List_T hits, List_T *lo
*lowprob = List_push(*lowprob,
(void *) Stage3end_new_splice(&(*found_score),best_segmentj_nmismatches,best_segmenti_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
return hits;
@@ -858,10 +864,11 @@ Splice_solve_single_antisense (int *found_score, int *nhits, List_T hits, List_T
*nhits += 1;
return List_push(hits,(void *) Stage3end_new_splice(&(*found_score),best_segmenti_nmismatches,best_segmentj_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
} else if (subs_or_indels_p == true) {
@@ -876,10 +883,11 @@ Splice_solve_single_antisense (int *found_score, int *nhits, List_T hits, List_T
*lowprob = List_push(*lowprob,
(void *) Stage3end_new_splice(&(*found_score),best_segmenti_nmismatches,best_segmentj_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
return hits;
@@ -922,10 +930,11 @@ Splice_solve_single_antisense (int *found_score, int *nhits, List_T hits, List_T
*nhits += 1;
return List_push(hits,(void *) Stage3end_new_splice(&(*found_score),best_segmentj_nmismatches,best_segmenti_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
} else if (subs_or_indels_p == true) {
@@ -940,10 +949,11 @@ Splice_solve_single_antisense (int *found_score, int *nhits, List_T hits, List_T
*lowprob = List_push(*lowprob,
(void *) Stage3end_new_splice(&(*found_score),best_segmentj_nmismatches,best_segmenti_nmismatches,
donor,acceptor,/*distance*/segmentj_left - segmenti_left,
- /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,splicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,first_read_p,sensedir,
sarrayp));
return hits;
@@ -1471,9 +1481,11 @@ Splice_solve_double (int *found_score, int *nhits, List_T hits, List_T *lowprob,
*nhits += 1;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
/*amb_length_donor*/0,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,sarrayp));
} else if (subs_or_indels_p == true) {
@@ -1489,9 +1501,11 @@ Splice_solve_double (int *found_score, int *nhits, List_T hits, List_T *lowprob,
*lowprob = List_push(*lowprob,
(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
/*amb_length_donor*/0,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,sarrayp));
} else {
@@ -1544,9 +1558,11 @@ Splice_solve_double (int *found_score, int *nhits, List_T hits, List_T *lowprob,
*nhits += 1;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
/*amb_length_donor*/0,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,sarrayp));
} else if (subs_or_indels_p == true) {
@@ -1562,9 +1578,11 @@ Splice_solve_double (int *found_score, int *nhits, List_T hits, List_T *lowprob,
*lowprob = List_push(*lowprob,
(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
/*amb_length_donor*/0,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,sarrayp));
} else {
@@ -1580,16 +1598,53 @@ Splice_solve_double (int *found_score, int *nhits, List_T hits, List_T *lowprob,
return hits;
}
+
+static int
+donor_match_length_cmp (const void *a, const void *b) {
+ Stage3end_T x = * (Stage3end_T *) a;
+ Stage3end_T y = * (Stage3end_T *) b;
+
+ int x_length = Substring_match_length_orig(Stage3end_substring_donor(x));
+ int y_length = Substring_match_length_orig(Stage3end_substring_donor(y));
+
+ if (x_length < y_length) {
+ return -1;
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+static int
+acceptor_match_length_cmp (const void *a, const void *b) {
+ Stage3end_T x = * (Stage3end_T *) a;
+ Stage3end_T y = * (Stage3end_T *) b;
+
+ int x_length = Substring_match_length_orig(Stage3end_substring_acceptor(x));
+ int y_length = Substring_match_length_orig(Stage3end_substring_acceptor(y));
+
+ if (x_length < y_length) {
+ return -1;
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+
static List_T
group_by_segmenti_aux (int *found_score, List_T winners, List_T *ambiguous,
- Stage3end_T *array, int n, int querylength, bool first_read_p, bool sarrayp) {
- Stage3end_T hit;
- int j, i, k;
+ Stage3end_T *hitarray, int n, int querylength, bool first_read_p, bool sarrayp) {
+ Stage3end_T hit, *subarray;
+ int i, j, k, ii, jj, kk, nn;
int n_good_spliceends;
Univcoord_T segmenti_left;
Substring_T donor, acceptor;
int best_nmismatches, nmismatches, nmismatches_donor, nmismatches_acceptor;
double best_prob, prob;
+ List_T accepted_hits, rejected_hits, donor_hits, acceptor_hits, p;
int sensedir;
#ifdef LARGE_GENOMES
@@ -1598,171 +1653,260 @@ group_by_segmenti_aux (int *found_score, List_T winners, List_T *ambiguous,
Uintlist_T ambcoords;
#endif
Intlist_T amb_knowni, amb_nmismatches;
-
- j = 0;
- while (j + 1 < n) {
- segmenti_left = Stage3end_chimera_segmenti_left(array[j]);
- k = j + 1;
- while (k < n && Stage3end_chimera_segmenti_left(array[k]) == segmenti_left) {
- k++;
- }
- /* [j..(k-1)] constitutes a group of splices with the same segmenti */
- debug9(printf("GROUP from %d to %d\n",j,k-1));
- for (i = j; i < k; i++) {
- debug9(printf("%d %u %u\n",i,Stage3end_chimera_segmenti_left(array[i]),Stage3end_chimera_segmentj_left(array[i])));
+ Doublelist_T amb_probs;
+ int donor_length, acceptor_length;
+ bool plusp;
+
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ segmenti_left = Stage3end_chimera_segmenti_left(hit);
+ j = i + 1;
+ while (j < n && Stage3end_chimera_segmenti_left(hitarray[j]) == segmenti_left) {
+ j++;
}
- debug9(printf("\n"));
-
- if (j == k - 1) {
+ if (j == i + 1) {
/* Singleton */
- debug9(printf("Saving hit %d\n",j));
- winners = List_push(winners,(void *) array[j]);
+ debug7(printf("Saving hit %d\n",i));
+ winners = List_push(winners,(void *) hit);
} else {
+ plusp = Stage3end_plusp(hit);
best_nmismatches = querylength;
- for (i = j; i < k; i++) {
- hit = array[i];
- debug9(printf("analyzing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ best_prob = 0.0;
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
-
if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
best_nmismatches = nmismatches;
- best_prob = Stage3end_chimera_prob(hit);
- } else if (nmismatches == best_nmismatches && (prob = Stage3end_chimera_prob(hit)) > best_prob) {
+ }
+ if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
- for (i = j; i < k; i++) {
- hit = array[i];
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug9(printf("accepting distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ accepted_hits = rejected_hits = (List_T) NULL;
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
-
- if (n_good_spliceends == 1) {
- for (i = j; i < k; i++) {
- hit = array[i];
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug9(printf("pushing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- debug9(printf("Saving hit %d\n",i));
- winners = List_push(winners,(void *) hit);
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- debug9(printf("Freeing hit %d\n",i));
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+
+ if (n_good_spliceends == 1) {
+ winners = List_push(winners,List_head(accepted_hits));
+ List_free(&accepted_hits);
} else {
- /* Create ambiguous */
- hit = array[j];
- donor = Stage3end_substring_donor(hit);
- acceptor = Stage3end_substring_acceptor(hit);
- sensedir = Stage3end_sensedir(hit);
-
- ambcoords = NULL;
- amb_knowni = (Intlist_T) NULL;
- amb_nmismatches = (Intlist_T) NULL;
-
- if (Substring_left_genomicseg(donor) == segmenti_left) {
- for (i = j; i < k; i++) {
- hit = array[i];
+ /* Multiple hits */
+ donor_hits = acceptor_hits = (List_T) NULL;
+ if (plusp == true) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicstart(donor) == segmenti_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicstart(acceptor) == segmenti_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ Stage3end_free(&hit);
+ }
+ }
+ } else {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
+ acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicend(donor) == segmenti_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicend(acceptor) == segmenti_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ Stage3end_free(&hit);
+ }
+ }
+ }
+
+ if (donor_hits != NULL) {
+ subarray = (Stage3end_T *) List_to_array_n(&nn,donor_hits);
+ qsort(subarray,nn,sizeof(Stage3end_T),donor_match_length_cmp);
+ ii = 0;
+ while (ii < nn) {
+ hit = subarray[ii];
+ donor = Stage3end_substring_donor(hit);
+ donor_length = Substring_match_length_orig(donor);
+ jj = ii + 1;
+ while (jj < nn && Substring_match_length_orig(Stage3end_substring_donor(subarray[jj])) == donor_length) {
+ jj++;
+ }
+ if (jj == ii + 1) {
+ winners = List_push(winners,(void *) hit);
+ } else {
+ sensedir = Stage3end_sensedir(hit);
+
+ ambcoords = NULL;
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
+
+ for (kk = ii; kk < jj; kk++) {
+ acceptor = Stage3end_substring_acceptor(subarray[kk]);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
- }
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
+ }
- nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
- donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(acceptor),
- /*ambcoords_donor*/NULL,ambcoords,
- /*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donor*/NULL,amb_nmismatches,
- /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
- sensedir,sarrayp));
+ nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
+ donor,/*acceptor*/NULL,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
+ /*ambcoords_donor*/NULL,ambcoords,
+ /*amb_knowni_donor*/NULL,amb_knowni,
+ /*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
+ sensedir,sarrayp));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_knowni);
+ Intlist_free(&amb_nmismatches);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ Uint8list_free(&ambcoords);
#else
- Uintlist_free(&ambcoords);
+ Uintlist_free(&ambcoords);
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else if (Substring_left_genomicseg(acceptor) == segmenti_left) {
- for (i = j; i < k; i++) {
- hit = array[i];
- donor = Stage3end_substring_donor(hit);
+ for (kk = ii; kk < jj; kk++) {
+ hit = subarray[kk];
+ Stage3end_free(&hit);
+ }
+ }
+
+ ii = jj;
+ }
+ FREE(subarray);
+ List_free(&donor_hits);
+ }
+
+ if (acceptor_hits != NULL) {
+ subarray = (Stage3end_T *) List_to_array_n(&nn,acceptor_hits);
+ qsort(subarray,nn,sizeof(Stage3end_T),acceptor_match_length_cmp);
+ ii = 0;
+ while (ii < nn) {
+ hit = subarray[ii];
+ acceptor = Stage3end_substring_acceptor(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ jj = ii + 1;
+ while (jj < nn && Substring_match_length_orig(Stage3end_substring_acceptor(subarray[jj])) == acceptor_length) {
+ jj++;
+ }
+ if (jj == ii + 1) {
+ winners = List_push(winners,(void *) hit);
+ } else {
+ sensedir = Stage3end_sensedir(hit);
+
+ ambcoords = NULL;
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
+
+ for (kk = ii; kk < jj; kk++) {
+ donor = Stage3end_substring_donor(subarray[kk]);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
- }
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
+ }
- nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
- /*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(donor),
- ambcoords,/*ambcoords_acceptor*/NULL,
- amb_knowni,/*amb_knowni_acceptor*/NULL,
- amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
- sensedir,sarrayp));
+ nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
+ /*donor*/NULL,acceptor,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(donor),/*amb_prob*/prob,
+ ambcoords,/*ambcoords_acceptor*/NULL,
+ amb_knowni,/*amb_knowni_acceptor*/NULL,
+ amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
+ sensedir,sarrayp));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_knowni);
+ Intlist_free(&amb_nmismatches);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ Uint8list_free(&ambcoords);
#else
- Uintlist_free(&ambcoords);
+ Uintlist_free(&ambcoords);
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else {
- fprintf(stderr,"Unexpected: Neither donor left %u nor acceptor left %u equals segmenti_left %u\n",
- Substring_left_genomicseg(donor),Substring_left_genomicseg(acceptor),segmenti_left);
- abort();
- }
+ for (kk = ii; kk < jj; kk++) {
+ hit = subarray[kk];
+ Stage3end_free(&hit);
+ }
+ }
- for (i = j; i < k; i++) {
- hit = array[i];
- debug9(printf("Freeing hit %d\n",i));
- Stage3end_free(&hit);
+ ii = jj;
+ }
+ FREE(subarray);
+ List_free(&acceptor_hits);
}
+
+ List_free(&accepted_hits);
}
}
- j = k;
- }
-
- if (j < n) {
- /* Singleton */
- debug9(printf("Keeping hit %d\n",i));
- winners = List_push(winners,(void *) array[j]);
+ i = j;
}
return winners;
@@ -1827,14 +1971,17 @@ Splice_group_by_segmenti (int *found_score, List_T localsplicing, List_T *ambigu
static List_T
group_by_segmentj_aux (int *found_score, List_T winners, List_T *ambiguous,
- Stage3end_T *array, int n, int querylength, bool first_read_p, bool sarrayp) {
- Stage3end_T hit;
- int j, i, k;
+ Stage3end_T *hitarray, int n, int querylength, bool first_read_p, bool sarrayp) {
+ Stage3end_T hit, *subarray;
+ int i, j, k, ii, jj, kk, nn;
int n_good_spliceends;
Univcoord_T segmentj_left;
Substring_T donor, acceptor;
int best_nmismatches, nmismatches, nmismatches_donor, nmismatches_acceptor;
double best_prob, prob;
+ List_T accepted_hits, rejected_hits, donor_hits, acceptor_hits, p;
+ int donor_length, acceptor_length;
+ bool plusp;
int sensedir;
#ifdef LARGE_GENOMES
@@ -1843,171 +1990,261 @@ group_by_segmentj_aux (int *found_score, List_T winners, List_T *ambiguous,
Uintlist_T ambcoords;
#endif
Intlist_T amb_knowni, amb_nmismatches;
-
- j = 0;
- while (j + 1 < n) {
- segmentj_left = Stage3end_chimera_segmentj_left(array[j]);
- k = j + 1;
- while (k < n && Stage3end_chimera_segmentj_left(array[k]) == segmentj_left) {
- k++;
- }
- /* [j..(k-1)] constitutes a group of splices with the same segmentj */
- debug9(printf("GROUP from %d to %d\n",j,k-1));
- for (i = j; i < k; i++) {
- debug9(printf("%d %u %u\n",i,Stage3end_chimera_segmenti_left(array[i]),Stage3end_chimera_segmentj_left(array[i])));
+ Doublelist_T amb_probs;
+
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ segmentj_left = Stage3end_chimera_segmentj_left(hit);
+ j = i + 1;
+ while (j < n && Stage3end_chimera_segmentj_left(hitarray[j]) == segmentj_left) {
+ j++;
}
- debug9(printf("\n"));
-
- if (j == k - 1) {
+ if (j == i + 1) {
/* Singleton */
- debug9(printf("Saving hit %d\n",j));
- winners = List_push(winners,(void *) array[j]);
+ debug7(printf("Saving hit %d\n",i));
+ winners = List_push(winners,(void *) hit);
} else {
+ plusp = Stage3end_plusp(hit);
best_nmismatches = querylength;
- for (i = j; i < k; i++) {
- hit = array[i];
- debug9(printf("analyzing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ best_prob = 0.0;
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
-
if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
best_nmismatches = nmismatches;
- best_prob = Stage3end_chimera_prob(hit);
- } else if (nmismatches == best_nmismatches && (prob = Stage3end_chimera_prob(hit)) > best_prob) {
+ }
+ if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
- for (i = j; i < k; i++) {
- hit = array[i];
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug9(printf("accepting distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ accepted_hits = rejected_hits = (List_T) NULL;
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
-
- if (n_good_spliceends == 1) {
- for (i = j; i < k; i++) {
- hit = array[i];
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug9(printf("pushing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- debug9(printf("Saving hit %d\n",i));
- winners = List_push(winners,(void *) hit);
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- debug9(printf("Freeing hit %d\n",i));
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+
+ if (n_good_spliceends == 1) {
+ assert(List_length(accepted_hits) == 1);
+ winners = List_push(winners,List_head(accepted_hits));
+ List_free(&accepted_hits);
} else {
- /* Create ambiguous */
- hit = array[j];
- donor = Stage3end_substring_donor(hit);
- acceptor = Stage3end_substring_acceptor(hit);
- sensedir = Stage3end_sensedir(hit);
-
- ambcoords = NULL;
- amb_knowni = (Intlist_T) NULL;
- amb_nmismatches = (Intlist_T) NULL;
-
- if (Substring_left_genomicseg(donor) == segmentj_left) {
- for (i = j; i < k; i++) {
- hit = array[i];
+ /* Multiple hits */
+ donor_hits = acceptor_hits = (List_T) NULL;
+ if (plusp == true) {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
+ acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicstart(donor) == segmentj_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicstart(acceptor) == segmentj_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ abort();
+ Stage3end_free(&hit);
+ }
+ }
+ } else {
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicend(donor) == segmentj_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicend(acceptor) == segmentj_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ abort();
+ Stage3end_free(&hit);
+ }
+ }
+ }
+
+ if (donor_hits != NULL) {
+ subarray = (Stage3end_T *) List_to_array_n(&nn,donor_hits);
+ qsort(subarray,nn,sizeof(Stage3end_T),donor_match_length_cmp);
+ ii = 0;
+ while (ii < nn) {
+ hit = subarray[ii];
+ donor = Stage3end_substring_donor(hit);
+ donor_length = Substring_match_length_orig(donor);
+ jj = ii + 1;
+ while (jj < nn && Substring_match_length_orig(Stage3end_substring_donor(subarray[jj])) == donor_length) {
+ jj++;
+ }
+ if (jj == ii + 1) {
+ winners = List_push(winners,(void *) hit);
+ } else {
+ sensedir = Stage3end_sensedir(hit);
+
+ ambcoords = NULL;
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
+
+ for (kk = ii; kk < jj; kk++) {
+ acceptor = Stage3end_substring_acceptor(subarray[kk]);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
- }
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
+ }
- nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
- donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(acceptor),
- /*ambcoords_donor*/NULL,ambcoords,
- /*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donor*/NULL,amb_nmismatches,
- /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
- sensedir,sarrayp));
+ nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
+ donor,/*acceptor*/NULL,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
+ /*ambcoords_donor*/NULL,ambcoords,
+ /*amb_knowni_donor*/NULL,amb_knowni,
+ /*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
+ sensedir,sarrayp));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_knowni);
+ Intlist_free(&amb_nmismatches);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ Uint8list_free(&ambcoords);
#else
- Uintlist_free(&ambcoords);
+ Uintlist_free(&ambcoords);
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else if (Substring_left_genomicseg(acceptor) == segmentj_left) {
- for (i = j; i < k; i++) {
- hit = array[i];
- donor = Stage3end_substring_donor(hit);
+ for (kk = ii; kk < jj; kk++) {
+ hit = subarray[kk];
+ Stage3end_free(&hit);
+ }
+ }
+
+ ii = jj;
+ }
+ FREE(subarray);
+ List_free(&donor_hits);
+ }
+
+ if (acceptor_hits != NULL) {
+ subarray = (Stage3end_T *) List_to_array_n(&nn,acceptor_hits);
+ qsort(subarray,nn,sizeof(Stage3end_T),acceptor_match_length_cmp);
+ ii = 0;
+ while (ii < nn) {
+ hit = subarray[ii];
+ acceptor = Stage3end_substring_acceptor(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ jj = ii + 1;
+ while (jj < nn && Substring_match_length_orig(Stage3end_substring_acceptor(subarray[jj])) == acceptor_length) {
+ jj++;
+ }
+ if (jj == ii + 1) {
+ winners = List_push(winners,(void *) hit);
+ } else {
+ sensedir = Stage3end_sensedir(hit);
+
+ ambcoords = NULL;
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
+
+ for (kk = ii; kk < jj; kk++) {
+ donor = Stage3end_substring_donor(subarray[kk]);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
- }
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
+ }
- nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
- /*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(donor),
- ambcoords,/*ambcoords_acceptor*/NULL,
- amb_knowni,/*amb_knowni_acceptor*/NULL,
- amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
- sensedir,sarrayp));
+ nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
+ /*donor*/NULL,acceptor,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(donor),/*amb_prob*/prob,
+ ambcoords,/*ambcoords_acceptor*/NULL,
+ amb_knowni,/*amb_knowni_acceptor*/NULL,
+ amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
+ sensedir,sarrayp));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_knowni);
+ Intlist_free(&amb_nmismatches);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ Uint8list_free(&ambcoords);
#else
- Uintlist_free(&ambcoords);
+ Uintlist_free(&ambcoords);
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else {
- fprintf(stderr,"Unexpected: Neither donor left %u nor acceptor left %u equals segmentj_left %u\n",
- Substring_left_genomicseg(donor),Substring_left_genomicseg(acceptor),segmentj_left);
- abort();
- }
+ for (kk = ii; kk < jj; kk++) {
+ hit = subarray[kk];
+ Stage3end_free(&hit);
+ }
+ }
- for (i = j; i < k; i++) {
- hit = array[i];
- debug9(printf("Freeing hit %d\n",i));
- Stage3end_free(&hit);
+ ii = jj;
+ }
+ FREE(subarray);
+ List_free(&acceptor_hits);
}
+
+ List_free(&accepted_hits);
}
}
- j = k;
- }
-
- if (j < n) {
- /* Singleton */
- debug9(printf("Keeping hit %d\n",i));
- winners = List_push(winners,(void *) array[j]);
+ i = j;
}
return winners;
diff --git a/src/stage1.c b/src/stage1.c
index 674d733..c2b3d24 100644
--- a/src/stage1.c
+++ b/src/stage1.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage1.c 145990 2014-08-25 21:47:32Z twu $";
+static char rcsid[] = "$Id: stage1.c 158357 2015-02-10 19:10:16Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -30,6 +30,11 @@ static char rcsid[] = "$Id: stage1.c 145990 2014-08-25 21:47:32Z twu $";
#define SCAN_ENDS 1
+/* Need to limit ninrange in find_range, or else we get bogged down in
+ repeats */
+#define MAX_NINRANGE 100
+
+
#define MAX_INDELS 15
#define MIN_REPEAT 6
#define MAXENTRIES 100
@@ -1662,7 +1667,7 @@ find_range (int **querypositions, int *ninrange, int starti, int endi,
}
*ninrange = 0;
- for (querypos = starti; querypos <= endi; querypos++) {
+ for (querypos = starti; *ninrange < MAX_NINRANGE && querypos <= endi; querypos++) {
i = binary_search(0,npositions[querypos],positions[querypos],leftbound);
while (i < npositions[querypos] && positions[querypos][i] < rightbound) {
debug2(printf("At querypos %d, found position %u in (%u,%u)\n",querypos,positions[querypos][i],leftbound,rightbound));
@@ -1680,7 +1685,7 @@ find_range (int **querypositions, int *ninrange, int starti, int endi,
}
*ninrange = 0;
- for (querypos = starti; querypos <= endi; querypos++) {
+ for (querypos = starti; *ninrange < MAX_NINRANGE && querypos <= endi; querypos++) {
i = binary_search(0,npositions[querypos],positions[querypos],leftbound);
while (i < npositions[querypos] && positions[querypos][i] < rightbound) {
(*querypositions)[*ninrange] = querypos;
@@ -1792,11 +1797,11 @@ find_extensions (Univcoord_T *extension5, Univcoord_T *extension3, T this,
for (j = i+1; j < ninrange; j++) {
debug2(printf(" %u@%d",range[j],querypositions[j]));
expectedi = range[j] + querypositions[j] - querypositions[i];
- if (range[i] + 20 > expectedi && range[i] < expectedi + 20) {
- concentration++;
- lastj = j;
- debug2(printf("*"));
- }
+ if (range[i] + 20 > expectedi && range[i] < expectedi + 20) {
+ concentration++;
+ lastj = j;
+ debug2(printf("*"));
+ }
}
debug2(printf("\nConcentration is %d\n\n",concentration));
if (concentration > best_concentration ||
@@ -3718,7 +3723,7 @@ Stage1_compute (bool *lowidentityp, Sequence_T queryuc, Indexdb_T indexdb_fwd, I
}
/* Clean up gregionlist */
- debug(printf("Starting extensions\n"));
+ debug(printf("Starting extensions on %d gregions\n",List_length(gregionlist)));
for (p = gregionlist; p != NULL; p = List_next(p)) {
gregion = (Gregion_T) List_head(p);
if (Gregion_extendedp(gregion) == false) {
diff --git a/src/stage1hr.c b/src/stage1hr.c
index 7f65f45..89b847e 100644
--- a/src/stage1hr.c
+++ b/src/stage1hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage1hr.c 154087 2014-11-25 21:02:22Z twu $";
+static char rcsid[] = "$Id: stage1hr.c 157977 2015-02-03 18:46:53Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -113,8 +113,9 @@ static Chrpos_T shortsplicedist_novelend;
/* Penalties */
+static int subopt_levels;
static int terminal_threshold;
-static int terminal_output_minlength;
+static int reject_trimlength;
static bool novelsplicingp;
static bool knownsplicingp;
@@ -215,7 +216,8 @@ static int end_miss_two; /* Used for computing max_terminal_length */
#define MAX_LOCALSPLICING_POTENTIAL 1000
-#define LOCALSPLICING_SLOP 0.05
+#define LOCALSPLICING_NMATCHES_SLOP 1
+#define LOCALSPLICING_PROB_SLOP 0.05
/* Overall flow */
@@ -6609,6 +6611,43 @@ find_segmentm_span (Segment_T segmentm, int max_mismatches_allowed,
#endif
+/* Copied from sarray-read.c.  qsort() comparator over an array of Stage3end_T: orders hits by ascending donor match length (Substring_match_length_orig of the donor substring), so hits with equal donor length become adjacent. */
+static int
+donor_match_length_cmp (const void *a, const void *b) {
+ Stage3end_T x = * (Stage3end_T *) a; /* a and b point at array elements, each of which is a Stage3end_T */
+ Stage3end_T y = * (Stage3end_T *) b;
+
+ int x_length = Substring_match_length_orig(Stage3end_substring_donor(x));
+ int y_length = Substring_match_length_orig(Stage3end_substring_donor(y));
+
+ if (x_length < y_length) {
+ return -1; /* shorter donor match sorts first */
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ return 0; /* equal lengths: no tie-breaker is applied */
+ }
+}
+
+/* Copied from sarray-read.c.  qsort() comparator over an array of Stage3end_T: orders hits by ascending acceptor match length (Substring_match_length_orig of the acceptor substring), so hits with equal acceptor length become adjacent. */
+static int
+acceptor_match_length_cmp (const void *a, const void *b) {
+ Stage3end_T x = * (Stage3end_T *) a; /* a and b point at array elements, each of which is a Stage3end_T */
+ Stage3end_T y = * (Stage3end_T *) b;
+
+ int x_length = Substring_match_length_orig(Stage3end_substring_acceptor(x));
+ int y_length = Substring_match_length_orig(Stage3end_substring_acceptor(y));
+
+ if (x_length < y_length) {
+ return -1; /* shorter acceptor match sorts first */
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ return 0; /* equal lengths: no tie-breaker is applied */
+ }
+}
+
+
static List_T
find_singlesplices_plus (int *found_score, List_T hits, List_T *ambiguous, List_T *lowprob,
Segment_T *plus_spliceable, int plus_nspliceable,
@@ -6616,10 +6655,7 @@ find_singlesplices_plus (int *found_score, List_T hits, List_T *ambiguous, List_
Compress_T query_compress /* expecting fwd */,
int splicing_penalty, int max_mismatches_allowed, bool first_read_p, int genestrand,
bool subs_or_indels_p) {
-#ifdef DEBUG4S
- int i;
-#endif
- int j;
+ int k, j, i, n;
Segment_T segmenti, segmentj, segmentj_end, *ptr;
Univcoord_T segmenti_left, segmentj_left;
int nmismatches_left, nmismatches_right;
@@ -6651,8 +6687,11 @@ find_singlesplices_plus (int *found_score, List_T hits, List_T *ambiguous, List_
int *floors_from_neg3, *floors_to_pos3;
int nhits_local /*= 0*/;
+ List_T accepted_hits, rejected_hits;
List_T spliceends_sense, spliceends_antisense, p;
- Stage3end_T hit;
+ List_T donor_hits, acceptor_hits;
+ int donor_length, acceptor_length;
+ Stage3end_T hit, *hitarray;
int n_good_spliceends;
int best_nmismatches, nmismatches, nmismatches_donor, nmismatches_acceptor;
double best_prob, prob;
@@ -6665,6 +6704,7 @@ find_singlesplices_plus (int *found_score, List_T hits, List_T *ambiguous, List_
Uintlist_T ambcoords;
#endif
Intlist_T amb_knowni, amb_nmismatches;
+ Doublelist_T amb_probs;
debug4s(printf("*** Starting find_singlesplices_plus on %d spliceable segments ***\n",plus_nspliceable));
@@ -6852,279 +6892,471 @@ find_singlesplices_plus (int *found_score, List_T hits, List_T *ambiguous, List_
/* Process results for segmenti, sense. Modified from collect_elt_matches in sarray-read.c. */
if (spliceends_sense != NULL) {
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
best_nmismatches = querylength;
+ best_prob = 0.0;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
best_nmismatches = nmismatches;
- best_prob = Stage3end_chimera_prob(hit);
- } else if (nmismatches == best_nmismatches && (prob = Stage3end_chimera_prob(hit)) > best_prob) {
+ }
+ if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("accepting distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
-
- debug7(printf("Have %d good spliceends\n",n_good_spliceends));
- if (n_good_spliceends == 1) {
+
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("pushing sense distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- hits = List_push(hits,(void *) hit);
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_sense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_sense);
+
+ if (n_good_spliceends == 1) {
+ hits = List_push(hits,List_head(accepted_hits));
+ List_free(&accepted_hits);
} else {
- /* Create ambiguous, sense */
- debug7(printf("Creating ambiguous, sense\n"));
- hit = (Stage3end_T) List_head(spliceends_sense);
- donor = Stage3end_substring_donor(hit);
- acceptor = Stage3end_substring_acceptor(hit);
- sensedir = Stage3end_sensedir(hit);
-
- ambcoords = NULL;
- amb_knowni = (Intlist_T) NULL;
- amb_nmismatches = (Intlist_T) NULL;
-
- if (Substring_left_genomicseg(donor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- acceptor = Stage3end_substring_acceptor(hit);
+ /* 1. Multiple hits, sense, left1 (segmenti_left) */
+ debug7(printf("multiple splice hits, sense, plus\n"));
+ donor_hits = acceptor_hits = (List_T) NULL;
+
+ /* plus branch from collect_elt_matches */
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
+ acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicstart(donor) == segmenti_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicstart(acceptor) == segmenti_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ abort();
+ Stage3end_free(&hit);
+ }
+ }
+
+ if (donor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,donor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),donor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ donor = Stage3end_substring_donor(hit);
+ donor_length = Substring_match_length_orig(donor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_donor(hitarray[j])) == donor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = (Uint8list_T) NULL;
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = (Uintlist_T) NULL;
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
- }
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
- nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
- donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(acceptor),
- /*ambcoords_donor*/NULL,ambcoords,
- /*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donor*/NULL,amb_nmismatches,
- /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
- sensedir,/*sarrayp*/false));
+ for (k = i; k < j; k++) {
+ acceptor = Stage3end_substring_acceptor(hitarray[k]);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
#else
- Uintlist_free(&ambcoords);
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else if (Substring_left_genomicseg(acceptor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- donor = Stage3end_substring_donor(hit);
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
+ }
+
+ nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
+ donor,/*acceptor*/NULL,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
+ /*ambcoords_donor*/NULL,ambcoords,
+ /*amb_knowni_donor*/NULL,amb_knowni,
+ /*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+ Uint8list_free(&ambcoords);
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+ Uintlist_free(&ambcoords);
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
}
+ FREE(hitarray);
+ List_free(&donor_hits);
+ }
- nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
- /*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(donor),
- ambcoords,/*ambcoords_acceptor*/NULL,
- amb_knowni,/*amb_knowni_acceptor*/NULL,
- amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
- sensedir,/*sarrayp*/false));
+ if (acceptor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,acceptor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),acceptor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ acceptor = Stage3end_substring_acceptor(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_acceptor(hitarray[j])) == acceptor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ ambcoords = (Uint8list_T) NULL;
#else
- Uintlist_free(&ambcoords);
+ ambcoords = (Uintlist_T) NULL;
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else {
- fprintf(stderr,"Unexpected: Neither donor left %u nor acceptor left %u equals segmenti_left %u\n",
- Substring_left_genomicseg(donor),Substring_left_genomicseg(acceptor),segmenti->diagonal - querylength);
- abort();
- }
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- Stage3end_free(&hit);
+ for (k = i; k < j; k++) {
+ donor = Stage3end_substring_donor(hitarray[k]);
+#ifdef LARGE_GENOMES
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+#else
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+#endif
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
+ }
+
+ nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
+ /*donor*/NULL,acceptor,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(donor),/*amb_prob*/prob,
+ ambcoords,/*ambcoords_acceptor*/NULL,
+ amb_knowni,/*amb_knowni_acceptor*/NULL,
+ amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
+#ifdef LARGE_GENOMES
+ Uint8list_free(&ambcoords);
+#else
+ Uintlist_free(&ambcoords);
+#endif
+
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
+ }
+ FREE(hitarray);
+ List_free(&acceptor_hits);
}
- List_free(&spliceends_sense);
+
+ List_free(&accepted_hits);
}
}
/* Process results for segmenti, antisense. Modified from collect_elt_matches in sarray-read.c. */
if (spliceends_antisense != NULL) {
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
best_nmismatches = querylength;
+ best_prob = 0.0;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
best_nmismatches = nmismatches;
- best_prob = Stage3end_chimera_prob(hit);
- } else if (nmismatches == best_nmismatches && (prob = Stage3end_chimera_prob(hit)) > best_prob) {
+ }
+ if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("accepting distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
-
- debug7(printf("Have %d good spliceends\n",n_good_spliceends));
- if (n_good_spliceends == 1) {
+
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("pushing antisense distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- hits = List_push(hits,(void *) hit);
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_antisense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_antisense);
+
+ if (n_good_spliceends == 1) {
+ hits = List_push(hits,List_head(accepted_hits));
+ List_free(&accepted_hits);
} else {
- /* Create ambiguous, antisense */
- debug7(printf("Creating ambiguous, antisense\n"));
- hit = (Stage3end_T) List_head(spliceends_antisense);
- donor = Stage3end_substring_donor(hit);
- acceptor = Stage3end_substring_acceptor(hit);
- sensedir = Stage3end_sensedir(hit);
-
- ambcoords = NULL;
- amb_knowni = (Intlist_T) NULL;
- amb_nmismatches = (Intlist_T) NULL;
-
- if (Substring_left_genomicseg(donor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- acceptor = Stage3end_substring_acceptor(hit);
+ /* 2. Multiple hits, antisense, left1 (segmenti_left) */
+ debug7(printf("multiple splice hits, antisense, plus\n"));
+ donor_hits = acceptor_hits = (List_T) NULL;
+
+ /* plus branch from collect_elt_matches */
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
+ acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicstart(donor) == segmenti_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicstart(acceptor) == segmenti_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ abort();
+ Stage3end_free(&hit);
+ }
+ }
+
+ if (donor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,donor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),donor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ donor = Stage3end_substring_donor(hit);
+ donor_length = Substring_match_length_orig(donor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_donor(hitarray[j])) == donor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = (Uint8list_T) NULL;
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = (Uintlist_T) NULL;
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
- }
-
- nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
- donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(acceptor),
- /*ambcoords_donor*/NULL,ambcoords,
- /*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donor*/NULL,amb_nmismatches,
- /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
- sensedir,/*sarrayp*/false));
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
+
+ for (k = i; k < j; k++) {
+ acceptor = Stage3end_substring_acceptor(hitarray[k]);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
#else
- Uintlist_free(&ambcoords);
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else if (Substring_left_genomicseg(acceptor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- donor = Stage3end_substring_donor(hit);
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
+ }
+
+ nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
+ donor,/*acceptor*/NULL,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
+ /*ambcoords_donor*/NULL,ambcoords,
+ /*amb_knowni_donor*/NULL,amb_knowni,
+ /*amb_nmismatches_donort*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+ Uint8list_free(&ambcoords);
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+ Uintlist_free(&ambcoords);
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
}
+ FREE(hitarray);
+ List_free(&donor_hits);
+ }
- nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
- /*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(donor),
- ambcoords,/*ambcoords_acceptor*/NULL,
- amb_knowni,/*amb_knowni_acceptor*/NULL,
- amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
- sensedir,/*sarrayp*/false));
+ if (acceptor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,acceptor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),acceptor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ acceptor = Stage3end_substring_acceptor(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_acceptor(hitarray[j])) == acceptor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ ambcoords = (Uint8list_T) NULL;
#else
- Uintlist_free(&ambcoords);
+ ambcoords = (Uintlist_T) NULL;
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
-
- } else {
- fprintf(stderr,"Unexpected: Neither donor left %u nor acceptor left %u equals segmenti_left %u\n",
- Substring_left_genomicseg(donor),Substring_left_genomicseg(acceptor),segmenti->diagonal - querylength);
- abort();
- }
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- Stage3end_free(&hit);
+ for (k = i; k < j; k++) {
+ donor = Stage3end_substring_donor(hitarray[k]);
+#ifdef LARGE_GENOMES
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+#else
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+#endif
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
+ }
+
+ nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
+ /*donor*/NULL,acceptor,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(donor),/*amb_prob*/prob,
+ ambcoords,/*ambcoords_acceptor*/NULL,
+ amb_knowni,/*amb_knowni_acceptor*/NULL,
+ amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
+#ifdef LARGE_GENOMES
+ Uint8list_free(&ambcoords);
+#else
+ Uintlist_free(&ambcoords);
+#endif
+
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
+ }
+ FREE(hitarray);
+ List_free(&acceptor_hits);
}
- List_free(&spliceends_antisense);
+
+ List_free(&accepted_hits);
}
}
@@ -7144,10 +7376,7 @@ find_singlesplices_minus (int *found_score, List_T hits, List_T *ambiguous, List
Floors_T floors, int querylength, int query_lastpos, Compress_T query_compress /* expecting rev */,
int splicing_penalty, int max_mismatches_allowed, bool first_read_p, int genestrand,
bool subs_or_indels_p) {
-#ifdef DEBUG4S
- int i;
-#endif
- int j;
+ int k, j, i, n;
Segment_T segmenti, segmentj, segmentj_end, *ptr;
Univcoord_T segmenti_left, segmentj_left;
int nmismatches_left, nmismatches_right;
@@ -7179,8 +7408,11 @@ find_singlesplices_minus (int *found_score, List_T hits, List_T *ambiguous, List
int *floors_from_neg3, *floors_to_pos3;
int nhits_local /*= 0*/;
+ List_T accepted_hits, rejected_hits;
List_T spliceends_sense, spliceends_antisense, p;
- Stage3end_T hit;
+ List_T donor_hits, acceptor_hits;
+ int donor_length, acceptor_length;
+ Stage3end_T hit, *hitarray;
int n_good_spliceends;
int best_nmismatches, nmismatches, nmismatches_donor, nmismatches_acceptor;
double best_prob, prob;
@@ -7193,6 +7425,7 @@ find_singlesplices_minus (int *found_score, List_T hits, List_T *ambiguous, List
Uintlist_T ambcoords;
#endif
Intlist_T amb_knowni, amb_nmismatches;
+ Doublelist_T amb_probs;
debug4s(printf("*** Starting find_singlesplices_minus on %d spliceable segments ***\n",minus_nspliceable));
@@ -7379,279 +7612,470 @@ find_singlesplices_minus (int *found_score, List_T hits, List_T *ambiguous, List
/* Process results for segmenti, sense. Modified from collect_elt_matches in sarray-read.c. */
if (spliceends_sense != NULL) {
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
best_nmismatches = querylength;
+ best_prob = 0.0;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
best_nmismatches = nmismatches;
- best_prob = Stage3end_chimera_prob(hit);
- } else if (nmismatches == best_nmismatches && (prob = Stage3end_chimera_prob(hit)) > best_prob) {
+ }
+ if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("accepting distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
-
- debug7(printf("Have %d good spliceends\n",n_good_spliceends));
- if (n_good_spliceends == 1) {
+
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_sense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("pushing sense distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- hits = List_push(hits,(void *) hit);
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_sense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_sense);
+
+ if (n_good_spliceends == 1) {
+ hits = List_push(hits,List_head(accepted_hits));
+ List_free(&accepted_hits);
} else {
- /* Create ambiguous, sense */
- debug7(printf("Creating ambiguous, sense\n"));
- hit = (Stage3end_T) List_head(spliceends_sense);
- donor = Stage3end_substring_donor(hit);
- acceptor = Stage3end_substring_acceptor(hit);
- sensedir = Stage3end_sensedir(hit);
-
- ambcoords = NULL;
- amb_knowni = (Intlist_T) NULL;
- amb_nmismatches = (Intlist_T) NULL;
-
- if (Substring_left_genomicseg(donor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- acceptor = Stage3end_substring_acceptor(hit);
+ /* 1. Multiple hits, sense, left1 (segmenti_left) */
+ debug7(printf("multiple splice hits, sense, minus\n"));
+ donor_hits = acceptor_hits = (List_T) NULL;
+
+ /* minus branch from collect_elt_matches */
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
+ acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicend(donor) == segmenti_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicend(acceptor) == segmenti_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ abort();
+ Stage3end_free(&hit);
+ }
+ }
+
+ if (donor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,donor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),donor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ donor = Stage3end_substring_donor(hit);
+ donor_length = Substring_match_length_orig(donor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_donor(hitarray[j])) == donor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = (Uint8list_T) NULL;
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
+ ambcoords = (Uintlist_T) NULL;
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
- }
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
- nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
- donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(acceptor),
- /*ambcoords_donor*/NULL,ambcoords,
- /*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donor*/NULL,amb_nmismatches,
- /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
- sensedir,/*sarrayp*/false));
+ for (k = i; k < j; k++) {
+ acceptor = Stage3end_substring_acceptor(hitarray[k]);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
#else
- Uintlist_free(&ambcoords);
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
+ }
- } else if (Substring_left_genomicseg(acceptor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- donor = Stage3end_substring_donor(hit);
+ nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
+ donor,/*acceptor*/NULL,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
+ /*ambcoords_donor*/NULL,ambcoords,
+ /*amb_knowni_donor*/NULL,amb_knowni,
+ /*amb_nmismatches_donort*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+ Uint8list_free(&ambcoords);
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+ Uintlist_free(&ambcoords);
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
}
+ FREE(hitarray);
+ List_free(&donor_hits);
+ }
- nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
- /*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(donor),
- ambcoords,/*ambcoords_acceptor*/NULL,
- amb_knowni,/*amb_knowni_acceptor*/NULL,
- amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
- sensedir,/*sarrayp*/false));
+ if (acceptor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,acceptor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),acceptor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ acceptor = Stage3end_substring_acceptor(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_acceptor(hitarray[j])) == acceptor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ ambcoords = (Uint8list_T) NULL;
#else
- Uintlist_free(&ambcoords);
+ ambcoords = (Uintlist_T) NULL;
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
- } else {
- fprintf(stderr,"Unexpected: Neither donor left %u nor acceptor left %u equals segmenti_left %u\n",
- Substring_left_genomicseg(donor),Substring_left_genomicseg(acceptor),segmenti->diagonal - querylength);
- abort();
- }
+ for (k = i; k < j; k++) {
+ donor = Stage3end_substring_donor(hitarray[k]);
+#ifdef LARGE_GENOMES
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+#else
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+#endif
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
+ }
+
+ nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
+ /*donor*/NULL,acceptor,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(donor),/*amb_prob*/prob,
+ ambcoords,/*ambcoords_acceptor*/NULL,
+ amb_knowni,/*amb_knowni_acceptor*/NULL,
+ amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
+#ifdef LARGE_GENOMES
+ Uint8list_free(&ambcoords);
+#else
+ Uintlist_free(&ambcoords);
+#endif
- for (p = spliceends_sense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- Stage3end_free(&hit);
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
+ }
+ FREE(hitarray);
+ List_free(&acceptor_hits);
}
- List_free(&spliceends_sense);
+
+ List_free(&accepted_hits);
}
}
/* Process results for segmenti, antisense. Modified from collect_elt_matches in sarray-read.c. */
if (spliceends_antisense != NULL) {
+ /* nmismatches here may be different for spliceends from Splice_solve, so pick based on prob and nmismatches */
best_nmismatches = querylength;
+ best_prob = 0.0;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- debug7(printf("analyzing distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
- Substring_chimera_prob(Stage3end_substring_donor(hit)),
+ debug7(printf("analyzing distance %d, donor length %d (%llu..%llu) and acceptor length %d (%llu..%llu), nmismatches %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_genomicstart(Stage3end_substring_donor(hit)),Substring_genomicend(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
+ Substring_genomicstart(Stage3end_substring_acceptor(hit)),Substring_genomicend(Stage3end_substring_acceptor(hit)),
+ Stage3end_nmismatches_whole(hit),Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
best_nmismatches = nmismatches;
- best_prob = Stage3end_chimera_prob(hit);
- } else if (nmismatches == best_nmismatches && (prob = Stage3end_chimera_prob(hit)) > best_prob) {
+ }
+ if ((prob = Stage3end_chimera_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("accepting distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
-
- debug7(printf("Have %d good spliceends\n",n_good_spliceends));
- if (n_good_spliceends == 1) {
+
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_chimera_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("pushing antisense distance %d, nmismatches %d, probabilities %f and %f\n",
- Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ Stage3end_chimera_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP) {
+ debug7(printf("accepting distance %d, donor length %d and acceptor length %d, probabilities %f and %f\n",
+ Stage3end_distance(hit),Substring_match_length_orig(Stage3end_substring_donor(hit)),
+ Substring_match_length_orig(Stage3end_substring_acceptor(hit)),
Substring_chimera_prob(Stage3end_substring_donor(hit)),
Substring_chimera_prob(Stage3end_substring_acceptor(hit))));
- hits = List_push(hits,(void *) hit);
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends_antisense);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends_antisense);
+
+ if (n_good_spliceends == 1) {
+ hits = List_push(hits,List_head(accepted_hits));
+ List_free(&accepted_hits);
} else {
- /* Create ambiguous, antisense */
- debug7(printf("Creating ambiguous, antisense\n"));
- hit = (Stage3end_T) List_head(spliceends_antisense);
- donor = Stage3end_substring_donor(hit);
- acceptor = Stage3end_substring_acceptor(hit);
- sensedir = Stage3end_sensedir(hit);
-
- ambcoords = NULL;
- amb_knowni = (Intlist_T) NULL;
- amb_nmismatches = (Intlist_T) NULL;
-
- if (Substring_left_genomicseg(donor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- acceptor = Stage3end_substring_acceptor(hit);
+ /* 2. Multiple hits, antisense, left1 (segmenti_left) */
+ debug7(printf("multiple splice hits, antisense, minus\n"));
+ donor_hits = acceptor_hits = (List_T) NULL;
+
+ /* minus branch from collect_elt_matches */
+ for (p = accepted_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ donor = Stage3end_substring_donor(hit);
+ acceptor = Stage3end_substring_acceptor(hit);
+ if (Substring_genomicend(donor) == segmenti_left) {
+ donor_hits = List_push(donor_hits,(void *) hit);
+ } else if (Substring_genomicend(acceptor) == segmenti_left) {
+ acceptor_hits = List_push(acceptor_hits,(void *) hit);
+ } else {
+ abort();
+ Stage3end_free(&hit);
+ }
+ }
+
+ if (donor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,donor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),donor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ donor = Stage3end_substring_donor(hit);
+ donor_length = Substring_match_length_orig(donor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_donor(hitarray[j])) == donor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
+#ifdef LARGE_GENOMES
+ ambcoords = (Uint8list_T) NULL;
+#else
+ ambcoords = (Uintlist_T) NULL;
+#endif
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
+
+ for (k = i; k < j; k++) {
+ acceptor = Stage3end_substring_acceptor(hitarray[k]);
+#ifdef LARGE_GENOMES
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
+#else
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
+#endif
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(acceptor));
+ }
+
+ nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
+ prob = best_prob - Substring_chimera_prob(donor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
+ donor,/*acceptor*/NULL,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(acceptor),/*amb_prob*/prob,
+ /*ambcoords_donor*/NULL,ambcoords,
+ /*amb_knowni_donor*/NULL,amb_knowni,
+ /*amb_nmismatches_donort*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(acceptor));
+ Uint8list_free(&ambcoords);
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(acceptor));
+ Uintlist_free(&ambcoords);
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
}
+ FREE(hitarray);
+ List_free(&donor_hits);
+ }
- nmismatches_acceptor = best_nmismatches - Substring_nmismatches_whole(donor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- /*nmismatches_donor*/Substring_nmismatches_whole(donor),nmismatches_acceptor,
- donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(acceptor),
- /*ambcoords_donor*/NULL,ambcoords,
- /*amb_knowni_donor*/NULL,amb_knowni,
- /*amb_nmismatches_donor*/NULL,amb_nmismatches,
- /*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
- sensedir,/*sarrayp*/false));
+ if (acceptor_hits != NULL) {
+ hitarray = (Stage3end_T *) List_to_array_n(&n,acceptor_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),acceptor_match_length_cmp);
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ acceptor = Stage3end_substring_acceptor(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substring_acceptor(hitarray[j])) == acceptor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ hits = List_push(hits,(void *) hit);
+ } else {
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ ambcoords = (Uint8list_T) NULL;
#else
- Uintlist_free(&ambcoords);
+ ambcoords = (Uintlist_T) NULL;
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
+ amb_knowni = (Intlist_T) NULL;
+ amb_nmismatches = (Intlist_T) NULL;
+ amb_probs = (Doublelist_T) NULL;
- } else if (Substring_left_genomicseg(acceptor) == /*segmenti_left*/ segmenti->diagonal - querylength) {
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- donor = Stage3end_substring_donor(hit);
+ for (k = i; k < j; k++) {
+ donor = Stage3end_substring_donor(hitarray[k]);
#ifdef LARGE_GENOMES
- ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
+ ambcoords = Uint8list_push(ambcoords,Substring_splicecoord(donor));
#else
- ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
+ ambcoords = Uintlist_push(ambcoords,Substring_splicecoord(donor));
#endif
- amb_knowni = Intlist_push(amb_knowni,-1);
- amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
- }
-
- nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
- *ambiguous = List_push(*ambiguous,
- (void *) Stage3end_new_splice(&(*found_score),
- nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
- /*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,
- /*amb_length*/Substring_match_length_orig(donor),
- ambcoords,/*ambcoords_acceptor*/NULL,
- amb_knowni,/*amb_knowni_acceptor*/NULL,
- amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
- sensedir,/*sarrayp*/false));
+ amb_knowni = Intlist_push(amb_knowni,-1);
+ amb_nmismatches = Intlist_push(amb_nmismatches,Substring_nmismatches_whole(donor));
+ amb_probs = Doublelist_push(amb_probs,Substring_chimera_prob(donor));
+ }
+
+ nmismatches_donor = best_nmismatches - Substring_nmismatches_whole(acceptor);
+ prob = best_prob - Substring_chimera_prob(acceptor);
+ *ambiguous = List_push(*ambiguous,
+ (void *) Stage3end_new_splice(&(*found_score),
+ nmismatches_donor,/*nmismatches_acceptor*/Substring_nmismatches_whole(acceptor),
+ /*donor*/NULL,acceptor,/*distance*/0U,
+ /*shortdistancep*/false,/*penalty*/0,querylength,
+ /*amb_length*/Substring_match_length_orig(donor),/*amb_prob*/prob,
+ ambcoords,/*ambcoords_acceptor*/NULL,
+ amb_knowni,/*amb_knowni_acceptor*/NULL,
+ amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
+ Stage3end_sensedir(hit),/*sarrayp*/false));
+ Doublelist_free(&amb_probs);
+ Intlist_free(&amb_nmismatches);
+ Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
- Uint8list_free(&ambcoords);
+ Uint8list_free(&ambcoords);
#else
- Uintlist_free(&ambcoords);
+ Uintlist_free(&ambcoords);
#endif
- Intlist_free(&amb_knowni);
- Intlist_free(&amb_nmismatches);
- } else {
- fprintf(stderr,"Unexpected: Neither donor left %u nor acceptor left %u equals segmenti_left %u\n",
- Substring_left_genomicseg(donor),Substring_left_genomicseg(acceptor),segmenti->diagonal - querylength);
- abort();
- }
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
- for (p = spliceends_antisense; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- Stage3end_free(&hit);
+ i = j;
+ }
+ FREE(hitarray);
+ List_free(&acceptor_hits);
}
- List_free(&spliceends_antisense);
+
+ List_free(&accepted_hits);
}
}
@@ -7667,24 +8091,28 @@ find_singlesplices_minus (int *found_score, List_T hits, List_T *ambiguous, List
#ifdef LARGE_GENOMES
static Uint8list_T
-lookup_splicesites (Intlist_T splicesites_i, Univcoord_T *splicesites) {
+lookup_splicesites (Doublelist_T *probs_list, Intlist_T splicesites_i, Univcoord_T *splicesites) {
Uint8list_T coords = NULL;
Intlist_T p;
+ *probs_list = (Doublelist_T) NULL;
for (p = splicesites_i; p != NULL; p = Intlist_next(p)) {
coords = Uint8list_push(coords,splicesites[Intlist_head(p)]);
+ *probs_list = Doublelist_push(*probs_list,2.0);
}
return Uint8list_reverse(coords);
}
#else
static Uintlist_T
-lookup_splicesites (Intlist_T splicesites_i, Univcoord_T *splicesites) {
+lookup_splicesites (Doublelist_T *probs_list, Intlist_T splicesites_i, Univcoord_T *splicesites) {
Uintlist_T coords = NULL;
Intlist_T p;
+ *probs_list = (Doublelist_T) NULL;
for (p = splicesites_i; p != NULL; p = Intlist_next(p)) {
coords = Uintlist_push(coords,splicesites[Intlist_head(p)]);
+ *probs_list = Doublelist_push(*probs_list,2.0);
}
return Uintlist_reverse(coords);
@@ -7692,6 +8120,58 @@ lookup_splicesites (Intlist_T splicesites_i, Univcoord_T *splicesites) {
#endif
+static int /* qsort comparator: ascending by substringD match length, ties broken by substringA match length */
+substringD_match_length_cmp (const void *a, const void *b) {
+ Stage3end_T x = * (Stage3end_T *) a; /* a and b each point at one Stage3end_T element of the array being sorted */
+ Stage3end_T y = * (Stage3end_T *) b;
+
+ int x_length = Substring_match_length_orig(Stage3end_substringD(x)); /* primary key: match length of substringD */
+ int y_length = Substring_match_length_orig(Stage3end_substringD(y));
+
+ if (x_length < y_length) {
+ return -1; /* shorter substringD sorts first */
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ x_length = Substring_match_length_orig(Stage3end_substringA(x)); /* substringD lengths tie: reuse the locals for the secondary key, substringA match length */
+ y_length = Substring_match_length_orig(Stage3end_substringA(y));
+ if (x_length < y_length) {
+ return -1; /* shorter substringA sorts first among equal substringD lengths */
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ return 0; /* both lengths equal: hits end up adjacent, so a caller can scan runs of equal (D, A) lengths */
+ }
+ }
+}
+
+static int /* qsort comparator: ascending by substringA match length, ties broken by substringD match length */
+substringA_match_length_cmp (const void *a, const void *b) {
+ Stage3end_T x = * (Stage3end_T *) a; /* a and b each point at one Stage3end_T element of the array being sorted */
+ Stage3end_T y = * (Stage3end_T *) b;
+
+ int x_length = Substring_match_length_orig(Stage3end_substringA(x)); /* primary key: match length of substringA */
+ int y_length = Substring_match_length_orig(Stage3end_substringA(y));
+
+ if (x_length < y_length) {
+ return -1; /* shorter substringA sorts first */
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ x_length = Substring_match_length_orig(Stage3end_substringD(x)); /* substringA lengths tie: reuse the locals for the secondary key, substringD match length */
+ y_length = Substring_match_length_orig(Stage3end_substringD(y));
+ if (x_length < y_length) {
+ return -1; /* shorter substringD sorts first among equal substringA lengths */
+ } else if (y_length < x_length) {
+ return +1;
+ } else {
+ return 0; /* both lengths equal: the two hits compare as equivalent */
+ }
+ }
+}
+
+
+
static List_T
find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
Segment_T *spliceable, int nspliceable, struct Segment_T *segments,
@@ -7758,13 +8238,16 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
Substring_T donor, acceptor, shortexon;
int nhits_local /*= 0*/, npotential_left, npotential_right;
+ int donor_length, acceptor_length;
+ List_T accepted_hits, rejected_hits, single_ambig_hits;
List_T spliceends, p;
- Stage3end_T hit, *array;
+ Stage3end_T hit, *hitarray;
int best_nmismatches, nmismatches;
int n_good_spliceends, n, i;
double best_prob, prob;
Univcoord_T lastpos;
Intlist_T donor_amb_knowni, acceptor_amb_knowni, donor_amb_nmismatches, acceptor_amb_nmismatches;
+ Doublelist_T donor_amb_probs, acceptor_amb_probs, probs_donor, probs_acceptor;
debug(printf("*** Starting find_known_doublesplices on %d segments ***\n",nspliceable));
@@ -8008,6 +8491,7 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
/* Process results for segmentm. */
if (spliceends != NULL) {
best_nmismatches = querylength;
+ best_prob = 0.0;
for (p = spliceends; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
debug7(printf("analyzing distance %d, nmismatches %d, probability %f\n",
@@ -8015,71 +8499,90 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
Stage3end_shortexon_prob(hit)));
if ((nmismatches = Stage3end_nmismatches_whole(hit)) < best_nmismatches) {
best_nmismatches = nmismatches;
- best_prob = Stage3end_shortexon_prob(hit);
- } else if (nmismatches == best_nmismatches && (prob = Stage3end_shortexon_prob(hit)) > best_prob) {
+ }
+ if ((prob = Stage3end_shortexon_prob(hit)) > best_prob) {
best_prob = prob;
}
}
n_good_spliceends = 0;
+ accepted_hits = rejected_hits = (List_T) NULL;
for (p = spliceends; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_shortexon_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP &&
+ (Stage3end_shortexon_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP)) {
debug7(printf("accepting distance %d, nmismatches %d, probability %f\n",
Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
Stage3end_shortexon_prob(hit)));
n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
+ } else {
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- if (n_good_spliceends == 1) {
+ if (n_good_spliceends == 0) {
+ /* Conjunction is too strict. Allow for disjunction instead. */
+ List_free(&rejected_hits);
for (p = spliceends; p != NULL; p = List_next(p)) {
hit = (Stage3end_T) List_head(p);
- if (Stage3end_nmismatches_whole(hit) == best_nmismatches &&
- (Stage3end_shortexon_prob(hit) > best_prob - LOCALSPLICING_SLOP)) {
- debug7(printf("pushing distance %d, nmismatches %d, probability %f\n",
+ if (Stage3end_nmismatches_whole(hit) <= best_nmismatches + LOCALSPLICING_NMATCHES_SLOP ||
+ (Stage3end_shortexon_prob(hit) >= best_prob - LOCALSPLICING_PROB_SLOP)) {
+ debug7(printf("accepting distance %d, nmismatches %d, probability %f\n",
Stage3end_distance(hit),Stage3end_nmismatches_whole(hit),
Stage3end_shortexon_prob(hit)));
- hits = List_push(hits,(void *) hit);
+ n_good_spliceends += 1;
+ accepted_hits = List_push(accepted_hits,(void *) hit);
} else {
- Stage3end_free(&hit);
+ rejected_hits = List_push(rejected_hits,(void *) hit);
}
}
- List_free(&spliceends);
+ }
+
+ for (p = rejected_hits; p != NULL; p = List_next(p)) {
+ hit = (Stage3end_T) List_head(p);
+ Stage3end_free(&hit);
+ }
+ List_free(&rejected_hits);
+ List_free(&spliceends);
+
+ if (n_good_spliceends == 1) {
+ hits = List_push(hits,List_head(accepted_hits));
+ List_free(&accepted_hits);
} else {
- /* Create ambiguous */
- hit = (Stage3end_T) List_head(spliceends);
- donor = Stage3end_substringD(hit);
- acceptor = Stage3end_substringA(hit);
- shortexon = Stage3end_substring1(hit);
- sensedir = Stage3end_sensedir(hit);
-
- donor_ambcoords = acceptor_ambcoords = NULL;
- donor_amb_knowni = acceptor_amb_knowni = (Intlist_T) NULL;
- donor_amb_nmismatches = acceptor_amb_nmismatches = (Intlist_T) NULL;
-
- n = List_length(spliceends);
- array = (Stage3end_T *) MALLOCA(n * sizeof(Stage3end_T));
- List_fill_array((void **) array,spliceends);
- qsort(array,n,sizeof(Stage3end_T),Stage3end_shortexon_substringD_cmp);
- donor = Stage3end_substringD(array[0]);
- lastpos = Substring_left_genomicseg(donor);
+ /* 5. Multiple hits, shortexon */
+ debug7(printf("multiple splice hits, shortexon\n"));
-#ifdef LARGE_GENOMES
- donor_ambcoords = Uint8list_push(donor_ambcoords,Substring_splicecoord(donor));
-#else
- donor_ambcoords = Uintlist_push(donor_ambcoords,Substring_splicecoord(donor));
-#endif
- donor_amb_knowni = Intlist_push(donor_amb_knowni,-1);
- donor_amb_nmismatches = Intlist_push(donor_amb_nmismatches,Substring_nmismatches_whole(donor));
-
- for (i = 1; i < n; i++) {
- donor = Stage3end_substringD(array[i]);
- if (Substring_left_genomicseg(donor) == lastpos) {
- /* Skip */
+ /* Process multiple double ambiguous first */
+ hitarray = (Stage3end_T *) List_to_array_n(&n,accepted_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),substringD_match_length_cmp);
+ List_free(&accepted_hits);
+ single_ambig_hits = (List_T) NULL;
+
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ donor = Stage3end_substringD(hit);
+ donor_length = Substring_match_length_orig(donor);
+ acceptor = Stage3end_substringA(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substringD(hitarray[j])) == donor_length &&
+ Substring_match_length_orig(Stage3end_substringA(hitarray[j])) == acceptor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ /* Save for later analysis */
+ single_ambig_hits = List_push(single_ambig_hits,(void *) hit);
} else {
+ donor_ambcoords = acceptor_ambcoords = NULL;
+ donor_amb_knowni = acceptor_amb_knowni = (Intlist_T) NULL;
+ donor_amb_nmismatches = acceptor_amb_nmismatches = (Intlist_T) NULL;
+ donor_amb_probs = acceptor_amb_probs = (Doublelist_T) NULL;
+
+ qsort(&(hitarray[i]),j-i,sizeof(Stage3end_T),Stage3end_shortexon_substringD_cmp);
+ donor = Stage3end_substringD(hitarray[i]);
#ifdef LARGE_GENOMES
donor_ambcoords = Uint8list_push(donor_ambcoords,Substring_splicecoord(donor));
#else
@@ -8087,27 +8590,25 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
#endif
donor_amb_knowni = Intlist_push(donor_amb_knowni,-1);
donor_amb_nmismatches = Intlist_push(donor_amb_nmismatches,Substring_nmismatches_whole(donor));
- lastpos = Substring_left_genomicseg(donor);
- }
- }
-
- qsort(array,n,sizeof(Stage3end_T),Stage3end_shortexon_substringA_cmp);
- acceptor = Stage3end_substringA(array[0]);
- lastpos = Substring_left_genomicseg(acceptor);
+ donor_amb_probs = Doublelist_push(donor_amb_probs,Substring_chimera_prob(donor));
+ lastpos = Substring_left_genomicseg(donor);
+ for (k = i + 1; k < j; k++) {
+ donor = Stage3end_substringD(hitarray[k]);
+ if (Substring_left_genomicseg(donor) != lastpos) {
#ifdef LARGE_GENOMES
- acceptor_ambcoords = Uint8list_push(acceptor_ambcoords,Substring_splicecoord(acceptor));
+ donor_ambcoords = Uint8list_push(donor_ambcoords,Substring_splicecoord(donor));
#else
- acceptor_ambcoords = Uintlist_push(acceptor_ambcoords,Substring_splicecoord(acceptor));
+ donor_ambcoords = Uintlist_push(donor_ambcoords,Substring_splicecoord(donor));
#endif
- acceptor_amb_knowni = Intlist_push(acceptor_amb_knowni,-1);
- acceptor_amb_nmismatches = Intlist_push(acceptor_amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ donor_amb_knowni = Intlist_push(donor_amb_knowni,-1);
+ donor_amb_nmismatches = Intlist_push(donor_amb_nmismatches,Substring_nmismatches_whole(donor));
+ donor_amb_probs = Doublelist_push(donor_amb_probs,Substring_chimera_prob(donor));
+ }
+ }
- for (i = 1; i < n; i++) {
- acceptor = Stage3end_substringA(array[i]);
- if (Substring_left_genomicseg(acceptor) == lastpos) {
- /* Skip */
- } else {
+ qsort(&(hitarray[i]),j-i,sizeof(Stage3end_T),Stage3end_shortexon_substringA_cmp);
+ acceptor = Stage3end_substringA(hitarray[i]);
#ifdef LARGE_GENOMES
acceptor_ambcoords = Uint8list_push(acceptor_ambcoords,Substring_splicecoord(acceptor));
#else
@@ -8115,66 +8616,223 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
#endif
acceptor_amb_knowni = Intlist_push(acceptor_amb_knowni,-1);
acceptor_amb_nmismatches = Intlist_push(acceptor_amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ acceptor_amb_probs = Doublelist_push(acceptor_amb_probs,Substring_chimera_prob(acceptor));
+
lastpos = Substring_left_genomicseg(acceptor);
+ for (k = i + 1; k < j; k++) {
+ acceptor = Stage3end_substringA(hitarray[k]);
+ if (Substring_left_genomicseg(acceptor) != lastpos) {
+#ifdef LARGE_GENOMES
+ acceptor_ambcoords = Uint8list_push(acceptor_ambcoords,Substring_splicecoord(acceptor));
+#else
+ acceptor_ambcoords = Uintlist_push(acceptor_ambcoords,Substring_splicecoord(acceptor));
+#endif
+ acceptor_amb_knowni = Intlist_push(acceptor_amb_knowni,-1);
+ acceptor_amb_nmismatches = Intlist_push(acceptor_amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ acceptor_amb_probs = Doublelist_push(acceptor_amb_probs,Substring_chimera_prob(acceptor));
+ }
+ }
+
+ shortexon = Stage3end_substring1(hitarray[i]);
+ sensedir = Stage3end_sensedir(hitarray[i]);
+ if (Intlist_length(donor_amb_nmismatches) > 1 && Intlist_length(acceptor_amb_nmismatches) > 1) {
+ hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,/*acceptor*/NULL,shortexon,
+ /*amb_length_donor*/donor_length,/*amb_length_acceptor*/acceptor_length,
+ /*amb_prob_donor*/Doublelist_max(donor_amb_probs),/*amb_prob_acceptor*/Doublelist_max(acceptor_amb_probs),
+ donor_ambcoords,acceptor_ambcoords,
+ donor_amb_knowni,acceptor_amb_knowni,
+ donor_amb_nmismatches,acceptor_amb_nmismatches,
+ donor_amb_probs,acceptor_amb_probs,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
+ splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+
+ } else if (Intlist_length(donor_amb_nmismatches) > 1) {
+ hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,acceptor,shortexon,
+ /*amb_length_donor*/donor_length,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/Doublelist_max(donor_amb_probs),/*amb_length_acceptor*/0.0,
+ donor_ambcoords,/*acceptor_ambcoords*/NULL,
+ donor_amb_knowni,/*amb_knowni_acceptor*/NULL,
+ donor_amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ donor_amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
+ splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+
+ } else if (Intlist_length(acceptor_amb_nmismatches) > 1) {
+ hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,/*acceptor*/NULL,shortexon,
+ /*amb_length_donor*/0,/*amb_length_acceptor*/acceptor_length,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/Doublelist_max(acceptor_amb_probs),
+ /*ambcoords_donor*/NULL,acceptor_ambcoords,
+ /*amb_knowni_donor*/NULL,acceptor_amb_knowni,
+ /*amb_nmismatches_donor*/NULL,acceptor_amb_nmismatches,
+ /*amb_probs_donor*/NULL,acceptor_amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
+ splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+
+ } else {
+ /* A singleton, apparently due to many duplicates. Is this possible? */
+ hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
+ /*amb_length_donor*/0,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/0.0,
+ /*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
+ /*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
+ /*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
+ splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+
+ }
+
+ Doublelist_free(&donor_amb_probs);
+ Intlist_free(&donor_amb_nmismatches);
+ Intlist_free(&donor_amb_knowni);
+ Doublelist_free(&acceptor_amb_probs);
+ Intlist_free(&acceptor_amb_nmismatches);
+ Intlist_free(&acceptor_amb_knowni);
+#ifdef LARGE_GENOMES
+ Uint8list_free(&donor_ambcoords);
+ Uint8list_free(&acceptor_ambcoords);
+#else
+ Uintlist_free(&donor_ambcoords);
+ Uintlist_free(&acceptor_ambcoords);
+#endif
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
}
+
+ i = j;
}
-
- FREEA(array);
-
- if (Intlist_length(donor_amb_nmismatches) == 1 && Intlist_length(acceptor_amb_nmismatches) == 1) {
- hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
- /*amb_length_donor*/0,/*amb_length_acceptor*/0,
- /*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
- /*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
- /*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/true,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
- splicing_penalty,querylength,sensedir,/*sarrayp*/false));
-
- } else if (Intlist_length(donor_amb_nmismatches) > 1 && Intlist_length(acceptor_amb_nmismatches) == 1) {
- hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,acceptor,shortexon,
- /*amb_length_donor*/Substring_match_length_orig(donor),/*amb_length_acceptor*/0,
- donor_ambcoords,/*acceptor_ambcoords*/NULL,
- donor_amb_knowni,/*amb_knowni_acceptor*/NULL,
- donor_amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
- /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
- splicing_penalty,querylength,sensedir,/*sarrayp*/false));
-
- } else if (Intlist_length(donor_amb_nmismatches) == 1 && Intlist_length(acceptor_amb_nmismatches) > 1) {
- hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,/*acceptor*/NULL,shortexon,
- /*amb_length_donor*/0,/*amb_length_acceptor*/Substring_match_length_orig(acceptor),
- /*ambcoords_donor*/NULL,acceptor_ambcoords,
- /*amb_knowni_donor*/NULL,acceptor_amb_knowni,
- /*amb_nmismatches_donor*/NULL,acceptor_amb_nmismatches,
- /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
- splicing_penalty,querylength,sensedir,/*sarrayp*/false));
- } else {
- hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,/*acceptor*/NULL,shortexon,
- /*amb_length_donor*/Substring_match_length_orig(donor),
- /*amb_length_acceptor*/Substring_match_length_orig(acceptor),
- donor_ambcoords,acceptor_ambcoords,
- donor_amb_knowni,acceptor_amb_knowni,
- donor_amb_nmismatches,acceptor_amb_nmismatches,
- /*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
- splicing_penalty,querylength,sensedir,/*sarrayp*/false));
- }
+ FREE(hitarray);
- for (p = spliceends; p != NULL; p = List_next(p)) {
- hit = (Stage3end_T) List_head(p);
- Stage3end_free(&hit);
+ /* Process single ambiguous on donor side */
+ hitarray = (Stage3end_T *) List_to_array_n(&n,single_ambig_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),substringD_match_length_cmp);
+ List_free(&single_ambig_hits);
+ single_ambig_hits = (List_T) NULL;
+
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ donor = Stage3end_substringD(hit);
+ donor_length = Substring_match_length_orig(donor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substringD(hitarray[j])) == donor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ /* Save for later analysis */
+ single_ambig_hits = List_push(single_ambig_hits,(void *) hit);
+ } else {
+ acceptor_ambcoords = NULL;
+ acceptor_amb_knowni = (Intlist_T) NULL;
+ acceptor_amb_nmismatches = (Intlist_T) NULL;
+ acceptor_amb_probs = (Doublelist_T) NULL;
+
+ for (k = i + 1; k < j; k++) {
+ acceptor = Stage3end_substringA(hitarray[i]);
+#ifdef LARGE_GENOMES
+ acceptor_ambcoords = Uint8list_push(acceptor_ambcoords,Substring_splicecoord(acceptor));
+#else
+ acceptor_ambcoords = Uintlist_push(acceptor_ambcoords,Substring_splicecoord(acceptor));
+#endif
+ acceptor_amb_knowni = Intlist_push(acceptor_amb_knowni,-1);
+ acceptor_amb_nmismatches = Intlist_push(acceptor_amb_nmismatches,Substring_nmismatches_whole(acceptor));
+ acceptor_amb_probs = Doublelist_push(acceptor_amb_probs,Substring_chimera_prob(acceptor));
+ }
+
+ shortexon = Stage3end_substring1(hitarray[i]);
+ sensedir = Stage3end_sensedir(hitarray[i]);
+ hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,/*acceptor*/NULL,shortexon,
+ /*amb_length_donor*/0,/*amb_length_acceptor*/Substring_match_length_orig(acceptor),
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/Doublelist_max(acceptor_amb_probs),
+ /*ambcoords_donor*/NULL,acceptor_ambcoords,
+ /*amb_knowni_donor*/NULL,acceptor_amb_knowni,
+ /*amb_nmismatches_donor*/NULL,acceptor_amb_nmismatches,
+ /*amb_probs_donor*/NULL,acceptor_amb_probs,
+ /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
+ splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&acceptor_amb_probs);
+ Intlist_free(&acceptor_amb_nmismatches);
+ Intlist_free(&acceptor_amb_knowni);
+#ifdef LARGE_GENOMES
+ Uint8list_free(&acceptor_ambcoords);
+#else
+ Uintlist_free(&acceptor_ambcoords);
+#endif
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
}
- List_free(&spliceends);
+ FREE(hitarray);
+ /* Process single ambiguous on acceptor side */
+ hitarray = (Stage3end_T *) List_to_array_n(&n,single_ambig_hits);
+ qsort(hitarray,n,sizeof(Stage3end_T),substringA_match_length_cmp);
+ List_free(&single_ambig_hits);
+
+ i = 0;
+ while (i < n) {
+ hit = hitarray[i];
+ acceptor = Stage3end_substringA(hit);
+ acceptor_length = Substring_match_length_orig(acceptor);
+ j = i + 1;
+ while (j < n && Substring_match_length_orig(Stage3end_substringA(hitarray[j])) == acceptor_length) {
+ j++;
+ }
+ if (j == i + 1) {
+ /* Finally, a confirmed unique */
+ hits = List_push(hits,(void *) hit);
+ } else {
+ donor_ambcoords = NULL;
+ donor_amb_knowni = (Intlist_T) NULL;
+ donor_amb_nmismatches = (Intlist_T) NULL;
+ donor_amb_probs = (Doublelist_T) NULL;
+
+ for (k = i + 1; k < j; k++) {
+ donor = Stage3end_substringD(hitarray[i]);
#ifdef LARGE_GENOMES
- Uint8list_free(&donor_ambcoords);
- Uint8list_free(&acceptor_ambcoords);
+ donor_ambcoords = Uint8list_push(donor_ambcoords,Substring_splicecoord(donor));
#else
- Uintlist_free(&donor_ambcoords);
- Uintlist_free(&acceptor_ambcoords);
+ donor_ambcoords = Uintlist_push(donor_ambcoords,Substring_splicecoord(donor));
#endif
- Intlist_free(&donor_amb_knowni);
- Intlist_free(&acceptor_amb_knowni);
- Intlist_free(&donor_amb_nmismatches);
- Intlist_free(&acceptor_amb_nmismatches);
+ donor_amb_knowni = Intlist_push(donor_amb_knowni,-1);
+ donor_amb_nmismatches = Intlist_push(donor_amb_nmismatches,Substring_nmismatches_whole(donor));
+ donor_amb_probs = Doublelist_push(donor_amb_probs,Substring_chimera_prob(donor));
+ }
+
+ shortexon = Stage3end_substring1(hitarray[i]);
+ sensedir = Stage3end_sensedir(hitarray[i]);
+ hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,acceptor,shortexon,
+ /*amb_length_donor*/Substring_match_length_orig(donor),/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/Doublelist_max(donor_amb_probs),/*amb_prob_acceptor*/0.0,
+ donor_ambcoords,/*acceptor_ambcoords*/NULL,
+ donor_amb_knowni,/*amb_knowni_acceptor*/NULL,
+ donor_amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ donor_amb_probs,/*amb_probs_acceptor*/NULL,
+ /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
+ splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&donor_amb_probs);
+ Intlist_free(&donor_amb_nmismatches);
+ Intlist_free(&donor_amb_knowni);
+#ifdef LARGE_GENOMES
+ Uint8list_free(&donor_ambcoords);
+#else
+ Uintlist_free(&donor_ambcoords);
+#endif
+ for (k = i; k < j; k++) {
+ hit = hitarray[k];
+ Stage3end_free(&hit);
+ }
+ }
+
+ i = j;
+ }
+ FREE(hitarray);
}
}
}
@@ -8230,18 +8888,22 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
segmentm->chrnum,segmentm->chroffset,segmentm->chrhigh,segmentm->chrlength);
if (shortexon != NULL) {
debug4k(printf("New one-third shortexon at left %llu\n",(unsigned long long) segmentm_left));
- ambcoords_donor = lookup_splicesites(splicesites_i_left,splicesites);
- ambcoords_acceptor = lookup_splicesites(splicesites_i_right,splicesites);
+ ambcoords_donor = lookup_splicesites(&probs_donor,splicesites_i_left,splicesites);
+ ambcoords_acceptor = lookup_splicesites(&probs_acceptor,splicesites_i_right,splicesites);
amb_length_donor = leftpos /*- nmismatches_shortexon_left*/;
amb_length_acceptor = querylength - rightpos /*- nmismatches_shortexon_right*/;
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,/*acceptor*/NULL,shortexon,
amb_length_donor,amb_length_acceptor,
+ /*amb_prob_donor*/2.0,/*amb_prob_acceptor*/2.0,
ambcoords_donor,ambcoords_acceptor,
/*amb_knowni_donor*/splicesites_i_left,/*amb_knowni_acceptor*/splicesites_i_right,
/*amb_nmismatches_donor*/nmismatches_list_left,/*amb_nmismatches_acceptor*/nmismatches_list_right,
+ /*amb_probs_donor*/probs_donor,/*amb_probs_acceptor*/probs_acceptor,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&probs_donor);
+ Doublelist_free(&probs_acceptor);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords_donor);
Uint8list_free(&ambcoords_acceptor);
@@ -8280,16 +8942,19 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
} else {
debug4k(printf("ambp_left true, ambp_right false: New two-thirds shortexon at left %llu\n",
(unsigned long long) segmentm_left));
- ambcoords_donor = lookup_splicesites(splicesites_i_left,splicesites);
+ ambcoords_donor = lookup_splicesites(&probs_donor,splicesites_i_left,splicesites);
amb_length_donor = leftpos /*- nmismatches_shortexon_left*/;
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,acceptor,shortexon,
amb_length_donor,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/2.0,/*amb_prob_acceptor*/0,
ambcoords_donor,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/splicesites_i_left,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/nmismatches_list_left,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/probs_donor,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&probs_donor);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords_donor);
#else
@@ -8324,16 +8989,19 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
if (donor != NULL) Substring_free(&donor);
if (shortexon != NULL) Substring_free(&shortexon);
} else {
- ambcoords_acceptor = lookup_splicesites(splicesites_i_right,splicesites);
+ ambcoords_acceptor = lookup_splicesites(&probs_acceptor,splicesites_i_right,splicesites);
amb_length_acceptor = querylength - rightpos /*- nmismatches_shortexon_right*/;
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,/*acceptor*/NULL,shortexon,
/*amb_length_donor*/0,amb_length_acceptor,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/2.0,
/*ambcoords_donor*/NULL,ambcoords_acceptor,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/splicesites_i_right,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/nmismatches_list_right,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/probs_acceptor,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&probs_acceptor);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords_acceptor);
#else
@@ -8376,9 +9044,11 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
/*amb_length_donor*/0,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
}
@@ -8446,18 +9116,22 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
segmentm->chrnum,segmentm->chroffset,segmentm->chrhigh,segmentm->chrlength);
if (shortexon != NULL) {
debug4k(printf("New one-third shortexon at left %llu\n",(unsigned long long) segmentm_left));
- ambcoords_donor = lookup_splicesites(splicesites_i_right,splicesites);
- ambcoords_acceptor = lookup_splicesites(splicesites_i_left,splicesites);
+ ambcoords_donor = lookup_splicesites(&probs_donor,splicesites_i_right,splicesites);
+ ambcoords_acceptor = lookup_splicesites(&probs_acceptor,splicesites_i_left,splicesites);
amb_length_donor = querylength - rightpos /*- nmismatches_shortexon_right*/;
amb_length_acceptor = leftpos /*- nmismatches_shortexon_left*/;
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,/*acceptor*/NULL,shortexon,
amb_length_donor,amb_length_acceptor,
+ /*amb_prob_donor*/2.0,/*amb_prob_acceptor*/2.0,
ambcoords_donor,ambcoords_acceptor,
/*amb_knowni_donor*/splicesites_i_right,/*amb_knowni_acceptor*/splicesites_i_left,
/*amb_nmismatches_donor*/nmismatches_list_right,/*amb_nmismatches_acceptor*/nmismatches_list_left,
+ /*amb_probs_donor*/probs_donor,/*amb_probs_acceptor*/probs_acceptor,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&probs_donor);
+ Doublelist_free(&probs_acceptor);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords_donor);
Uint8list_free(&ambcoords_acceptor);
@@ -8493,16 +9167,19 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
if (donor != NULL) Substring_free(&donor);
if (shortexon != NULL) Substring_free(&shortexon);
} else {
- ambcoords_acceptor = lookup_splicesites(splicesites_i_left,splicesites);
+ ambcoords_acceptor = lookup_splicesites(&probs_acceptor,splicesites_i_left,splicesites);
amb_length_acceptor = leftpos /*- nmismatches_shortexon_left*/;
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,/*acceptor*/NULL,shortexon,
/*amb_length_donor*/0,amb_length_acceptor,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/2.0,
/*ambcoords_donor*/NULL,ambcoords_acceptor,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/splicesites_i_left,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/nmismatches_list_left,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/probs_acceptor,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&probs_acceptor);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords_acceptor);
#else
@@ -8538,16 +9215,19 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
} else {
debug4k(printf("ambp_left false, ambp_right true: New splice at left %llu\n",
(unsigned long long) segmentm_left));
- ambcoords_donor = lookup_splicesites(splicesites_i_right,splicesites);
+ ambcoords_donor = lookup_splicesites(&probs_donor,splicesites_i_right,splicesites);
amb_length_donor = querylength - rightpos /*- nmismatches_shortexon_right*/;
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),/*donor*/NULL,acceptor,shortexon,
amb_length_donor,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/2.0,/*amb_prob_acceptor*/0.0,
ambcoords_donor,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/splicesites_i_right,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/nmismatches_list_right,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/probs_donor,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
+ Doublelist_free(&probs_donor);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords_donor);
#else
@@ -8588,9 +9268,11 @@ find_doublesplices (int *found_score, List_T hits, List_T *lowprob,
segmentm->usedp = true;
hits = List_push(hits,(void *) Stage3end_new_shortexon(&(*found_score),donor,acceptor,shortexon,
/*amb_length_donor*/0,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/false,/*copy_shortexon_p*/false,
splicing_penalty,querylength,sensedir,/*sarrayp*/false));
}
@@ -10125,20 +10807,22 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
*localsplicing = List_push(*localsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
} else if (*ndistantsplicepairs <= MAXCHIMERAPATHS) {
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10214,20 +10898,22 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
*localsplicing = List_push(*localsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
} else if (*ndistantsplicepairs <= MAXCHIMERAPATHS) {
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10303,20 +10989,22 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
*localsplicing = List_push(*localsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
} else if (*ndistantsplicepairs <= MAXCHIMERAPATHS) {
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10392,20 +11080,22 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
*localsplicing = List_push(*localsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
} else if (*ndistantsplicepairs <= MAXCHIMERAPATHS) {
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10471,10 +11161,11 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10532,10 +11223,11 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10594,10 +11286,11 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10655,10 +11348,11 @@ find_splicepairs_distant (int *found_score, int *ndistantsplicepairs,
distantsplicing = List_push(distantsplicing,
(void *) Stage3end_new_splice(&(*found_score),nmismatches1,nmismatches2,
donor,acceptor,distance,
- /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/false,distantsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
(*ndistantsplicepairs)++;
@@ -10700,6 +11394,7 @@ find_splicepairs_shortend (int *found_score, List_T hits,
#endif
Intlist_T splicesites_i;
Intlist_T nmismatches_list;
+ Doublelist_T probs_list;
int nmismatches, nmismatches_shortend, nmisses_allowed, support, endlength;
int amb_length;
#ifdef DEBUG4H
@@ -10766,18 +11461,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == true)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 1: short-overlap donor_plus: Successful ambiguous from donor #%d with amb_length %d\n",
Substring_splicesites_knowni(donor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
/*ambcoords_donor*/NULL,ambcoords,
/*ambi_donor*/NULL,/*ambi_acceptor*/splicesites_i,
/*amb_nmismatches_donor*/NULL,/*nmismatches_acceptor*/nmismatches_list,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/probs_list,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -10796,10 +11493,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(donor),Substring_splicesites_knowni(acceptor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,acceptor,/*distance*/bestleft-origleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
}
@@ -10839,18 +11537,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == false)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 2: short-overlap acceptor_plus: Successful ambiguous from acceptor #%d with amb_length %d\n",
Substring_splicesites_knowni(acceptor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
ambcoords,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/splicesites_i,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/nmismatches_list,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/probs_list,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -10869,10 +11569,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(acceptor),Substring_splicesites_knowni(donor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
donor,acceptor,/*distance*/origleft-bestleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
}
@@ -10912,18 +11613,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == true)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 3: short-overlap donor_minus: Successful ambiguous from donor #%d with amb_length %d\n",
Substring_splicesites_knowni(donor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/splicesites_i,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/nmismatches_list,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/probs_list,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -10942,10 +11645,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(donor),Substring_splicesites_knowni(acceptor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,acceptor,/*distance*/origleft-bestleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
}
@@ -10986,18 +11690,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == false)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 4: short-overlap acceptor_minus: Successful ambiguous from acceptor #%d with amb_length %d\n",
Substring_splicesites_knowni(acceptor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
ambcoords,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/splicesites_i,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/nmismatches_list,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/probs_list,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -11016,10 +11722,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(acceptor),Substring_splicesites_knowni(donor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
donor,acceptor,/*distance*/bestleft-origleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false));
}
@@ -11059,18 +11766,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == false)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 5: short-overlap antidonor_plus: Successful ambiguous from antidonor #%d with amb_length %d\n",
Substring_splicesites_knowni(donor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/splicesites_i,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/nmismatches_list,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/probs_list,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -11089,10 +11798,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(donor),Substring_splicesites_knowni(acceptor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,acceptor,/*distance*/origleft-bestleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
}
@@ -11133,18 +11843,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == true)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 6: short-overlap antiacceptor_plus: Successful ambiguous from antiacceptor #%d with amb_length %d\n",
Substring_splicesites_knowni(acceptor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
ambcoords,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/splicesites_i,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/nmismatches_list,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/probs_list,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -11163,10 +11875,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(acceptor),Substring_splicesites_knowni(donor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
donor,acceptor,/*distance*/bestleft-origleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
}
@@ -11207,18 +11920,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == false)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 7: short-overlap antidonor_minus: Successful ambiguous from antidonor #%d with amb_length %d\n",
Substring_splicesites_knowni(donor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/splicesites_i,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/nmismatches_list,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/probs_list,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -11237,10 +11952,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(donor),Substring_splicesites_knowni(acceptor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches,nmismatches_shortend,
donor,acceptor,/*distance*/bestleft-origleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
}
@@ -11280,18 +11996,20 @@ find_splicepairs_shortend (int *found_score, List_T hits,
/*collect_all_p*/pairedp == true && first_read_p == true)) != NULL) {
if (endlength < min_shortend || Intlist_length(splicesites_i) > 1) {
- ambcoords = lookup_splicesites(splicesites_i,splicesites);
+ ambcoords = lookup_splicesites(&probs_list,splicesites_i,splicesites);
amb_length = endlength /*- nmismatches_shortend*/;
debug4h(printf("End 8: short-overlap antiacceptor_minus: Successful ambiguous from antiacceptor #%d with amb_length %d\n",
Substring_splicesites_knowni(acceptor),amb_length));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
/*donor*/NULL,acceptor,/*distance*/0U,
- /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,
+ /*shortdistancep*/false,/*penalty*/0,querylength,amb_length,/*amb_prob*/2.0,
ambcoords,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/splicesites_i,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/nmismatches_list,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/probs_list,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
+ Doublelist_free(&probs_list);
#ifdef LARGE_GENOMES
Uint8list_free(&ambcoords);
#else
@@ -11310,10 +12028,11 @@ find_splicepairs_shortend (int *found_score, List_T hits,
Substring_splicesites_knowni(acceptor),Substring_splicesites_knowni(donor)));
hits = List_push(hits,(void *) Stage3end_new_splice(&(*found_score),nmismatches_shortend,nmismatches,
donor,acceptor,/*distance*/origleft-bestleft,
- /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,first_read_p,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false));
}
@@ -11464,7 +12183,7 @@ complete_set_mm_indels (int *found_score, bool *segments_computed_p,
char *queryuc_ptr, char *queryrc,
#endif
int querylength, int query_lastpos, Floors_T floors,
- int subopt_levels, int indel_penalty_middle, int indel_penalty_end,
+ int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int fast_level, int genestrand, bool first_read_p) {
int firstbound, lastbound;
@@ -11881,6 +12600,7 @@ run_gmap (bool *good_start_p, bool *good_end_p, History_T gmap_history,
struct Pair_T *pairarray;
Univcoord_T start, end;
double min_splice_prob;
+ int goodness;
int npairs, nsegments, nmismatches_whole, nindels, nintrons, nindelbreaks;
int cdna_direction, sensedir;
int matches, unknowns, mismatches, qopens, qindels, topens, tindels;
@@ -11970,7 +12690,7 @@ run_gmap (bool *good_start_p, bool *good_end_p, History_T gmap_history,
for (p = all_stage2results; p != NULL; p = List_next(p)) {
stage2 = (Stage2_T) List_head(p);
- if ((pairarray = Stage3_compute(&pairs,&npairs,&cdna_direction,&sensedir,
+ if ((pairarray = Stage3_compute(&pairs,&npairs,&goodness,&cdna_direction,&sensedir,
&matches,&nmatches_posttrim,&max_match_length,
&ambig_end_length_5,&ambig_end_length_3,
&ambig_splicetype_5,&ambig_splicetype_3,
@@ -12172,8 +12892,15 @@ align_single_hit_with_gmap (History_T gmap_history, Stage3end_T hit,
int starti, endi, i;
+ /* Both events are tested by Stage3end_anomalous_splice_p */
if ((chrnum = Stage3end_chrnum(hit)) == 0) {
+ /* Translocation */
return (List_T) NULL;
+
+ } else if (Stage3end_hittype(hit) == SAMECHR_SPLICE) {
+ /* A genomic event that doesn't get reflected in chrnum */
+ return (List_T) NULL;
+
} else {
chroffset = Stage3end_chroffset(hit);
chrhigh = Stage3end_chrhigh(hit);
@@ -12195,7 +12922,7 @@ align_single_hit_with_gmap (History_T gmap_history, Stage3end_T hit,
} else {
knownsplice_limit_low = mappingstart = segmentstart = origlow;
}
- debug13(printf("Original bounds: knownsplice_limit_low %u, mappingstart %u\n",
+ debug13(printf("Original bounds A: knownsplice_limit_low %u, mappingstart %u\n",
knownsplice_limit_low - chroffset,mappingstart - chroffset));
if (extend_right_p == true) {
@@ -12208,7 +12935,7 @@ align_single_hit_with_gmap (History_T gmap_history, Stage3end_T hit,
} else {
knownsplice_limit_high = mappingend = segmentend = orighigh;
}
- debug13(printf("Original bounds: knownsplice_limit_high %u, mappingend %u\n",
+ debug13(printf("Original bounds B: knownsplice_limit_high %u, mappingend %u\n",
knownsplice_limit_high - chroffset,mappingend - chroffset));
debug13(printf("plus hit %u..%u (extend_left_p %d, extend_right_p %d) (sensedir %d) => segment bounds %u..%u\n",
@@ -12405,7 +13132,7 @@ align_single_hit_with_gmap (History_T gmap_history, Stage3end_T hit,
} else {
knownsplice_limit_low = mappingstart = segmentstart = origlow;
}
- debug13(printf("Original bounds: knownsplice_limit_low %u, mappingstart %u\n",
+ debug13(printf("Original bounds C: knownsplice_limit_low %u, mappingstart %u\n",
knownsplice_limit_low - chroffset,mappingstart - chroffset));
if (extend_left_p == true) {
@@ -12418,7 +13145,7 @@ align_single_hit_with_gmap (History_T gmap_history, Stage3end_T hit,
} else {
knownsplice_limit_high = mappingend = segmentend = orighigh;
}
- debug13(printf("Original bounds: knownsplice_limit_high %u, mappingend %u\n",
+ debug13(printf("Original bounds D: knownsplice_limit_high %u, mappingend %u\n",
knownsplice_limit_high - chroffset,mappingend - chroffset));
debug13(printf("minus hit %u..%u (extend_left_p %d, extend_right_p %d) (sensedir %d), => segmentbounds %u..%u\n",
@@ -12845,8 +13572,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
- int user_maxlevel, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ int user_maxlevel, int indel_penalty_middle, int indel_penalty_end,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
bool allvalidp, bool keep_floors_p, int genestrand, bool first_read_p) {
@@ -12855,6 +13581,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
longsinglesplicing = NULL, distantsplicing = NULL, good_gmap_hits = NULL, terminals = NULL;
List_T gmap_hits, p, a;
Stage3end_T hit, gmap;
+ int nmisses_allowed_sarray;
int found_score, done_level, opt_level, fast_level, mismatch_level, nmismatches, max_mismatches_allowed;
int max_splice_mismatches, i;
int missing_hit, missing_gmap;
@@ -12916,12 +13643,13 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
nhits = 0;
+ nmisses_allowed_sarray = *cutoff_level;
+
#ifndef LARGE_GENOMES
if (use_only_sarray_p == true) {
Sarray_search_greedy(&(*cutoff_level),&subs,&indels,&ambiguous,&singlesplicing,&doublesplicing,
- queryuc_ptr,queryrc,querylength,
- query_compress_fwd,query_compress_rev,/*nmisses_allowed*/querylength,
- genestrand,first_read_p);
+ queryuc_ptr,queryrc,querylength,query_compress_fwd,query_compress_rev,
+ nmisses_allowed_sarray,genestrand,first_read_p);
singlesplicing = Splice_group_by_segmenti(&found_score,singlesplicing,&ambiguous,querylength,
first_read_p,/*sarrayp*/true);
singlesplicing = Splice_group_by_segmentj(&found_score,singlesplicing,&ambiguous,querylength,
@@ -12947,9 +13675,8 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
} else if (use_sarray_p == true) {
/* Replaces spanning set */
Sarray_search_greedy(&found_score,&subs,&indels,&ambiguous,&singlesplicing,&doublesplicing,
- queryuc_ptr,queryrc,querylength,
- query_compress_fwd,query_compress_rev,/*nmisses_allowed*/querylength,
- genestrand,first_read_p);
+ queryuc_ptr,queryrc,querylength,query_compress_fwd,query_compress_rev,
+ nmisses_allowed_sarray,genestrand,first_read_p);
singlesplicing = Splice_group_by_segmenti(&found_score,singlesplicing,&ambiguous,querylength,
first_read_p,/*sarrayp*/true);
singlesplicing = Splice_group_by_segmentj(&found_score,singlesplicing,&ambiguous,querylength,
@@ -13034,8 +13761,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
#if defined(DEBUG2) || defined(DEBUG2E)
queryuc_ptr,queryrc,
#endif
- querylength,query_lastpos,floors,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ querylength,query_lastpos,floors,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level,genestrand,first_read_p);
#else
@@ -13049,8 +13775,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
queryuc_ptr,queryrc,
#endif
querylength,query_lastpos,plus_indexdb,minus_indexdb,indexdb_size_threshold,
- floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level,/*omit_frequent_p*/false,/*omit_repetitive_p*/false,keep_floors_p,
genestrand,first_read_p);
@@ -13063,8 +13788,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
queryuc_ptr,queryrc,
#endif
querylength,query_lastpos,plus_indexdb,minus_indexdb,indexdb_size_threshold,
- floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level,/*omit_frequent_p*/true,
/*omit_repetitive_p*/(masktype == MASK_REPETITIVE || masktype == MASK_GREEDY_REPETITIVE) ? true : false,
@@ -13081,8 +13805,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
queryuc_ptr,queryrc,
#endif
querylength,query_lastpos,plus_indexdb,minus_indexdb,indexdb_size_threshold,
- floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level,/*omit_frequent_p*/false,/*omit_repetitive_p*/false,keep_floors_p,
genestrand,first_read_p);
@@ -13595,7 +14318,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
debug(printf("No GMAP improvement: Before remove_overlaps at cutoff level %d: %d\n",*cutoff_level,List_length(hits)));
hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/true,/*finalp*/true);
- hits = Stage3end_filter_terminals(hits,querylength);
+ hits = Stage3end_reject_trimlengths(hits);
hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/false,/*finalp*/true);
@@ -13606,7 +14329,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
debug(printf("GMAP improvement: Before remove_overlaps at cutoff level %d: %d\n",*cutoff_level,List_length(hits)));
hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/true,/*finalp*/false);
- hits = Stage3end_filter_terminals(hits,querylength);
+ /* Don't reject based on trimlength until after GMAP improvements */
hits = Stage3end_remove_overlaps(hits,/*finalp*/false);
hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/false,/*finalp*/false);
@@ -13620,7 +14343,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
first_read_p);
hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/true,/*finalp*/true);
- hits = Stage3end_filter_terminals(hits,querylength);
+ hits = Stage3end_reject_trimlengths(hits);
hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/false,/*finalp*/true);
@@ -13638,8 +14361,7 @@ static Stage3end_T *
single_read (int *npaths, int *first_absmq, int *second_absmq,
Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
int indexdb_size_threshold, Genome_T genome, Floors_T *floors_array,
- double user_maxlevel_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -13721,8 +14443,7 @@ single_read (int *npaths, int *first_absmq, int *second_absmq,
indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp,keep_floors_p,/*genestrand*/0,/*first_read_p*/true);
@@ -13751,8 +14472,7 @@ static Stage3end_T *
single_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_absmq,
Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
int indexdb_size_threshold, Genome_T genome, Floors_T *floors_array,
- double user_maxlevel_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -13828,8 +14548,7 @@ single_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp,keep_floors_p,/*genestrand*/+1,/*first_read_p*/true);
@@ -13843,8 +14562,7 @@ single_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp,keep_floors_p,/*genestrand*/+2,/*first_read_p*/true);
@@ -13853,7 +14571,7 @@ single_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
hits = List_append(hits_geneplus,hits_geneminus);
hits = Stage3end_optimal_score(hits,cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/true,/*finalp*/true);
- hits = Stage3end_filter_terminals(hits,querylength);
+ hits = Stage3end_reject_trimlengths(hits);
hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
hits = Stage3end_optimal_score(hits,cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
querylength,/*keep_gmap_p*/false,/*finalp*/true);
@@ -13883,8 +14601,7 @@ Stage3end_T *
Stage1_single_read (int *npaths, int *first_absmq, int *second_absmq,
Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
int indexdb_size_threshold, Genome_T genome, Floors_T *floors_array,
- double user_maxlevel_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -13895,7 +14612,7 @@ Stage1_single_read (int *npaths, int *first_absmq, int *second_absmq,
if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED) {
return single_read(&(*npaths),&(*first_absmq),&(*second_absmq),
queryseq,indexdb_fwd,indexdb_rev,indexdb_size_threshold,
- genome,floors_array,user_maxlevel_float,subopt_levels,
+ genome,floors_array,user_maxlevel_float,
indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
@@ -13904,7 +14621,7 @@ Stage1_single_read (int *npaths, int *first_absmq, int *second_absmq,
} else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED) {
return single_read_tolerant_nonstranded(&(*npaths),&(*first_absmq),&(*second_absmq),queryseq,
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
- genome,floors_array,user_maxlevel_float,subopt_levels,
+ genome,floors_array,user_maxlevel_float,
indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
@@ -13960,10 +14677,15 @@ align_halfmapping_with_gmap (History_T gmap_history, Stage3end_T hit5, Stage3end
if (hit3 == NULL) {
+ /* Both events are tested by Stage3end_anomalous_splice_p */
if ((chrnum = Stage3end_chrnum(hit5)) == 0) {
/* Translocation */
return (List_T) NULL;
+ } else if (Stage3end_hittype(hit5) == SAMECHR_SPLICE) {
+ /* A genomic event that doesn't get reflected in chrnum */
+ return (List_T) NULL;
+
} else if ((watsonp = Stage3end_plusp(hit5)) == true) {
chroffset = Stage3end_chroffset(hit5);
chrhigh = Stage3end_chrhigh(hit5);
@@ -14006,7 +14728,7 @@ align_halfmapping_with_gmap (History_T gmap_history, Stage3end_T hit5, Stage3end
#else
mappingend = add_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist_novelend,chrhigh);
#endif
- debug13(printf("Original bounds: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingend %u\n",
+ debug13(printf("Original bounds E: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingend %u\n",
knownsplice_limit_low - chroffset,knownsplice_limit_high - chroffset,mappingend - chroffset));
close_mappingend_last = middle_mappingend_last = Stage3end_genomicend(hit5);
@@ -14147,7 +14869,7 @@ align_halfmapping_with_gmap (History_T gmap_history, Stage3end_T hit5, Stage3end
#else
mappingstart = subtract_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist_novelend,chroffset);
#endif
- debug13(printf("Original bounds: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingstart %u\n",
+ debug13(printf("Original bounds F: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingstart %u\n",
knownsplice_limit_low - chroffset,knownsplice_limit_high - chroffset,mappingstart - chroffset));
close_mappingstart_last = middle_mappingstart_last = Stage3end_genomicend(hit5);
@@ -14258,10 +14980,15 @@ align_halfmapping_with_gmap (History_T gmap_history, Stage3end_T hit5, Stage3end
}
} else if (hit5 == NULL) {
+ /* Both events are tested by Stage3end_anomalous_splice_p */
if ((chrnum = Stage3end_chrnum(hit3)) == 0) {
/* Translocation */
return (List_T) NULL;
+ } else if (Stage3end_hittype(hit3) == SAMECHR_SPLICE) {
+ /* A genomic event that doesn't get reflected in chrnum */
+ return (List_T) NULL;
+
} else if ((watsonp = Stage3end_plusp(hit3)) == true) {
chroffset = Stage3end_chroffset(hit3);
chrhigh = Stage3end_chrhigh(hit3);
@@ -15145,8 +15872,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
- int user_maxlevel_5, int user_maxlevel_3, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ int user_maxlevel_5, int user_maxlevel_3, int indel_penalty_middle, int indel_penalty_end,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
bool allvalidp5, bool allvalidp3, Chrpos_T pairmax,
@@ -15161,6 +15887,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
List_T subs5 = NULL, indels5 = NULL, ambiguous5 = NULL, singlesplicing5 = NULL, doublesplicing5 = NULL, terminals5 = NULL;
List_T subs3 = NULL, indels3 = NULL, ambiguous3 = NULL, singlesplicing3 = NULL, doublesplicing3 = NULL, terminals3 = NULL;
List_T longsinglesplicing5 = NULL, longsinglesplicing3 = NULL;
+ int nmisses_allowed_sarray_5, nmisses_allowed_sarray_3;
int ignore_found_score, done_level_5, done_level_3, opt_level, fast_level_5, fast_level_3,
mismatch_level_5, mismatch_level_3, nmismatches, max_mismatches_allowed;
int max_splice_mismatches_5 = -1, max_splice_mismatches_3 = -1, i;
@@ -15272,12 +15999,14 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
nhits5 = nhits3 = 0;
+ nmisses_allowed_sarray_5 = *cutoff_level_5;
+ nmisses_allowed_sarray_3 = *cutoff_level_3;
+
#ifndef LARGE_GENOMES
if (use_only_sarray_p == true) {
Sarray_search_greedy(&(*cutoff_level_5),&subs5,&indels5,&ambiguous5,&singlesplicing5,&doublesplicing5,
- queryuc_ptr_5,queryrc5,querylength5,
- query5_compress_fwd,query5_compress_rev,/*nmisses_allowed*/querylength5,
- genestrand,/*first_read_p*/true);
+ queryuc_ptr_5,queryrc5,querylength5,query5_compress_fwd,query5_compress_rev,
+ nmisses_allowed_sarray_5,genestrand,/*first_read_p*/true);
singlesplicing5 = Splice_group_by_segmenti(&ignore_found_score,singlesplicing5,&ambiguous5,querylength5,
/*first_read_p*/true,/*sarrayp*/true);
singlesplicing5 = Splice_group_by_segmentj(&ignore_found_score,singlesplicing5,&ambiguous5,querylength5,
@@ -15286,9 +16015,8 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
Sarray_search_greedy(&(*cutoff_level_3),&subs3,&indels3,&ambiguous3,&singlesplicing3,&doublesplicing3,
- queryuc_ptr_3,queryrc3,querylength3,
- query3_compress_fwd,query3_compress_rev,/*nmisses_allowed*/querylength3,
- genestrand,/*first_read_p*/false);
+ queryuc_ptr_3,queryrc3,querylength3,query3_compress_fwd,query3_compress_rev,
+ nmisses_allowed_sarray_3,genestrand,/*first_read_p*/false);
singlesplicing3 = Splice_group_by_segmenti(&ignore_found_score,singlesplicing3,&ambiguous3,querylength3,
/*first_read_p*/false,/*sarrayp*/true);
singlesplicing3 = Splice_group_by_segmentj(&ignore_found_score,singlesplicing3,&ambiguous3,querylength3,
@@ -15334,9 +16062,8 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
} else if (use_sarray_p == true) {
/* Replaces spanning set */
Sarray_search_greedy(&ignore_found_score,&subs5,&indels5,&ambiguous5,&singlesplicing5,&doublesplicing5,
- queryuc_ptr_5,queryrc5,querylength5,
- query5_compress_fwd,query5_compress_rev,/*nmisses_allowed*/querylength5,
- genestrand,/*first_read_p*/true);
+ queryuc_ptr_5,queryrc5,querylength5,query5_compress_fwd,query5_compress_rev,
+ nmisses_allowed_sarray_5,genestrand,/*first_read_p*/true);
singlesplicing5 = Splice_group_by_segmenti(&ignore_found_score,singlesplicing5,&ambiguous5,querylength5,
/*first_read_p*/true,/*sarrayp*/true);
singlesplicing5 = Splice_group_by_segmentj(&ignore_found_score,singlesplicing5,&ambiguous5,querylength5,
@@ -15345,9 +16072,8 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
Sarray_search_greedy(&ignore_found_score,&subs3,&indels3,&ambiguous3,&singlesplicing3,&doublesplicing3,
- queryuc_ptr_3,queryrc3,querylength3,
- query3_compress_fwd,query3_compress_rev,/*nmisses_allowed*/querylength3,
- genestrand,/*first_read_p*/false);
+ queryuc_ptr_3,queryrc3,querylength3,query3_compress_fwd,query3_compress_rev,
+ nmisses_allowed_sarray_3,genestrand,/*first_read_p*/false);
singlesplicing3 = Splice_group_by_segmenti(&ignore_found_score,singlesplicing3,&ambiguous3,querylength3,
/*first_read_p*/false,/*sarrayp*/true);
singlesplicing3 = Splice_group_by_segmentj(&ignore_found_score,singlesplicing3,&ambiguous3,querylength3,
@@ -15643,8 +16369,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
#if defined(DEBUG2) || defined(DEBUG2E)
queryuc_ptr_5,queryrc5,
#endif
- querylength5,query5_lastpos,floors5,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ querylength5,query5_lastpos,floors5,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_5,genestrand,/*first_read_p*/true);
@@ -15660,8 +16385,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
queryuc_ptr_5,queryrc5,
#endif
querylength5,query5_lastpos,plus_indexdb_5,minus_indexdb_5,
- indexdb_size_threshold,floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ indexdb_size_threshold,floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_5,/*omit_frequent_p*/false,/*omit_repetitive_p*/false,keep_floors_p,
genestrand,/*first_read_p*/true);
@@ -15675,8 +16399,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
queryuc_ptr_5,queryrc5,
#endif
querylength5,query5_lastpos,plus_indexdb_5,minus_indexdb_5,
- indexdb_size_threshold,floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ indexdb_size_threshold,floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_5,/*omit_frequent_p*/true,
/*omit_repetitive_p*/(masktype == MASK_REPETITIVE || masktype == MASK_GREEDY_REPETITIVE) ? true : false,
@@ -15695,8 +16418,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
queryuc_ptr_5,queryrc5,
#endif
querylength5,query5_lastpos,plus_indexdb_5,minus_indexdb_5,
- indexdb_size_threshold,floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ indexdb_size_threshold,floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_5,/*omit_frequent_p*/false,/*omit_repetitive_p*/false,keep_floors_p,
genestrand,/*first_read_p*/true);
@@ -15726,8 +16448,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
#if defined(DEBUG2) || defined(DEBUG2E)
queryuc_ptr_3,queryrc3,
#endif
- querylength3,query3_lastpos,floors3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ querylength3,query3_lastpos,floors3,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_3,genestrand,/*first_read_p*/false);
@@ -15742,8 +16463,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
queryrc3,
#endif
querylength3,query3_lastpos,plus_indexdb_3,minus_indexdb_3,
- indexdb_size_threshold,floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ indexdb_size_threshold,floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_3,/*omit_frequent_p*/false,/*omit_repetitive_p*/false,keep_floors_p,
genestrand,/*first_read_p*/false);
@@ -15757,8 +16477,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
queryuc_ptr_3,queryrc3,
#endif
querylength3,query3_lastpos,plus_indexdb_3,minus_indexdb_3,
- indexdb_size_threshold,floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ indexdb_size_threshold,floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_3,/*omit_frequent_p*/true,
/*omit_repetitive_p*/(masktype == MASK_REPETITIVE || masktype == MASK_GREEDY_REPETITIVE) ? true : false,
@@ -15777,8 +16496,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
queryuc_ptr_3,queryrc3,
#endif
querylength3,query3_lastpos,plus_indexdb_3,minus_indexdb_3,
- indexdb_size_threshold,floors_array,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ indexdb_size_threshold,floors_array,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
fast_level_3,/*omit_frequent_p*/false,/*omit_repetitive_p*/false,keep_floors_p,
genestrand,/*first_read_p*/false);
@@ -16326,7 +17044,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
Stage3end_nmatches_posttrim(gmap3),querylength3,*cutoff_level_3,nconcordant));
}
hitpairs = List_push(hitpairs,(void *) newpair);
- } else if (Stage3end_terminal_length(hit5) >= terminal_output_minlength) {
+ } else if (Stage3end_trimlength(hit5) < reject_trimlength) {
if (Stage3end_nmatches_posttrim(gmap3) >= querylength3 - (*cutoff_level_3) &&
Stage3end_gmap_max_match_length(gmap3) >= querylength3/2) {
/* Want high standard for nconcordant, since this precludes finding terminals */
@@ -16397,7 +17115,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
Stage3end_nmatches_posttrim(gmap5),querylength5,*cutoff_level_5,nconcordant));
}
hitpairs = List_push(hitpairs,(void *) newpair);
- } else if (Stage3end_terminal_length(hit3) >= terminal_output_minlength) {
+ } else if (Stage3end_trimlength(hit3) < reject_trimlength) {
if (Stage3end_nmatches_posttrim(gmap5) >= querylength5 - (*cutoff_level_5) &&
Stage3end_gmap_max_match_length(gmap5) >= querylength5/2) {
/* Want high standard for nconcordant, since this precludes finding terminals */
@@ -16959,7 +17677,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
/*pairtype*/CONCORDANT,localsplicing_penalty,
/*private5p*/true,/*private3p*/true,/*expect_concordant_p*/true)) == NULL) {
/* Stage3end_free(&gmap3); -- done by Stage3pair_new */
- } else if (Stage3end_terminal_length(hit5) >= terminal_output_minlength) {
+ } else if (Stage3end_trimlength(hit5) < reject_trimlength) {
/* Save hit5-gmap3 */
*with_terminal = List_push(*with_terminal,(void *) newpair);
} else {
@@ -17006,7 +17724,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
/*pairtype*/CONCORDANT,localsplicing_penalty,
/*private5p*/true,/*private3p*/true,/*expect_concordant_p*/true)) == NULL) {
/* Stage3end_free(&gmap5); -- done by Stage3pair_new */
- } else if (Stage3end_terminal_length(hit3) >= terminal_output_minlength) {
+ } else if (Stage3end_trimlength(hit3) < reject_trimlength) {
/* Save gmap5-hit3 */
*with_terminal = List_push(*with_terminal,(void *) newpair);
} else {
@@ -17177,8 +17895,7 @@ realign_separately (Stage3end_T **stage3array5, int *nhits5, int *first_absmq5,
Shortread_T queryseq3, char *queryuc_ptr_3, char *queryrc3, char *quality_string_3, int querylength3, int query3_lastpos,
Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int indexdb_size_threshold,
Genome_T genome, Floors_T *floors_array,
- int user_maxlevel_5, int user_maxlevel_3, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ int user_maxlevel_5, int user_maxlevel_3, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -17201,8 +17918,7 @@ realign_separately (Stage3end_T **stage3array5, int *nhits5, int *first_absmq5,
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_5,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp5,keep_floors_p,genestrand,/*first_read_p*/true);
@@ -17230,8 +17946,7 @@ realign_separately (Stage3end_T **stage3array5, int *nhits5, int *first_absmq5,
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp3,keep_floors_p,genestrand,/*first_read_p*/false);
@@ -17266,7 +17981,7 @@ consolidate_paired_results (int *npaths, int *first_absmq, int *second_absmq, Pa
struct Segment_T **minus_segments_genestrand_3, int *minus_nsegments_genestrand_3,
Shortread_T queryseq5, char *queryuc_ptr_5, char *quality_string_5, int querylength5, int query5_lastpos,
Shortread_T queryseq3, char *queryuc_ptr_3, char *quality_string_3, int querylength3, int query3_lastpos,
- Genome_T genome, int subopt_levels, int cutoff_level_5, int cutoff_level_3,
+ Genome_T genome, int cutoff_level_5, int cutoff_level_3,
int localsplicing_penalty,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
@@ -17670,7 +18385,7 @@ consolidate_paired_results (int *npaths, int *first_absmq, int *second_absmq, Pa
if (result == NULL) {
singlehits5 = Stage3end_optimal_score(hits5,cutoff_level_5,subopt_levels,query5_compress_fwd,query5_compress_rev,
querylength5,/*keep_gmap_p*/true,/*finalp*/true);
- singlehits5 = Stage3end_filter_terminals(singlehits5,querylength5);
+ singlehits5 = Stage3end_reject_trimlengths(singlehits5);
singlehits5 = Stage3end_linearize_5(singlehits5);
singlehits5 = Stage3end_remove_overlaps(singlehits5,/*finalp*/true);
singlehits5 = Stage3end_optimal_score(singlehits5,cutoff_level_5,subopt_levels,query5_compress_fwd,query5_compress_rev,
@@ -17679,7 +18394,7 @@ consolidate_paired_results (int *npaths, int *first_absmq, int *second_absmq, Pa
singlehits3 = Stage3end_optimal_score(hits3,cutoff_level_3,subopt_levels,query3_compress_fwd,query3_compress_rev,
querylength3,/*keep_gmap_p*/true,/*finalp*/true);
- singlehits3 = Stage3end_filter_terminals(singlehits3,querylength3);
+ singlehits3 = Stage3end_reject_trimlengths(singlehits3);
singlehits3 = Stage3end_linearize_3(singlehits3);
singlehits3 = Stage3end_remove_overlaps(singlehits3,/*finalp*/true);
singlehits3 = Stage3end_optimal_score(singlehits3,cutoff_level_3,subopt_levels,query3_compress_fwd,query3_compress_rev,
@@ -17822,8 +18537,7 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
Shortread_T queryseq5, Shortread_T queryseq3,
Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int indexdb_size_threshold,
Genome_T genome, Floors_T *floors_array,
- double user_maxlevel_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -17928,8 +18642,7 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp3,keep_floors_p,/*genestrand*/0,/*first_read_p*/false);
@@ -17999,8 +18712,7 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_5,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp5,keep_floors_p,/*genestrand*/0,/*first_read_p*/true);
@@ -18084,8 +18796,7 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_5,user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp5,allvalidp3,pairmax,maxpairedpaths,keep_floors_p,
@@ -18105,8 +18816,7 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
indexdb_fwd,indexdb_rev,indexdb_size_threshold,genome,floors_array,
- user_maxlevel_5,user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
oligoindices_major,oligoindices_minor,
@@ -18136,7 +18846,7 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
&this3->plus_segments,&this3->plus_nsegments,&this3->minus_segments,&this3->minus_nsegments,
queryseq5,queryuc_ptr_5,quality_string_5,querylength5,query5_lastpos,
queryseq3,queryuc_ptr_3,quality_string_3,querylength3,query3_lastpos,
- genome,subopt_levels,cutoff_level_5,cutoff_level_3,
+ genome,cutoff_level_5,cutoff_level_3,
localsplicing_penalty,
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
@@ -18163,8 +18873,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
Shortread_T queryseq5, Shortread_T queryseq3,
Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int indexdb_size_threshold,
Genome_T genome, Floors_T *floors_array,
- double user_maxlevel_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -18272,8 +18981,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
indexdb_fwd,indexdb_fwd,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp3,keep_floors_p,/*genestrand*/+1,/*first_read_p*/false);
@@ -18289,8 +18997,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp3,keep_floors_p,/*genestrand*/+2,/*first_read_p*/false);
@@ -18358,8 +19065,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_5,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp5,keep_floors_p,/*genestrand*/+1,/*first_read_p*/true);
@@ -18375,8 +19081,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
indexdb_fwd,indexdb_rev,indexdb_size_threshold,
floors_array,oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_5,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp5,keep_floors_p,/*genestrand*/+2,/*first_read_p*/true);
@@ -18465,8 +19170,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_5,user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp5,allvalidp3,pairmax,maxpairedpaths,keep_floors_p,
@@ -18497,8 +19201,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
- user_maxlevel_5,user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
allvalidp5,allvalidp3,pairmax,maxpairedpaths,keep_floors_p,
@@ -18523,8 +19226,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
indexdb_fwd,indexdb_rev,indexdb_size_threshold,genome,floors_array,
- user_maxlevel_5,user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
oligoindices_major,oligoindices_minor,
@@ -18565,7 +19267,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
queryseq5,queryuc_ptr_5,quality_string_5,querylength5,query5_lastpos,
queryseq3,queryuc_ptr_3,quality_string_3,querylength3,query3_lastpos,
- genome,subopt_levels,cutoff_level_5,cutoff_level_3,
+ genome,cutoff_level_5,cutoff_level_3,
localsplicing_penalty,
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
@@ -18598,8 +19300,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
indexdb_fwd,indexdb_rev,indexdb_size_threshold,genome,floors_array,
- user_maxlevel_5,user_maxlevel_3,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
oligoindices_major,oligoindices_minor,
@@ -18640,7 +19341,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
queryseq5,queryuc_ptr_5,quality_string_5,querylength5,query5_lastpos,
queryseq3,queryuc_ptr_3,quality_string_3,querylength3,query3_lastpos,
- genome,subopt_levels,cutoff_level_5,cutoff_level_3,
+ genome,cutoff_level_5,cutoff_level_3,
localsplicing_penalty,
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
@@ -18693,7 +19394,7 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
queryseq5,queryuc_ptr_5,quality_string_5,querylength5,query5_lastpos,
queryseq3,queryuc_ptr_3,quality_string_3,querylength3,query3_lastpos,
- genome,subopt_levels,cutoff_level_5,cutoff_level_3,
+ genome,cutoff_level_5,cutoff_level_3,
localsplicing_penalty,
oligoindices_major,oligoindices_minor,
pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
@@ -18720,8 +19421,7 @@ Stage1_paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T
Shortread_T queryseq5, Shortread_T queryseq3,
Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int indexdb_size_threshold,
Genome_T genome, Floors_T *floors_array,
- double user_maxlevel_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -18734,8 +19434,7 @@ Stage1_paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T
&(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
&(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
queryseq5,queryseq3,indexdb_fwd,indexdb_rev,indexdb_size_threshold,
- genome,floors_array,user_maxlevel_float,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ genome,floors_array,user_maxlevel_float,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
oligoindices_major,oligoindices_minor,
@@ -18746,8 +19445,7 @@ Stage1_paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T
&(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
&(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
queryseq5,queryseq3,indexdb_fwd,indexdb_rev,indexdb_size_threshold,
- genome,floors_array,user_maxlevel_float,subopt_levels,
- indel_penalty_middle,indel_penalty_end,
+ genome,floors_array,user_maxlevel_float,indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,distantsplicing_penalty,min_shortend,
oligoindices_major,oligoindices_minor,
@@ -18772,14 +19470,13 @@ void
Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_in, int index1interval_in,
int spansize_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
Genome_T genomealt, Mode_T mode_in, int maxpaths_search_in,
- int terminal_threshold_in, int terminal_output_minlength_in,
+ int terminal_threshold_in, int reject_trimlength_in,
Univcoord_T *splicesites_in, Splicetype_T *splicetypes_in,
Chrpos_T *splicedists_in, int nsplicesites_in,
- bool novelsplicingp_in, bool knownsplicingp_in,
- bool distances_observed_p_in,
- Chrpos_T max_middle_insertions_in, Chrpos_T max_middle_deletions_in,
+ bool novelsplicingp_in, bool knownsplicingp_in, bool distances_observed_p_in,
+ int subopt_levels_in, Chrpos_T max_middle_insertions_in, Chrpos_T max_middle_deletions_in,
Chrpos_T shortsplicedist_in, Chrpos_T shortsplicedist_known_in, Chrpos_T shortsplicedist_novelend_in,
Chrpos_T min_intronlength_in,
@@ -18826,7 +19523,7 @@ Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_
maxpaths_search = maxpaths_search_in;
terminal_threshold = terminal_threshold_in;
- terminal_output_minlength = terminal_output_minlength_in;
+ reject_trimlength = reject_trimlength_in;
splicesites = splicesites_in;
splicetypes = splicetypes_in;
@@ -18837,6 +19534,7 @@ Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_
knownsplicingp = knownsplicingp_in;
distances_observed_p = distances_observed_p_in;
+ subopt_levels = subopt_levels_in;
max_middle_insertions = max_middle_insertions_in;
max_middle_deletions = max_middle_deletions_in;
diff --git a/src/stage1hr.h b/src/stage1hr.h
index 64091e2..4e0be6f 100644
--- a/src/stage1hr.h
+++ b/src/stage1hr.h
@@ -1,4 +1,4 @@
-/* $Id: stage1hr.h 148721 2014-09-24 00:45:45Z twu $ */
+/* $Id: stage1hr.h 154778 2014-12-06 03:32:33Z twu $ */
#ifndef STAGE1HR_INCLUDED
#define STAGE1HR_INCLUDED
@@ -59,8 +59,7 @@ extern Stage3end_T *
Stage1_single_read (int *npaths, int *first_absmq, int *second_absmq,
Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
int indexdb_size_threshold, Genome_T genome, Floors_T *floors_array,
- double usermax_level_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double usermax_level_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -75,8 +74,7 @@ Stage1_paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T
Shortread_T queryseq5, Shortread_T queryseq3,
Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int indexdb_size_threshold,
Genome_T genome, Floors_T *floors_array,
- double usermax_level_float, int subopt_levels,
- int indel_penalty_middle, int indel_penalty_end,
+ double usermax_level_float, int indel_penalty_middle, int indel_penalty_end,
bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
@@ -91,14 +89,13 @@ extern void
Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_in, int index1interval_in,
int spansize_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
Genome_T genomealt, Mode_T mode_in, int maxpaths_search_in,
- int terminal_threshold_in, int terminal_output_minlength_in,
+ int terminal_threshold_in, int reject_trimlength,
Univcoord_T *splicesites_in, Splicetype_T *splicetypes_in,
Chrpos_T *splicedists_in, int nsplicesites_in,
- bool novelsplicingp_in, bool knownsplicingp_in,
- bool distances_observed_p_in,
- Chrpos_T max_middle_insertions_in, Chrpos_T max_middle_deletions_in,
+ bool novelsplicingp_in, bool knownsplicingp_in, bool distances_observed_p_in,
+ int subopt_levels_in, Chrpos_T max_middle_insertions_in, Chrpos_T max_middle_deletions_in,
Chrpos_T shortsplicedist_in, Chrpos_T shortsplicedist_known_in, Chrpos_T shortsplicedist_novelend_in,
Chrpos_T min_intronlength_in,
diff --git a/src/stage2.c b/src/stage2.c
index dfd572d..86af87c 100644
--- a/src/stage2.c
+++ b/src/stage2.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage2.c 153953 2014-11-24 17:51:10Z twu $";
+static char rcsid[] = "$Id: stage2.c 156846 2015-01-16 01:53:19Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -289,11 +289,26 @@ Stage2_free (T *old) {
static T
Stage2_new (List_T middle, List_T all_starts, List_T all_ends) {
T new = (T) MALLOC(sizeof(*new));
+#ifdef DEBUG0
+ List_T p;
+#endif
new->middle = middle;
new->all_starts = all_starts;
new->all_ends = all_ends;
+#ifdef DEBUG0
+ printf("Starts:\n");
+ for (p = all_starts; p != NULL; p = List_next(p)) {
+ Pair_dump_list(List_head(p),true);
+ }
+
+ printf("Ends:\n");
+ for (p = all_ends; p != NULL; p = List_next(p)) {
+ Pair_dump_list(List_head(p),true);
+ }
+#endif
+
return new;
}
@@ -2525,6 +2540,7 @@ revise_active_lookback (int **active, int *firstactive, int *nactive,
}
nactive[querypos] = 0;
+ firstactive[querypos] = -1;
ptr = &(firstactive[querypos]);
hit = low_hit;
while (hit < high_hit) {
@@ -2601,6 +2617,7 @@ revise_active_lookforward (int **active, int *firstactive, int *nactive,
}
nactive[querypos] = 0;
+ firstactive[querypos] = -1;
ptr = &(firstactive[querypos]);
hit = high_hit - 1;
while (hit >= low_hit) {
diff --git a/src/stage3.c b/src/stage3.c
index 695cf80..a0b0d3d 100644
--- a/src/stage3.c
+++ b/src/stage3.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: stage3.c 160004 2015-03-03 02:08:27Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -759,6 +759,19 @@ Stage3_cmp (const void *a, const void *b) {
return -1;
} else if (y->goodness > x->goodness) {
return +1;
+
+ /* If we can achieve same goodness with fewer pairs, then it is a better alignment */
+ } else if (x->npairs < y->npairs) {
+ return -1;
+ } else if (y->npairs < x->npairs) {
+ return +1;
+
+ /* If we can achieve same goodness with more matches, then it is a better alignment */
+ } else if (x->matches > y->matches) {
+ return -1;
+ } else if (y->matches > x->matches) {
+ return +1;
+
} else if (x->straintype < y->straintype) {
return -1;
} else if (y->straintype < x->straintype) {
@@ -2615,8 +2628,8 @@ static List_T
clean_path_end3 (List_T path, int ambig_end_length_3) {
Pair_T lastpair;
+ debug(printf("Starting clean_path_end3\n"));
if (ambig_end_length_3 == 0) {
- debug(printf("clean_path_end3\n"));
/* Remove any remaining nonmatches, gaps, or indels at 3' end */
if (path != NULL) {
lastpair = path->first;
@@ -2646,6 +2659,7 @@ clean_path_end3 (List_T path, int ambig_end_length_3) {
#endif
}
+ debug(printf("Ending clean_path_end3\n"));
return path;
}
@@ -2654,8 +2668,8 @@ static List_T
clean_pairs_end5 (List_T pairs, int ambig_end_length_5) {
Pair_T firstpair;
+ debug(printf("Starting clean_pairs_end5\n"));
if (ambig_end_length_5 == 0) {
- debug(printf("clean_pairs_end5\n"));
/* Remove any remaining nonmatches, gaps, or indels at 5' end */
if (pairs != NULL) {
firstpair = pairs->first;
@@ -2685,6 +2699,7 @@ clean_pairs_end5 (List_T pairs, int ambig_end_length_5) {
#endif
}
+ debug(printf("Ending clean_pairs_end5\n"));
return pairs;
}
@@ -2694,7 +2709,7 @@ static List_T
clean_path_end3_gap_indels (List_T path) {
Pair_T lastpair;
- debug(printf("clean_path_end3_gap_indels\n"));
+ debug(printf("Starting clean_path_end3_gap_indels\n"));
/* Remove any remaining gap/indels at 3' end, which can happen rarely */
if (path != NULL) {
lastpair = path->first;
@@ -2723,6 +2738,7 @@ clean_path_end3_gap_indels (List_T path) {
}
#endif
+ debug(printf("Ending clean_path_end3_gap_indels\n"));
return path;
}
@@ -2731,7 +2747,7 @@ static List_T
clean_pairs_end5_gap_indels (List_T pairs) {
Pair_T firstpair;
- debug(printf("clean_pairs_end5_gap_indels\n"));
+ debug(printf("Starting clean_pairs_end5_gap_indels\n"));
/* Remove any remaining gap/indels at 5' end, which can happen rarely */
if (pairs != NULL) {
firstpair = pairs->first;
@@ -2760,6 +2776,7 @@ clean_pairs_end5_gap_indels (List_T pairs) {
}
#endif
+ debug(printf("Ending clean_pairs_end5_gap_indels\n"));
return pairs;
}
@@ -2771,7 +2788,7 @@ clean_end_chimera (List_T end) {
List_T peeled = NULL;
int n = 0;
- debug10(printf("clean_path_end_chimera\n"));
+ debug10(printf("Starting clean_path_end_chimera\n"));
while (end != NULL && n < 20) {
lastpair = end->first;
peeled = List_transfer_one(peeled,&end);
@@ -2790,6 +2807,7 @@ clean_end_chimera (List_T end) {
}
end = Pairpool_transfer(end,peeled);
+ debug10(printf("Ending clean_path_end_chimera\n"));
return end;
}
@@ -3427,6 +3445,7 @@ sufficient_splice_prob_local (int support, int nmismatches, double distal_splice
+#ifdef GSNAP
static int
exon_length_5 (List_T pairs) {
int exon_length = 0;
@@ -3446,8 +3465,10 @@ exon_length_5 (List_T pairs) {
return exon_length;
}
}
+#endif
+#ifdef GSNAP
static int
exon_length_3 (List_T path) {
int exon_length = 0;
@@ -3467,6 +3488,7 @@ exon_length_3 (List_T path) {
return exon_length;
}
}
+#endif
/* Also handles case where novelsplicingp == false */
@@ -3480,7 +3502,9 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
) {
List_T path, exon, pairptr, p;
Pair_T pair, medial, indel = NULL, splice = NULL;
+ int max_nmatches, max_nmismatches;
int nmatches = 0, nmismatches /* = -1 because of the gap */, i;
+ int max_score, score;
bool nearindelp = false, nearmismatchp = false, is_canonical;
double medial_prob;
int nindels;
@@ -3499,22 +3523,19 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
pair = pairs->first;
debug3(printf("querystart %d\n",pair->querypos));
/* Normally expect pair->querypos to be 0, and want to start with -1 because of the gap */
+#if 0
if (pair->querypos <= ambig_end_length) {
nmismatches = -1;
} else {
nmismatches = (pair->querypos - ambig_end_length) - 1;
}
+#endif
}
exon = (List_T) NULL;
while (pairs != NULL && !pair->gapp && pair->comp != INDEL_COMP) {
pairptr = pairs;
pairs = Pairpool_pop(pairs,&pair);
- if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
- nmatches++;
- } else {
- nmismatches++;
- }
#ifdef WASTE
exon = Pairpool_push_existing(exon,pairpool,pair);
#else
@@ -3524,6 +3545,29 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
debug3(printf("End exon:\n"));
debug3(Pair_dump_list(exon,true));
+
+ nmatches = nmismatches = 0;
+ max_score = score = 0;
+ /* Skip the intron gap */
+ for (p = List_next(exon); p != NULL; p = List_next(p)) {
+ pair = (Pair_T) List_head(p);
+ if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
+ score += 1;
+ nmatches += 1;
+ } else {
+ score -= 3;
+ nmismatches += 1;
+ }
+ if (score > max_score) {
+ max_score = score;
+ max_nmatches = nmatches;
+ max_nmismatches = nmismatches;
+ }
+ debug3(printf("5' querypos %d => score %d, max_nmatches %d, max_nmismatches %d\n",
+ pair->querypos,score,max_nmatches,max_nmismatches));
+ }
+
+
if (pair->comp == INDEL_COMP) {
/* Handle end indel */
/* indel = pair; */
@@ -3586,7 +3630,7 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
#endif
}
- debug3(printf("Before indel/gap, nmatches %d, nmismatches %d\n",nmatches,nmismatches));
+ debug3(printf("Before indel/gap, nmatches %d, nmismatches %d\n",max_nmatches,max_nmismatches));
if (pairs == NULL) {
debug3(printf("No indel/gap\n"));
path = exon;
@@ -3610,21 +3654,21 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
path = exon; /* exon already has the gap */
*trim5p = false;
- } else if (nearindelp == true && nmatches < INDEL_SPLICE_ENDLENGTH) {
- debug3(printf("near indel with nmatches %d too low, so trimming it\n",nmatches));
+ } else if (nearindelp == true && max_nmatches < INDEL_SPLICE_ENDLENGTH) {
+ debug3(printf("near indel with nmatches %d too low, so trimming it\n",max_nmatches));
path = (List_T) NULL;
*trim5p = true;
} else if (splice == NULL) {
debug3(printf("nindels %d\n",nindels));
- if (nmatches < min_indel_end_matches) {
- debug3(printf("Not enough matches %d < %d, so trimming it\n",nmatches,min_indel_end_matches));
+ if (max_nmatches < min_indel_end_matches) {
+ debug3(printf("Not enough matches %d < %d, so trimming it\n",max_nmatches,min_indel_end_matches));
path = (List_T) NULL;
*trim5p = true;
} else if (nindels > 3) {
/* Large indel */
- if (nmatches - nmismatches > nindels) {
+ if (max_nmatches - max_nmismatches > nindels) {
debug3(printf("Large indel: More matches than mismatches, so keeping it\n"));
path = exon; /* exon already has the indel */
*trim5p = false;
@@ -3637,7 +3681,7 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
} else {
/* Small indel */
- if (nmatches - nmismatches > 2) {
+ if (max_nmatches - max_nmismatches > 2) {
debug3(printf("Small indel: More matches than mismatches, so keeping it\n"));
path = exon; /* exon already has the indel */
*trim5p = false;
@@ -3650,7 +3694,7 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
}
} else {
- if (splice->knowngapp == true && nmismatches == 0) {
+ if (splice->knowngapp == true && max_nmismatches == 0) {
debug3(printf("Intron is known and no mismatches, so keeping it\n"));
path = exon; /* exon already has the gap */
*trim5p = false;
@@ -3660,22 +3704,29 @@ trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int pai
path = (List_T) NULL;
*trim5p = true;
+#if 0
} else if (enough_matches(nmatches-nmismatches,splice->genomejump/*,splice->donor_prob,splice->acceptor_prob*/) == false) {
debug3(printf("nmatches %d - nmismatches %d not enough for genomejump %d, so trimming it\n",
nmatches,nmismatches,splice->genomejump));
path = (List_T) NULL;
*trim5p = true;
+#endif
- } else if (sufficient_splice_prob_local(List_length(exon),nmismatches,
+ } else if (max_score < 12) {
+ debug3(printf("max_score %d < 12, so trimming it\n",max_score));
+ path = (List_T) NULL;
+ *trim5p = true;
+
+ } else if (sufficient_splice_prob_local(List_length(exon),max_nmismatches,
/*distal_spliceprob*/cdna_direction >= 0 ? splice->donor_prob : splice->acceptor_prob,
/*medial_spliceprob*/cdna_direction >= 0 ? splice->acceptor_prob : splice->donor_prob)) {
/* Want to keep for comparison of fwd and rev, even if probabilities are poor */
- debug3(printf("Keeping first 5' exon with %d matches and %d mismatches\n",nmatches,nmismatches));
+ debug3(printf("Keeping first 5' exon with %d matches and %d mismatches\n",max_nmatches,max_nmismatches));
path = exon; /* exon already has the gap */
*trim5p = false;
} else {
- debug3(printf("Fall through: trimming noncanonical 5' exon\n"));
+ debug3(printf("Fall through (bad probabilities): trimming noncanonical 5' exon\n"));
medial_prob = (cdna_direction >= 0) ? splice->acceptor_prob : splice->donor_prob;
if (canonicalp(splice->knowngapp,splice->comp,splice->donor_prob,splice->acceptor_prob,cdna_direction) == true &&
@@ -3720,7 +3771,9 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
) {
List_T pairs, exon, pairptr, p;
Pair_T pair, medial, indel = NULL, splice = NULL;
+ int max_nmatches, max_nmismatches;
int nmatches = 0, nmismatches /* = -1 because of the gap */, i;
+ int max_score, score;
bool nearindelp = false, nearmismatchp = false, is_canonical;
double medial_prob;
int nindels;
@@ -3738,23 +3791,20 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
} else {
pair = path->first;
debug3(printf("queryend %d\n",pair->querypos));
+#if 0
/* Normally expect pair->querypos to be 0, and want to start with -1 because of the gap */
if (pair->querypos >= (querylength - 1) - ambig_end_length) {
nmismatches = -1;
} else {
nmismatches = (querylength - 1) - ambig_end_length - pair->querypos - 1;
}
+#endif
}
exon = (List_T) NULL;
while (path != NULL && !pair->gapp && pair->comp != INDEL_COMP) {
pairptr = path;
path = Pairpool_pop(path,&pair);
- if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
- nmatches++;
- } else {
- nmismatches++;
- }
#ifdef WASTE
exon = Pairpool_push_existing(exon,pairpool,pair);
#else
@@ -3764,6 +3814,28 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
debug3(printf("End exon:\n"));
debug3(Pair_dump_list(exon,true));
+
+ max_nmatches = max_nmismatches = 0; /* must be defined even if no prefix ever scores above 0 */
+ nmatches = nmismatches = max_score = score = 0;
+ /* Skip the intron gap, then track the best-scoring prefix (+1 per match, -3 per mismatch) */
+ for (p = List_next(exon); p != NULL; p = List_next(p)) {
+   pair = (Pair_T) List_head(p);
+   if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
+     score += 1;
+     nmatches += 1;
+   } else {
+     score -= 3;
+     nmismatches += 1;
+   }
+   if (score > max_score) { /* remember the counts at the highest running score seen so far */
+     max_score = score;
+     max_nmatches = nmatches;
+     max_nmismatches = nmismatches;
+   }
+   debug3(printf("3' querypos %d => score %d, max_nmatches %d, max_nmismatches %d\n",
+		  pair->querypos,score,max_nmatches,max_nmismatches));
+ }
+
if (pair->comp == INDEL_COMP) {
/* Handle end indel */
/* indel = pair; */
@@ -3826,7 +3898,7 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
#endif
}
- debug3(printf("Before indel/gap, nmatches %d, nmismatches %d\n",nmatches,nmismatches));
+ debug3(printf("Before indel/gap, nmatches %d, nmismatches %d\n",max_nmatches,max_nmismatches));
if (path == NULL) {
debug3(printf("No indel/gap\n"));
pairs = exon;
@@ -3850,21 +3922,21 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
pairs = exon; /* exon already has the gap */
*trim3p = false;
- } else if (nearindelp == true && nmatches < INDEL_SPLICE_ENDLENGTH) {
- debug3(printf("near indel with nmatches %d too low, so trimming it\n",nmatches));
+ } else if (nearindelp == true && max_nmatches < INDEL_SPLICE_ENDLENGTH) {
+ debug3(printf("near indel with nmatches %d too low, so trimming it\n",max_nmatches));
pairs = (List_T) NULL;
*trim3p = true;
} else if (splice == NULL) {
debug3(printf("nindels %d\n",nindels));
- if (nmatches < min_indel_end_matches) {
- debug3(printf("Not enough matches %d < %d, so trimming it\n",nmatches,min_indel_end_matches));
+ if (max_nmatches < min_indel_end_matches) {
+ debug3(printf("Not enough matches %d < %d, so trimming it\n",max_nmatches,min_indel_end_matches));
pairs = (List_T) NULL;
*trim3p = true;
} else if (nindels > 3) {
/* Large indel */
- if (nmatches - nmismatches > nindels) {
+ if (max_nmatches - max_nmismatches > nindels) {
debug3(printf("Large indel: More matches than mismatches, so keeping it\n"));
pairs = exon; /* exon already has the indel */
*trim3p = false;
@@ -3877,7 +3949,7 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
} else {
/* Small indel */
- if (nmatches - nmismatches > 2) {
+ if (max_nmatches - max_nmismatches > 2) {
debug3(printf("Small indel: More matches than mismatches, so keeping it\n"));
pairs = exon; /* exon already has the indel */
*trim3p = false;
@@ -3890,7 +3962,7 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
}
} else {
- if (splice->knowngapp == true && nmismatches == 0) {
+ if (splice->knowngapp == true && max_nmismatches == 0) {
debug3(printf("Intron is known and no mismatches, so keeping it\n"));
pairs = exon; /* exon already has the gap */
*trim3p = false;
@@ -3900,22 +3972,29 @@ trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int pair
pairs = (List_T) NULL;
*trim3p = true;
+#if 0
} else if (enough_matches(nmatches-nmismatches,splice->genomejump/*,splice->donor_prob,splice->acceptor_prob*/) == false) {
debug3(printf("nmatches %d - nmismatches %d not enough for genomejump %d, so trimming it\n",
nmatches,nmismatches,splice->genomejump));
pairs = (List_T) NULL;
*trim3p = true;
+#endif
+
+ } else if (max_score < 12) {
+ debug3(printf("max_score %d < 12, so trimming it\n",max_score));
+ pairs = (List_T) NULL;
+ *trim3p = true;
- } else if (sufficient_splice_prob_local(List_length(exon),nmismatches,
+ } else if (sufficient_splice_prob_local(List_length(exon),max_nmismatches,
/*distal_spliceprob*/cdna_direction >= 0 ? splice->acceptor_prob : splice->donor_prob,
/*medial_spliceprob*/cdna_direction >= 0 ? splice->donor_prob : splice->acceptor_prob)) {
/* Want to keep for comparison of fwd and rev, even if probabilities are poor */
- debug3(printf("Keeping last 3' exon with %d matches and %d mismatches\n",nmatches,nmismatches));
+ debug3(printf("Keeping last 3' exon with %d matches and %d mismatches\n",max_nmatches,max_nmismatches));
pairs = exon; /* exon already has the gap */
*trim3p = false;
} else {
- debug3(printf("Fall through: trimming noncanonical 3' exon\n"));
+ debug3(printf("Fall through (bad probabilities): trimming noncanonical 3' exon\n"));
medial_prob = (cdna_direction >= 0) ? splice->donor_prob : splice->acceptor_prob;
if (canonicalp(splice->knowngapp,splice->comp,splice->donor_prob,splice->acceptor_prob,cdna_direction) == true &&
@@ -4256,7 +4335,7 @@ make_pairarray_merge (T this_left, int cdna_direction, int sensedir, bool watson
struct Pair_T *pairarray, *pairarray_save;
List_T printpairs, printpath, path, p;
Pair_T oldpair, newpair;
- int ncanonical, nsemicanonical, nnoncanonical;
+ int ncanonical, nsemicanonical;
double min_splice_prob;
pairarray_save = this_left->pairarray;
@@ -4271,10 +4350,6 @@ make_pairarray_merge (T this_left, int cdna_direction, int sensedir, bool watson
debug10(Pair_dump_list(this_left->pairs,true));
this_left->cdna_direction = cdna_direction;
- Pair_fracidentity(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
- &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
- &ncanonical,&nsemicanonical,&nnoncanonical,
- &min_splice_prob,this_left->pairs,this_left->cdna_direction);
printpairs = Pairpool_copy(this_left->pairs,pairpool);
@@ -4324,6 +4399,12 @@ make_pairarray_merge (T this_left, int cdna_direction, int sensedir, bool watson
this_left->pairarray = pairarray;
this_left->pairarray_freeable_p = true;
+ this_left->goodness =
+ Pair_fracidentity_array(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
+ &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
+ &ncanonical,&nsemicanonical,&this_left->noncanonical,
+ &min_splice_prob,this_left->pairarray,this_left->npairs,this_left->cdna_direction);
+
return true;
}
@@ -4336,19 +4417,19 @@ make_pairarrays_chimera (T this_left, T this_right,
List_T printpairs_left, printpath_left, printpairs_right, printpath_right, p;
Pair_T oldpair, newpair;
int newnpairs;
- int ncanonical, nsemicanonical, nnoncanonical;
+ int ncanonical, nsemicanonical;
double min_splice_prob;
/* Revise statistics */
Pair_fracidentity(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
&this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
- &ncanonical,&nsemicanonical,&nnoncanonical,
+ &ncanonical,&nsemicanonical,&this_left->noncanonical,
&min_splice_prob,this_left->pairs,this_left->cdna_direction);
Pair_fracidentity(&this_right->matches,&this_right->unknowns,&this_right->mismatches,
&this_right->qopens,&this_right->qindels,&this_right->topens,&this_right->tindels,
- &ncanonical,&nsemicanonical,&nnoncanonical,
+ &ncanonical,&nsemicanonical,&this_right->noncanonical,
&min_splice_prob,this_right->pairs,this_right->cdna_direction);
@@ -4383,21 +4464,34 @@ make_pairarrays_chimera (T this_left, T this_right,
/* Need to have a single pairarray for this_left, so we can translate protein correctly */
newpair = this_left->pairarray = (struct Pair_T *) MALLOC_OUT((newnpairs + gaplength)*sizeof(struct Pair_T));
this_left->pairarray_freeable_p = true;
+
for (p = printpairs_left; p != NULL; p = p->rest) {
oldpair = (Pair_T) p->first;
memcpy(newpair++,oldpair,sizeof(struct Pair_T));
}
Pair_set_genomepos(this_left->pairarray,this_left->npairs,this_left->chroffset,this_left->chrhigh,
this_left->watsonp);
+ this_left->goodness =
+ Pair_fracidentity_array(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
+ &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
+ &ncanonical,&nsemicanonical,&this_left->noncanonical,
+ &min_splice_prob,this_left->pairarray,this_left->npairs,this_left->cdna_direction);
+
newpair = this_right->pairarray = &(this_left->pairarray[this_left->npairs + gaplength]);
this_right->pairarray_freeable_p = false;
+
for (p = printpairs_right; p != NULL; p = p->rest) {
oldpair = (Pair_T) p->first;
memcpy(newpair++,oldpair,sizeof(struct Pair_T));
}
Pair_set_genomepos(this_right->pairarray,this_right->npairs,this_right->chroffset,this_right->chrhigh,
this_right->watsonp);
+ this_right->goodness =
+ Pair_fracidentity_array(&this_right->matches,&this_right->unknowns,&this_right->mismatches,
+ &this_right->qopens,&this_right->qindels,&this_right->topens,&this_right->tindels,
+ &ncanonical,&nsemicanonical,&this_right->noncanonical,
+ &min_splice_prob,this_right->pairarray,this_right->npairs,this_right->cdna_direction);
}
return;
@@ -4408,36 +4502,12 @@ make_pairarrays_chimera (T this_left, T this_right,
#define MAPQ_MAXIMUM_SCORE 40
void
-Stage3_recompute_goodness (List_T stage3list) {
+Stage3_compute_mapq (List_T stage3list) {
T this;
List_T p;
int best_absmq_score;
float total = 0.0, q;
-#ifdef DEPEND_ON_QUALITY
- for (p = stage3list; p != NULL && high_quality_p == false; p = List_next(p)) {
- this = (T) List_head(p);
- if (this->trimmed_coverage > 0.80 && this->defect_rate < 0.10) {
- high_quality_p = true;
- }
- }
-
- /* Subtracts points for non-canonical introns */
- if (high_quality_p == true) {
- for (p = stage3list; p != NULL; p = List_next(p)) {
- this = (T) List_head(p);
- this->goodness = this->matches + MISMATCH*this->mismatches
- + QOPEN*this->qopens + QINDEL*this->qindels + TOPEN*this->topens + TINDEL*this->tindels
- - CANONICAL_POINTS*this->noncanonical;
- }
- } else {
- for (p = stage3list; p != NULL; p = List_next(p)) {
- this = (T) List_head(p);
- this->goodness = this->matches;
- }
- }
-
-#else
if (stage3list != NULL) {
/* Use the first entry to initialize best_absmq_score */
p = stage3list;
@@ -4462,6 +4532,7 @@ Stage3_recompute_goodness (List_T stage3list) {
for (p = stage3list; p != NULL; p = List_next(p)) {
this = (T) List_head(p);
+
if ((q = 1.0 - fasterexp(this->absmq_score) / total) < 1.0e-4 /* 10^-4.0 */) {
this->mapq_score = 40;
} else {
@@ -4473,12 +4544,7 @@ Stage3_recompute_goodness (List_T stage3list) {
this->absmq_score = 0;
}
- this->goodness = this->matches + MISMATCH*this->mismatches
- + QOPEN*this->qopens + QINDEL*this->qindels + TOPEN*this->topens + TINDEL*this->tindels
- - CANONICAL_POINTS*this->noncanonical;
}
-
-#endif
}
@@ -4826,7 +4892,7 @@ initial_cdna_direction (List_T pairs_fwd, List_T pairs_rev,
T
-Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int cdna_direction, int sensedir,
+Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int goodness, int cdna_direction, int sensedir,
int stage2_source, int stage2_indexsize,
int matches, int unknowns, int mismatches, int qopens, int qindels,
int topens, int tindels, int ncanonical, int nsemicanonical, int nnoncanonical,
@@ -4855,7 +4921,7 @@ Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int cdna_directi
new->tindels = tindels;
new->noncanonical = nsemicanonical + nnoncanonical;
- new->goodness = 0;
+ new->goodness = goodness;
#ifdef PMAP
/* Should be +1 */
@@ -4912,6 +4978,9 @@ Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int cdna_directi
new->trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trimlength + skiplength);
+ debug(printf("Creating stage3 at chr %d:%u..%u, goodness %d, matches %d, npairs %d\n",
+ chrnum,Stage3_chrstart(new),Stage3_chrend(new),new->goodness,new->matches,new->npairs));
+
if (straintype == 0) {
return new;
} else {
@@ -6877,7 +6946,8 @@ traverse_single_gap (bool *filledp, int *dynprogindex, List_T pairs, List_T *pat
}
debug(printf("queryjump = %d, genomejump = %d, Orig score: %d\n",queryjump,genomejump,origscore));
- if (abs(queryjump - genomejump) <= 3) {
+ if (0 && abs(queryjump - genomejump) <= 3) {
+ /* This leads to bad CIGAR strings */
debug(printf("Minor difference in queryjump and genomejump, so accepting this solution\n"));
pairs = Pairpool_transfer(pairs,gappairs);
*filledp = true;
@@ -7833,11 +7903,6 @@ distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
List_T *pairs, int leftquerypos, int leftgenomepos, Pair_T rightpair,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
int cdna_direction, bool watsonp, bool jump_late_p, Pairpool_T pairpool,
Dynprog_T dynprog, int maxpeelback, int extramaterial_end,
@@ -7929,11 +7994,6 @@ distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
&(queryseq_ptr[querydp3_medialgap]),&(queryuc_ptr[querydp3_medialgap]),
queryjump,genomejump,querydp3_medialgap,genomedp3_medialgap,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,querylength,query_compress,
-#endif
-#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
if (*ambig_end_length > 0) {
@@ -7995,11 +8055,6 @@ extend_ending5 (bool *knownsplicep, int *dynprogindex_minor,
List_T *pairs, int leftquerypos, int leftgenomepos, Pair_T rightpair,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
int cdna_direction, bool watsonp, bool jump_late_p, Pairpool_T pairpool,
Dynprog_T dynprog, int maxpeelback, int extramaterial_end,
@@ -8059,11 +8114,6 @@ extend_ending5 (bool *knownsplicep, int *dynprogindex_minor,
&(queryseq_ptr[querydp3_distalgap]),&(queryuc_ptr[querydp3_distalgap]),
queryjump,genomejump,querydp3_distalgap,genomedp3_distalgap,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,querylength,query_compress,
-#endif
-#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
if (*ambig_end_length > 0) {
@@ -8112,11 +8162,6 @@ distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
List_T *path, Pair_T leftpair, int rightquerypos, int querylength,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
int cdna_direction, bool watsonp, bool jump_late_p,
Pairpool_T pairpool, Dynprog_T dynprog, int maxpeelback, int extramaterial_end,
@@ -8213,11 +8258,6 @@ distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
&(queryseq_ptr[querydp5_medialgap]),&(queryuc_ptr[querydp5_medialgap]),
queryjump,genomejump,querydp5_medialgap,genomedp5_medialgap,
querylength,chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,querylength,query_compress,
-#endif
-#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
if (*ambig_end_length > 0) {
@@ -8279,11 +8319,6 @@ extend_ending3 (bool *knownsplicep, int *dynprogindex_minor, int *finalscore,
List_T *path, Pair_T leftpair, int rightquerypos,
int querylength, Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
int cdna_direction, bool watsonp, bool jump_late_p,
Pairpool_T pairpool, Dynprog_T dynprog, int maxpeelback, int extramaterial_end,
@@ -8343,11 +8378,6 @@ extend_ending3 (bool *knownsplicep, int *dynprogindex_minor, int *finalscore,
&(queryseq_ptr[querydp5_distalgap]),&(queryuc_ptr[querydp5_distalgap]),
queryjump,genomejump,querydp5_distalgap,genomedp5_distalgap,
querylength,chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,querylength,query_compress,
-#endif
-#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
if (*ambig_end_length > 0) {
@@ -8670,11 +8700,6 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
bool *chop_exon_p, int *dynprogindex_minor,
List_T path, Univcoord_T chroffset, Univcoord_T chrhigh, int querylength,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
int cdna_direction, bool watsonp, bool jump_late_p, int maxpeelback,
int maxpeelback_distalmedial, int nullgap,
@@ -8743,11 +8768,6 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&path,leftpair,rightquerypos,querylength,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,pairpool,dynprogL,maxpeelback,
extramaterial_end,extraband_end,defect_rate,endalign);
@@ -8759,11 +8779,6 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
&finalscore,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&path,leftpair,rightquerypos,querylength,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,pairpool,dynprogL,maxpeelback_distalmedial,
extramaterial_end,extraband_end,defect_rate);
@@ -8797,11 +8812,6 @@ build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *amb
bool *chop_exon_p, int *dynprogindex_minor, List_T pairs,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
int cdna_direction, bool watsonp, bool jump_late_p, int maxpeelback,
int maxpeelback_distalmedial, int nullgap,
@@ -8866,11 +8876,6 @@ build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *amb
&finalscore,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&pairs,leftquerypos,/*leftgenomepos*/-1,rightpair,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,pairpool,dynprogR,maxpeelback,
extramaterial_end,extraband_end,defect_rate,endalign);
@@ -8882,11 +8887,6 @@ build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *amb
&finalscore,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&pairs,leftquerypos,/*leftgenomepos*/-1,rightpair,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,pairpool,dynprogR,maxpeelback_distalmedial,
extramaterial_end,extraband_end,defect_rate);
@@ -9437,8 +9437,8 @@ score_introns (double *max_intron_score, double *avg_donor_score, double *avg_ac
double donor_score, acceptor_score;
int nintrons = 0;
int i;
- int intron_matches, intron_denominator, total_matches, total_denominator;
- double theta;
+ int total_matches, total_denominator;
+ int max_neighborhood_score, neighborhood_score;
#if 0
char gbuffer1[MAXENT_MAXLENGTH];
#endif
@@ -9501,43 +9501,41 @@ score_introns (double *max_intron_score, double *avg_donor_score, double *avg_ac
path = Pairpool_pop(path,&pair);
/* Look at right neighborhood */
- intron_matches = intron_denominator = 0;
+ max_neighborhood_score = neighborhood_score = 0;
for (p = pairs, i = 0; p != NULL && i < 25 && ((Pair_T) (p->first))->gapp == false; p = p->rest, i++) {
rightpair = p->first;
if (rightpair->comp == MATCH_COMP || rightpair->comp == DYNPROG_MATCH_COMP || rightpair->comp == AMBIGUOUS_COMP) {
- intron_matches++;
+ neighborhood_score += 1;
+ } else {
+ neighborhood_score -= 3;
+ }
+ if (neighborhood_score > max_neighborhood_score) {
+ max_neighborhood_score = neighborhood_score;
}
- intron_denominator++;
}
- theta = (double) (total_matches - intron_matches)/(double) (total_denominator - intron_denominator + 1);
- if (theta > 1.0) {
- theta = 1.0;
- }
- debug11(printf("right neighborhood: intron_matches %d, intron_denominator %d, theta %f => pvalue %g\n",
- intron_matches,intron_denominator,theta,Pbinom(intron_matches,intron_denominator,theta)));
- if (Pbinom(intron_matches,intron_denominator,theta) < 0.05) { /* was 1e-3 */
+ debug11(printf("right neighborhood: max_neighborhood_score %d\n",max_neighborhood_score));
+ if (max_neighborhood_score < 6) {
/* Not a good intron */
/* *nbadintrons += 1; */
} else {
/* Look at left neighborhood */
- intron_matches = intron_denominator = 0;
+ max_neighborhood_score = neighborhood_score = 0;
for (p = path, i = 0; p != NULL && i < 25 && ((Pair_T) (p->first))->gapp == false; p = p->rest, i++) {
leftpair = p->first;
if (leftpair->comp == MATCH_COMP || leftpair->comp == DYNPROG_MATCH_COMP || leftpair->comp == AMBIGUOUS_COMP) {
- intron_matches++;
+ neighborhood_score += 1;
+ } else {
+ neighborhood_score -= 3;
+ }
+ if (neighborhood_score > max_neighborhood_score) {
+ max_neighborhood_score = neighborhood_score;
}
- intron_denominator++;
}
- theta = (double) (total_matches - intron_matches)/(double) (total_denominator - intron_denominator + 1);
- if (theta > 1.0) {
- theta = 1.0;
- }
- debug11(printf("left neighborhood: intron_matches %d, intron_denominator %d, theta %f => pvalue %g\n",
- intron_matches,intron_denominator,theta,Pbinom(intron_matches,intron_denominator,theta)));
- if (Pbinom(intron_matches,intron_denominator,theta) < 0.05) { /* was 1e-3 */
+ debug11(printf("left neighborhood: max_neighborhood_score %d\n",max_neighborhood_score));
+ if (max_neighborhood_score < 6) {
/* Not a good intron */
/* *nbadintrons += 1; */
@@ -10678,12 +10676,6 @@ score_nconsecutive (List_T pairs) {
static List_T
path_compute_dir (double *defect_rate, List_T pairs,
int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
-
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
#ifdef PMAP
char *queryaaseq_ptr,
#endif
@@ -10955,7 +10947,7 @@ path_compute_dir (double *defect_rate, List_T pairs,
#ifdef GSNAP
/* Too expensive to loop */
dual_break_p = false;
- filterp = false;
+ /* filterp = false; */
#endif
iter0++;
debug(printf("At end of outer loop: filterp %d, dual_break_p %d\n",filterp,dual_break_p));
@@ -10970,11 +10962,6 @@ static List_T
path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, double *ambig_prob_5,
double defect_rate, List_T pairs, int cdna_direction,
bool watsonp, int genestrand, bool jump_late_p, int querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
@@ -11038,11 +11025,6 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, do
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11086,11 +11068,6 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, do
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11113,11 +11090,6 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, do
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11134,11 +11106,6 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, do
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11157,11 +11124,6 @@ static List_T
path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, double *ambig_prob_3,
double defect_rate, List_T path, int cdna_direction,
bool watsonp, int genestrand, bool jump_late_p, int querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
char *queryseq_ptr, char *queryuc_ptr,
Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
@@ -11225,11 +11187,6 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, do
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11272,11 +11229,6 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, do
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11298,11 +11250,6 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, do
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11319,11 +11266,6 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, do
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
- #ifdef GSNAP
- #ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
- #endif
- #endif
queryseq_ptr,queryuc_ptr,
cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11340,11 +11282,6 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, do
static List_T
path_compute_final (double defect_rate, List_T pairs, int cdna_direction, bool watsonp, int genestrand,
bool jump_late_p, int querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
#ifdef PMAP
char *queryaaseq_ptr,
#endif
@@ -11425,6 +11362,7 @@ path_compute_final (double defect_rate, List_T pairs, int cdna_direction, bool w
+#ifdef GSNAP
static List_T
trim_novel_spliceends (List_T pairs,
int *ambig_end_length_5, int *ambig_end_length_3,
@@ -11833,6 +11771,7 @@ trim_novel_spliceends (List_T pairs,
return pairs;
}
+#endif
@@ -11841,13 +11780,7 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
double *ambig_prob_5, double *ambig_prob_3,
List_T pairs, int *cdna_direction, int *sensedir, bool watsonp, bool jump_late_p,
- int querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
- char *queryseq_ptr, char *queryuc_ptr,
+ int querylength, char *queryseq_ptr, char *queryuc_ptr,
Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
int maxpeelback, int maxpeelback_distalmedial, int nullgap,
@@ -11909,11 +11842,6 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
*cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11932,11 +11860,6 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,
*cdna_direction,watsonp,jump_late_p,
maxpeelback,maxpeelback_distalmedial,
@@ -11969,7 +11892,7 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
/* Using alloca for last_genomedp5 and last_genomedp3 can cause stack overflow */
struct Pair_T *
-Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sensedir,
+Stage3_compute (List_T *finalpairs, int *npairs, int *goodness, int *cdna_direction, int *sensedir,
int *matches, int *nmatches_posttrim, int *max_match_length,
int *ambig_end_length_5, int *ambig_end_length_3,
Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
@@ -11977,11 +11900,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
int *unknowns, int *mismatches, int *qopens, int *qindels, int *topens, int *tindels,
int *ncanonical, int *nsemicanonical, int *nnoncanonical, double *min_splice_prob,
Stage2_T stage2,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
#ifdef PMAP
char *queryaaseq_ptr,
#endif
@@ -12066,11 +11984,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
last_genomedp3_fwd = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
path_fwd = path_compute_dir(&defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
watsonp,genestrand,jump_late_p,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12096,11 +12009,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
last_genomedp3_rev = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
path_rev = path_compute_dir(&defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
watsonp,genestrand,jump_late_p,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12179,11 +12087,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
best_path = path_compute_end3(&fwd_ambig_end_length_3,&fwd_ambig_splicetype_3,&fwd_ambig_prob_3,
defect_rate_fwd,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12213,11 +12116,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
copy = (List_T) List_head(p);
path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/+1,
watsonp,genestrand,jump_late_p,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12232,11 +12130,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
defect_rate_temp,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12267,11 +12160,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
best_pairs = path_compute_end5(&fwd_ambig_end_length_5,&fwd_ambig_splicetype_5,&fwd_ambig_prob_5,
defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,watsonp,genestrand,
jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12301,11 +12189,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
copy = (List_T) List_head(p);
path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/+1,
watsonp,genestrand,jump_late_p,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12320,11 +12203,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
defect_rate_temp,/*pairs*/List_reverse(path_fwd),
/*cdna_direction*/+1,watsonp,genestrand,jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12365,11 +12243,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
best_path = path_compute_end3(&rev_ambig_end_length_3,&rev_ambig_splicetype_3,&rev_ambig_prob_3,
defect_rate_rev,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12391,11 +12264,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
copy = (List_T) List_head(p);
path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/-1,
watsonp,genestrand,jump_late_p,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12410,11 +12278,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
defect_rate_temp,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12445,11 +12308,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
best_pairs = path_compute_end5(&rev_ambig_end_length_5,&rev_ambig_splicetype_5,&rev_ambig_prob_5,
defect_rate_rev,pairs_rev,/*cdna_direction*/-1,watsonp,genestrand,
jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12471,11 +12329,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
copy = (List_T) List_head(p);
path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/-1,
watsonp,genestrand,jump_late_p,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12490,11 +12343,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
defect_rate_temp,/*pairs*/List_reverse(path_rev),
/*cdna_direction*/-1,watsonp,genestrand,jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
@@ -12538,11 +12386,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
#endif
pairs_fwd = path_compute_final(defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
watsonp,genestrand,jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12558,11 +12401,6 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
pairs_rev = path_compute_final(defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
watsonp,genestrand,jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
#ifdef PMAP
queryaaseq_ptr,
#endif
@@ -12675,6 +12513,7 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
if (pairs_pretrim == NULL) {
*npairs = 0;
+ *goodness = 0;
*nmatches_posttrim = 0;
*ambig_end_length_5 = *ambig_end_length_3 = 0;
*ambig_prob_5 = *ambig_prob_3 = 0.0;
@@ -12707,13 +12546,7 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
&(*ambig_splicetype_5),&(*ambig_splicetype_3),
&(*ambig_prob_5),&(*ambig_prob_3),
pairs_pretrim,&(*cdna_direction),&(*sensedir),watsonp,
- jump_late_p,querylength,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- cutoff_level,queryptr,query_compress,
-#endif
-#endif
- queryseq_ptr,queryuc_ptr,
+ jump_late_p,querylength,queryseq_ptr,queryuc_ptr,
chrnum,chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,maxpeelback_distalmedial,nullgap,
extramaterial_end,extraband_end,
@@ -12727,14 +12560,14 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
/* printf("ambig_end_length = %d, %d\n",*ambig_end_length_5,*ambig_end_length_3); */
- Pair_fracidentity(&(*matches),&(*unknowns),&(*mismatches),
- &(*qopens),&(*qindels),&(*topens),&(*tindels),
- &(*ncanonical),&(*nsemicanonical),&(*nnoncanonical),
- &(*min_splice_prob),*finalpairs,*cdna_direction);
-
pairarray = make_pairarray(&(*npairs),&(*finalpairs),*cdna_direction,*sensedir,watsonp,
pairpool,queryseq_ptr,chroffset,chrhigh,
ngap,query_subseq_offset,skiplength,diagnosticp);
+ *goodness = Pair_fracidentity_array(&(*matches),&(*unknowns),&(*mismatches),
+ &(*qopens),&(*qindels),&(*topens),&(*tindels),
+ &(*ncanonical),&(*nsemicanonical),&(*nnoncanonical),
+ &(*min_splice_prob),pairarray,*npairs,*cdna_direction);
+
debug0(printf("Result (%d pairs): %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels\n",
*npairs,*matches,*mismatches,*qopens,*qindels,*topens,*tindels));
@@ -12958,6 +12791,10 @@ Stage3_extend_right (T this, int goal, int querylength,
bool mismatchp, protectedp;
int n_peeled_indels;
+ int ncanonical, nsemicanonical;
+ double min_splice_prob;
+
+
debug10(printf("Entered Stage3_extend_right with goal %d\n",goal));
debug10(printf("LEFT BEFORE FILL\n"));
debug10(Pair_dump_list(this->pairs,true));
@@ -13123,6 +12960,11 @@ Stage3_extend_right (T this, int goal, int querylength,
this->watsonp,pairpool,queryseq_ptr,
this->chroffset,this->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
/*diagnosticp*/false);
+ this->goodness = Pair_fracidentity_array(&this->matches,&this->unknowns,&this->mismatches,
+ &this->qopens,&this->qindels,&this->topens,&this->tindels,
+ &ncanonical,&nsemicanonical,&this->noncanonical,
+ &min_splice_prob,this->pairarray,this->npairs,this->cdna_direction);
+
if (this->pairarray == NULL) {
this->pairarray_freeable_p = false;
} else {
@@ -13148,6 +12990,10 @@ Stage3_extend_left (T this, int goal,
bool mismatchp, protectedp;
int n_peeled_indels;
+ int ncanonical, nsemicanonical;
+ double min_splice_prob;
+
+
debug10(printf("Entered Stage3_extend_left with goal %d\n",goal));
debug10(printf("RIGHT BEFORE FILL\n"));
debug10(Pair_dump_list(this->pairs,true));
@@ -13311,6 +13157,11 @@ Stage3_extend_left (T this, int goal,
this->watsonp,pairpool,queryseq_ptr,
this->chroffset,this->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
/*diagnosticp*/false);
+ this->goodness = Pair_fracidentity_array(&this->matches,&this->unknowns,&this->mismatches,
+ &this->qopens,&this->qindels,&this->topens,&this->tindels,
+ &ncanonical,&nsemicanonical,&this->noncanonical,
+ &min_splice_prob,this->pairarray,this->npairs,this->cdna_direction);
+
if (this->pairarray == NULL) {
this->pairarray_freeable_p = false;
} else {
@@ -13348,6 +13199,10 @@ merge_local_single (T this_left, T this_right,
List_T path;
bool watsonp, filledp;
+ int ncanonical, nsemicanonical;
+ double min_splice_prob;
+
+
#ifdef EXTRACT_GENOMICSEG
char *genomicseg_ptr = NULL;
#endif
@@ -13508,10 +13363,20 @@ merge_local_single (T this_left, T this_right,
this_left->watsonp,pairpool,queryseq_ptr,
this_left->chroffset,this_left->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
/*diagnosticp*/false);
+ this_left->goodness = Pair_fracidentity_array(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
+ &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
+ &ncanonical,&nsemicanonical,&this_left->noncanonical,
+ &min_splice_prob,this_left->pairarray,this_left->npairs,this_left->cdna_direction);
+
this_right->pairarray = make_pairarray(&this_right->npairs,&this_right->pairs,this_right->cdna_direction,this_right->sensedir,
- this_right->watsonp,pairpool,queryseq_ptr,
- this_right->chroffset,this_right->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
- /*diagnosticp*/false);
+ this_right->watsonp,pairpool,queryseq_ptr,
+ this_right->chroffset,this_right->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
+ /*diagnosticp*/false);
+ this_right->goodness = Pair_fracidentity_array(&this_right->matches,&this_right->unknowns,&this_right->mismatches,
+ &this_right->qopens,&this_right->qindels,&this_right->topens,&this_right->tindels,
+ &ncanonical,&nsemicanonical,&this_right->noncanonical,
+ &min_splice_prob,this_right->pairarray,this_right->npairs,this_right->cdna_direction);
+
} else {
this_left->pairs = List_append(this_left->pairs,this_right->pairs);
this_right->pairs = (List_T) NULL;
@@ -13559,7 +13424,7 @@ recompute_for_cdna_direction (int *cdna_direction, List_T pairs, int genestrand,
sufflookback,nsufflookback,maxintronlen_bound,/*close_indels_mode*/+1,
paired_favor_mode,zero_offset);
pairs_fwd = score_introns(&max_intron_score_fwd,&avg_donor_score_fwd,&avg_acceptor_score_fwd,
- &ncanonical_rev,&nbadintrons_fwd,path_fwd,/*cdna_direction*/+1,watsonp,
+ &ncanonical_fwd,&nbadintrons_fwd,path_fwd,/*cdna_direction*/+1,watsonp,
chrnum,chroffset,chrhigh,
#ifdef WASTE
pairpool,
@@ -13795,7 +13660,6 @@ Stage3_merge_local (T this_left, T this_right,
return false;
}
-#if 0
} else if (intronlength < 0) { /* Was intronlength < -EXTRAQUERYGAP, but this missed some short insertions */
/* If traverse_cdna_gap fails, causes seg faults later on */
/* cDNA gap */
@@ -13851,7 +13715,6 @@ Stage3_merge_local (T this_left, T this_right,
/*diagnosticp*/false,/*new_gap_p*/true) == false) {
return false;
}
-#endif
} else {
/* Single gap */
diff --git a/src/stage3.h b/src/stage3.h
index 74a9f1a..1ef1adc 100644
--- a/src/stage3.h
+++ b/src/stage3.h
@@ -1,4 +1,4 @@
-/* $Id: stage3.h 149319 2014-09-30 02:15:42Z twu $ */
+/* $Id: stage3.h 157977 2015-02-03 18:46:53Z twu $ */
#ifndef STAGE3_INCLUDED
#define STAGE3_INCLUDED
@@ -158,7 +158,7 @@ extern bool
Stage3_overlap (T x, T y);
extern void
-Stage3_recompute_goodness (List_T stage3list);
+Stage3_compute_mapq (List_T stage3list);
extern void
Stage3_recompute_coverage (List_T stage3list, Sequence_T queryseq);
extern void
@@ -255,8 +255,8 @@ Stage3_print_compressed (FILE *fp, T this, Sequence_T queryseq, Univ_IIT_T chrom
bool checksump, int chimerapos, int chimeraequivpos,
double donor_prob, double acceptor_prob, int chimera_cdna_direction);
extern T
-Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int cdna_direction, int sensedir,
- int stage2_source, int stage2_indexsize,
+Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int goodness,
+ int cdna_direction, int sensedir, int stage2_source, int stage2_indexsize,
int matches, int unknowns, int mismatches, int qopens, int qindels,
int topens, int tindels, int ncanonical, int nsemicanonical, int nnoncanonical,
Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
@@ -273,7 +273,7 @@ extern int
Stage3_good_part (struct Pair_T *pairarray, int npairs, int pos5, int pos3);
extern struct Pair_T *
-Stage3_compute (List_T *pairs, int *npairs, int *cdna_direction, int *sensedir,
+Stage3_compute (List_T *pairs, int *npairs, int *goodness, int *cdna_direction, int *sensedir,
int *matches, int *nmatches_posttrim, int *max_match_length,
int *ambig_end_length_5, int *ambig_end_length_3,
Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
@@ -281,11 +281,6 @@ Stage3_compute (List_T *pairs, int *npairs, int *cdna_direction, int *sensedir,
int *unknowns, int *mismatches, int *qopens, int *qindels, int *topens, int *tindels,
int *ncanonical, int *nsemicanonical, int *nnoncanonical, double *min_splice_prob,
Stage2_T stage2,
-#ifdef GSNAP
-#ifdef END_KNOWNSPLICING_SHORTCUT
- int cutoff_level, char *queryptr, Compress_T query_compress,
-#endif
-#endif
#ifdef PMAP
char *queryaaseq_ptr,
#endif
diff --git a/src/stage3hr.c b/src/stage3hr.c
index 366b0ae..cdb27d6 100644
--- a/src/stage3hr.c
+++ b/src/stage3hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3hr.c 154079 2014-11-25 18:31:31Z twu $";
+static char rcsid[] = "$Id: stage3hr.c 160319 2015-03-06 00:19:30Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -189,7 +189,7 @@ static int *tally_divint_crosstable;
static IIT_T runlength_iit;
static int *runlength_divint_crosstable;
-static int terminal_output_minlength;
+static int reject_trimlength;
static int pairmax;
#ifdef USE_BINGO
@@ -230,7 +230,7 @@ Stage3hr_setup (bool invert_first_p_in, bool invert_second_p_in,
IIT_T genes_iit_in, int *genes_divint_crosstable_in,
IIT_T tally_iit_in, int *tally_divint_crosstable_in,
IIT_T runlength_iit_in, int *runlength_divint_crosstable_in,
- int terminal_output_minlength_in, bool distances_observed_p, int pairmax_in,
+ int reject_trimlength_in, bool distances_observed_p, int pairmax_in,
Chrpos_T expected_pairlength, Chrpos_T pairlength_deviation,
int localsplicing_penalty_in, int indel_penalty_middle_in,
int antistranded_penalty_in, bool favor_multiexon_p_in,
@@ -252,7 +252,7 @@ Stage3hr_setup (bool invert_first_p_in, bool invert_second_p_in,
favor_multiexon_p = favor_multiexon_p_in;
gmap_min_nconsecutive = gmap_min_nconsecutive_in;
- terminal_output_minlength = terminal_output_minlength_in;
+ reject_trimlength = reject_trimlength_in;
pairmax = pairmax_in;
if (pairlength_deviation > expected_pairlength) {
expected_pairlength_low = 0;
@@ -507,12 +507,16 @@ struct T {
int nchimera_known;
int nchimera_novel;
- int start_amb_length; /* For splice, shortexon, and GMAP */
+ int start_amb_length; /* For splice, shortexon, and GMAP */
int end_amb_length; /* For splice, shortexon, and GMAP */
int amb_length_donor; /* For shortexon only */
int amb_length_acceptor; /* For shortexon only */
- double start_amb_prob; /* For GMAP currently */
- double end_amb_prob; /* For GMAP currently */
+
+ double start_amb_prob; /* For determining score_eventrim */
+ double end_amb_prob; /* For determining score_eventrim */
+ double amb_prob_donor; /* For shortexon */
+ double amb_prob_acceptor; /* For shortexon */
+
Endtype_T gmap_start_endtype; /* For GMAP, which has no substrings */
Endtype_T gmap_end_endtype; /* For GMAP, which has no substrings */
@@ -533,8 +537,13 @@ struct T {
int *start_amb_nmismatches; /* Pointer to either amb_nmismatches_donor or amb_nmismatches_acceptor */
int *end_amb_nmismatches; /* Pointer to either amb_nmismatches_donor or amb_nmismatches_acceptor */
+ double *start_amb_probs; /* Pointer to either amb_probs_donor or amb_probs_acceptor */
+ double *end_amb_probs; /* Pointer to either amb_probs_donor or amb_probs_acceptor */
+
int *amb_nmismatches_donor;
int *amb_nmismatches_acceptor;
+ double *amb_probs_donor;
+ double *amb_probs_acceptor;
/* Single: substring1 */
@@ -674,7 +683,9 @@ Stage3end_set_improved_by_gmap (T this) {
bool
Stage3end_anomalous_splice_p (T this) {
- if (this->hittype == SAMECHR_SPLICE) {
+ if (this->chrnum == 0) {
+ return true;
+ } else if (this->hittype == SAMECHR_SPLICE) {
return true;
} else {
return false;
@@ -1242,6 +1253,27 @@ Stage3end_amb_length_end (T this) {
return this->end_amb_length;
}
+Univcoord_T *
+Stage3end_start_ambcoords (T this) {	/* Accessor for the start_ambcoords field */
+ return this->start_ambcoords;	/* the stored pointer itself is returned; no copy is made */
+}
+
+Univcoord_T *
+Stage3end_end_ambcoords (T this) {	/* Accessor for the end_ambcoords field */
+ return this->end_ambcoords;	/* the stored pointer itself is returned; no copy is made */
+}
+
+int
+Stage3end_start_nambcoords (T this) {	/* Accessor for the start_nambcoords field */
+ return this->start_nambcoords;	/* presumably the element count of start_ambcoords -- confirm against the constructor */
+}
+
+int
+Stage3end_end_nambcoords (T this) {	/* Accessor for the end_nambcoords field */
+ return this->end_nambcoords;	/* presumably the element count of end_ambcoords -- confirm against the constructor */
+}
+
+
int
Stage3end_gmap_querystart (T this) {
@@ -1264,13 +1296,11 @@ Stage3end_terminal_trim (T this) {
}
int
-Stage3end_terminal_length (T this) {
- assert(this->hittype == TERMINAL);
- return Substring_queryend(this->substring1) - Substring_querystart(this->substring1) + 1;
+Stage3end_trimlength (T this) {
+ return this->trim_left + this->trim_right;	/* sum of the trim_left and trim_right fields; no hittype restriction, unlike the removed Stage3end_terminal_length */
}
-
static Overlap_T
Stage3end_gene_overlap (T this) {
Overlap_T overlap;
@@ -1592,6 +1622,8 @@ Stage3end_free (T *old) {
FREE_OUT((*old)->amb_knowni_acceptor);
FREE_OUT((*old)->amb_nmismatches_donor);
FREE_OUT((*old)->amb_nmismatches_acceptor);
+ FREE_OUT((*old)->amb_probs_donor);
+ FREE_OUT((*old)->amb_probs_acceptor);
if ((*old)->deletion != NULL) {
FREE_OUT((*old)->deletion);
@@ -1636,9 +1668,9 @@ Stage3end_list_free (List_T *values) {
bool
Stage3pair_anomalous_splice_p (Stage3pair_T this) {
- if (this->hit5 != NULL && this->hit5->hittype == SAMECHR_SPLICE) {
+ if (this->hit5 != NULL && (this->hit5->chrnum == 0 || this->hit5->hittype == SAMECHR_SPLICE)) {
return true;
- } else if (this->hit3 != NULL && this->hit3->hittype == SAMECHR_SPLICE) {
+ } else if (this->hit3 != NULL && (this->hit3->chrnum == 0 || this->hit3->hittype == SAMECHR_SPLICE)) {
return true;
} else {
return false;
@@ -1823,14 +1855,14 @@ find_ilengths (int *ilength_low, int *ilength_high, Stage3end_T hit, Univcoord_T
common_genomicpos,hit->substring0,hit->substring1,hit->substring2));
/* Add + 1 when subtracting alignstart, but not when starting from alignend */
if (Substring_overlap_point_trimmed_p(hit->substring0,common_genomicpos)) {
- debug13(printf("substring0\n"));
+ debug13(printf("substring0: %u..%u\n",Substring_alignstart_trim(hit->substring0),Substring_alignend_trim(hit->substring0)));
*ilength_low = (common_genomicpos - Substring_alignstart_trim(hit->substring0) + 1);
*ilength_high = (Substring_alignend_trim(hit->substring0) - common_genomicpos /*+ 1*/)
+ Substring_genomic_alignment_length(hit->substring1)
+ Substring_genomic_alignment_length(hit->substring2);
} else if (Substring_overlap_point_trimmed_p(hit->substring1,common_genomicpos)) {
- debug13(printf("substring1\n"));
+ debug13(printf("substring1: %u..%u\n",Substring_alignstart_trim(hit->substring1),Substring_alignend_trim(hit->substring1)));
*ilength_low = Substring_genomic_alignment_length(hit->substring0) +
common_genomicpos - Substring_alignstart_trim(hit->substring1) + 1;
*ilength_high = (Substring_alignend_trim(hit->substring1) - common_genomicpos /*+ 1*/)
@@ -1840,7 +1872,7 @@ find_ilengths (int *ilength_low, int *ilength_high, Stage3end_T hit, Univcoord_T
}
} else if (Substring_overlap_point_trimmed_p(hit->substring2,common_genomicpos)) {
- debug13(printf("substring2\n"));
+ debug13(printf("substring2: %u..%u\n",Substring_alignstart_trim(hit->substring2),Substring_alignend_trim(hit->substring2)));
*ilength_low = Substring_genomic_alignment_length(hit->substring0) +
Substring_genomic_alignment_length(hit->substring1) +
(common_genomicpos - Substring_alignstart_trim(hit->substring2) + 1);
@@ -1856,28 +1888,27 @@ find_ilengths (int *ilength_low, int *ilength_high, Stage3end_T hit, Univcoord_T
} else {
debug13(printf("minus. Checking common genomicpos %llu against substring0 %p, substring1 %p, substring2 %p\n",
common_genomicpos,hit->substring0,hit->substring1,hit->substring2));
- /* Add + 1 when starting from alignstart, but not when subtracting alignend */
if (Substring_overlap_point_trimmed_p(hit->substring0,common_genomicpos)) {
- debug13(printf("substring0\n"));
+ debug13(printf("substring0: %u..%u\n",Substring_alignstart_trim(hit->substring0),Substring_alignend_trim(hit->substring0)));
*ilength_low = Substring_genomic_alignment_length(hit->substring2) +
Substring_genomic_alignment_length(hit->substring1) +
- (common_genomicpos - Substring_alignend_trim(hit->substring0) /*+ 1*/);
- *ilength_high = (Substring_alignstart_trim(hit->substring0) - common_genomicpos + 1);
+ (common_genomicpos - Substring_alignend_trim(hit->substring0) + 1);
+ *ilength_high = (Substring_alignstart_trim(hit->substring0) - common_genomicpos /*+ 1*/);
} else if (Substring_overlap_point_trimmed_p(hit->substring1,common_genomicpos)) {
- debug13(printf("substring1\n"));
+ debug13(printf("substring1: %u..%u\n",Substring_alignstart_trim(hit->substring1),Substring_alignend_trim(hit->substring1)));
*ilength_low = Substring_genomic_alignment_length(hit->substring2) +
- (common_genomicpos - Substring_alignend_trim(hit->substring1) /*+ 1*/);
- *ilength_high = (Substring_alignstart_trim(hit->substring1) - common_genomicpos + 1)
+ (common_genomicpos - Substring_alignend_trim(hit->substring1) + 1);
+ *ilength_high = (Substring_alignstart_trim(hit->substring1) - common_genomicpos /*+ 1*/)
+ Substring_genomic_alignment_length(hit->substring0);
if (hit->hittype == INSERTION) {
*ilength_low += hit->nindels;
}
} else if (Substring_overlap_point_trimmed_p(hit->substring2,common_genomicpos)) {
- debug13(printf("substring2\n"));
- *ilength_low = (common_genomicpos - Substring_alignend_trim(hit->substring2) /*+ 1*/);
- *ilength_high = (Substring_alignstart_trim(hit->substring2) - common_genomicpos + 1)
+ debug13(printf("substring2: %u..%u\n",Substring_alignstart_trim(hit->substring2),Substring_alignend_trim(hit->substring2)));
+ *ilength_low = (common_genomicpos - Substring_alignend_trim(hit->substring2) + 1);
+ *ilength_high = (Substring_alignstart_trim(hit->substring2) - common_genomicpos /*+ 1*/)
+ Substring_genomic_alignment_length(hit->substring1)
+ Substring_genomic_alignment_length(hit->substring0);
if (hit->hittype == INSERTION) {
@@ -2511,6 +2542,9 @@ Stage3pair_overlap (int *hardclip5_low, int *hardclip5_high, int *hardclip3_low,
return 0;
} else if (hit3->hittype == SAMECHR_SPLICE || hit3->hittype == TRANSLOC_SPLICE) {
return 0;
+ } else if (hit5->plusp != hit3->plusp) {
+ debug13(printf("The two ends are not on the same strand, so returning 0\n"));
+ return 0;
} else {
debug13(printf("hit5 trim_left %d + amb_start %d, trim_right %d + amb_end %d, hit3 trim_left %d + amb_start %d, trim_right %d + amb_end %d\n",
hit5->trim_left,hit5->start_amb_length,hit5->trim_right,hit5->end_amb_length,
@@ -2539,11 +2573,19 @@ Stage3pair_overlap (int *hardclip5_low, int *hardclip5_high, int *hardclip3_low,
common_left = (ilength5_low < ilength3_low) ? ilength5_low : ilength3_low;
common_right = (ilength5_high < ilength3_high) ? ilength5_high : ilength3_high;
- common_shift = (common_right - common_left)/2;
- debug13(printf("Common_shift is %d\n",common_shift));
+ if (common_right > common_left) {
+ common_shift = common_right/2 - (common_left - 1)/2;
+ debug13(printf("Common shift is %d = common_right %d/2 - (common_left %d - 1)/2\n",
+ common_shift,common_right,common_left));
+ } else {
+ common_shift = (common_right - 1)/2 - common_left/2;
+ debug13(printf("Common shift is %d = (common_right %d - 1)/2 - common_left %d/2\n",
+ common_shift,common_right,common_left));
+ }
- if ((ilength53 = ilength5_low + ilength3_high - 1) > (ilength35 = ilength3_low + ilength5_high - 1)) {
- debug13(printf("plus, ilength53 is longer\n"));
+ if ((ilength53 = ilength5_low + ilength3_high - 1) >= (ilength35 = ilength3_low + ilength5_high - 1)) {
+ /* Use >=, not >, so we favor clipping heads over clipping tails in case of a tie */
+ debug13(printf("plus, ilength53 is longer. Clipping heads.\n"));
if ((overlap = totallength - ilength53) < 0) {
debug13(printf("Overlap %d is negative, so returning 0\n",overlap));
return 0;
@@ -2576,7 +2618,7 @@ Stage3pair_overlap (int *hardclip5_low, int *hardclip5_high, int *hardclip3_low,
*hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
} else {
- debug13(printf("plus, ilength35 is longer\n"));
+ debug13(printf("plus, ilength35 is longer. Clipping tails.\n"));
if ((overlap = totallength - ilength35) < 0) {
debug13(printf("Overlap %d is negative, so returning 0\n",overlap));
return 0;
@@ -2637,11 +2679,19 @@ Stage3pair_overlap (int *hardclip5_low, int *hardclip5_high, int *hardclip3_low,
common_left = (ilength5_low < ilength3_low) ? ilength5_low : ilength3_low;
common_right = (ilength5_high < ilength3_high) ? ilength5_high : ilength3_high;
- common_shift = (common_right - common_left)/2;
- debug13(printf("Common shift is %d\n",common_shift));
+ if (common_right > common_left) {
+ common_shift = common_right/2 - (common_left - 1)/2;
+ debug13(printf("Common shift is %d = common_right %d/2 - (common_left %d - 1)/2\n",
+ common_shift,common_right,common_left));
+ } else {
+ common_shift = (common_right - 1)/2 - common_left/2;
+ debug13(printf("Common shift is %d = (common_right %d - 1)/2 - common_left %d/2\n",
+ common_shift,common_right,common_left));
+ }
if ((ilength53 = ilength5_low + ilength3_high - 1) > (ilength35 = ilength3_low + ilength5_high - 1)) {
- debug13(printf("minus, ilength53 is longer\n"));
+ /* Use >, not >=, so we favor clipping heads over clipping tails in case of a tie */
+ debug13(printf("minus, ilength53 is longer. Clipping tails.\n"));
if ((overlap = totallength - ilength53) < 0) {
debug13(printf("Overlap %d is negative, so returning 0\n",overlap));
return 0;
@@ -2674,7 +2724,7 @@ Stage3pair_overlap (int *hardclip5_low, int *hardclip5_high, int *hardclip3_low,
*hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
} else {
- debug13(printf("minus, ilength35lh %d is longer than ilength53lh %d\n",ilength35,ilength53));
+ debug13(printf("minus, ilength35 is longer. Clipping heads.\n"));
if ((overlap = totallength - ilength35) < 0) {
debug13(printf("Overlap %d is negative, so returning 0\n",overlap));
return 0;
@@ -2917,15 +2967,19 @@ Stage3end_copy (T old) {
new->start_amb_length = old->start_amb_length;
new->end_amb_length = old->end_amb_length;
- new->start_amb_prob = old->start_amb_prob;
- new->end_amb_prob = old->end_amb_prob;
new->amb_length_donor = old->amb_length_donor;
new->amb_length_acceptor = old->amb_length_acceptor;
+ new->start_amb_prob = old->start_amb_prob;
+ new->end_amb_prob = old->end_amb_prob;
+ new->amb_prob_donor = old->amb_prob_donor;
+ new->amb_prob_acceptor = old->amb_prob_acceptor;
+
if ((new->nambcoords_donor = old->nambcoords_donor) == 0) {
new->ambcoords_donor = (Univcoord_T *) NULL;
new->amb_knowni_donor = (int *) NULL;
new->amb_nmismatches_donor = (int *) NULL;
+ new->amb_probs_donor = (double *) NULL;
} else {
new->ambcoords_donor = (Univcoord_T *) CALLOC_OUT(old->nambcoords_donor,sizeof(Univcoord_T));
memcpy(new->ambcoords_donor,old->ambcoords_donor,old->nambcoords_donor*sizeof(Univcoord_T));
@@ -2933,12 +2987,15 @@ Stage3end_copy (T old) {
memcpy(new->amb_knowni_donor,old->amb_knowni_donor,old->nambcoords_donor*sizeof(int));
new->amb_nmismatches_donor = (int *) CALLOC_OUT(old->nambcoords_donor,sizeof(int));
memcpy(new->amb_nmismatches_donor,old->amb_nmismatches_donor,old->nambcoords_donor*sizeof(int));
+ new->amb_probs_donor = (double *) CALLOC_OUT(old->nambcoords_donor,sizeof(double));
+ memcpy(new->amb_probs_donor,old->amb_probs_donor,old->nambcoords_donor*sizeof(double));
}
if ((new->nambcoords_acceptor = old->nambcoords_acceptor) == 0) {
new->ambcoords_acceptor = (Univcoord_T *) NULL;
new->amb_knowni_acceptor = (int *) NULL;
new->amb_nmismatches_acceptor = (int *) NULL;
+ new->amb_probs_acceptor = (double *) NULL;
} else {
new->ambcoords_acceptor = (Univcoord_T *) CALLOC_OUT(old->nambcoords_acceptor,sizeof(Univcoord_T));
memcpy(new->ambcoords_acceptor,old->ambcoords_acceptor,old->nambcoords_acceptor*sizeof(Univcoord_T));
@@ -2946,6 +3003,8 @@ Stage3end_copy (T old) {
memcpy(new->amb_knowni_acceptor,old->amb_knowni_acceptor,old->nambcoords_acceptor*sizeof(int));
new->amb_nmismatches_acceptor = (int *) CALLOC_OUT(old->nambcoords_acceptor,sizeof(int));
memcpy(new->amb_nmismatches_acceptor,old->amb_nmismatches_acceptor,old->nambcoords_acceptor*sizeof(int));
+ new->amb_probs_acceptor = (double *) CALLOC_OUT(old->nambcoords_acceptor,sizeof(double));
+ memcpy(new->amb_probs_acceptor,old->amb_probs_acceptor,old->nambcoords_acceptor*sizeof(double));
}
if (old->sensedir == SENSE_FORWARD) {
@@ -2953,22 +3012,26 @@ Stage3end_copy (T old) {
new->start_nambcoords = new->nambcoords_donor;
new->start_amb_knowni = new->amb_knowni_donor;
new->start_amb_nmismatches = new->amb_nmismatches_donor;
+ new->start_amb_probs = new->amb_probs_donor;
new->end_ambcoords = new->ambcoords_acceptor;
new->end_nambcoords = new->nambcoords_acceptor;
new->end_amb_knowni = new->amb_knowni_acceptor;
new->end_amb_nmismatches = new->amb_nmismatches_acceptor;
+ new->end_amb_probs = new->amb_probs_acceptor;
} else {
new->start_ambcoords = new->ambcoords_acceptor;
new->start_nambcoords = new->nambcoords_acceptor;
new->start_amb_knowni = new->amb_knowni_acceptor;
new->start_amb_nmismatches = new->amb_nmismatches_acceptor;
+ new->start_amb_probs = new->amb_probs_acceptor;
new->end_ambcoords = new->ambcoords_donor;
new->end_nambcoords = new->nambcoords_donor;
new->end_amb_knowni = new->amb_knowni_donor;
new->end_amb_nmismatches = new->amb_nmismatches_donor;
+ new->end_amb_probs = new->amb_probs_donor;
}
@@ -3315,6 +3378,8 @@ Stage3end_new_exact (int *found_score, Univcoord_T left, int genomiclength, Comp
new->amb_knowni_donor = new->amb_knowni_acceptor = (int *) NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = (int *) NULL;
new->amb_nmismatches_donor = new->amb_nmismatches_acceptor = (int *) NULL;
+ new->start_amb_probs = new->end_amb_probs = (double *) NULL;
+ new->amb_probs_donor = new->amb_probs_acceptor = (double *) NULL;
new->start_nambcoords = new->end_nambcoords = 0;
new->nambcoords_donor = new->nambcoords_acceptor = 0;
new->nchimera_known = 0;
@@ -3465,6 +3530,9 @@ Stage3end_new_substitution (int *found_score, int nmismatches_whole, Univcoord_T
new->amb_knowni_donor = new->amb_knowni_acceptor = (int *) NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = (int *) NULL;
new->amb_nmismatches_donor = new->amb_nmismatches_acceptor = (int *) NULL;
+ new->start_amb_probs = new->end_amb_probs = (double *) NULL;
+ new->amb_probs_donor = new->amb_probs_acceptor = (double *) NULL;
+
new->start_nambcoords = new->end_nambcoords = 0;
new->nambcoords_donor = new->nambcoords_acceptor = 0;
new->nchimera_known = 0;
@@ -3662,6 +3730,9 @@ Stage3end_new_insertion (int *found_score, int nindels, int indel_pos, int nmism
new->amb_knowni_donor = new->amb_knowni_acceptor = (int *) NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = (int *) NULL;
new->amb_nmismatches_donor = new->amb_nmismatches_acceptor = (int *) NULL;
+ new->start_amb_probs = new->end_amb_probs = (double *) NULL;
+ new->amb_probs_donor = new->amb_probs_acceptor = (double *) NULL;
+
new->start_nambcoords = new->end_nambcoords = 0;
new->nambcoords_donor = new->nambcoords_acceptor = 0;
new->nchimera_known = 0;
@@ -3891,6 +3962,9 @@ Stage3end_new_deletion (int *found_score, int nindels, int indel_pos, int nmisma
new->amb_knowni_donor = new->amb_knowni_acceptor = (int *) NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = (int *) NULL;
new->amb_nmismatches_donor = new->amb_nmismatches_acceptor = (int *) NULL;
+ new->start_amb_probs = new->end_amb_probs = (double *) NULL;
+ new->amb_probs_donor = new->amb_probs_acceptor = (double *) NULL;
+
new->start_nambcoords = new->end_nambcoords = 0;
new->nambcoords_donor = new->nambcoords_acceptor = 0;
new->nchimera_known = 0;
@@ -3914,7 +3988,7 @@ Stage3end_new_deletion (int *found_score, int nindels, int indel_pos, int nmisma
T
Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_acceptor,
Substring_T donor, Substring_T acceptor, Chrpos_T distance,
- bool shortdistancep, int splicing_penalty, int querylength, int amb_length,
+ bool shortdistancep, int splicing_penalty, int querylength, int amb_length, double amb_prob,
#ifdef LARGE_GENOMES
Uint8list_T ambcoords_donor, Uint8list_T ambcoords_acceptor,
#else
@@ -3922,6 +3996,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
#endif
Intlist_T amb_knowni_donor, Intlist_T amb_knowni_acceptor,
Intlist_T amb_nmismatches_donor, Intlist_T amb_nmismatches_acceptor,
+ Doublelist_T amb_probs_donor, Doublelist_T amb_probs_acceptor,
bool copy_donor_p, bool copy_acceptor_p, bool first_read_p, int sensedir,
bool sarrayp) {
T new;
@@ -3934,7 +4009,9 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new = (T) MALLOC_OUT(sizeof(*new));
debug0(printf("Stage3end_new_splice %p with sensedir %d, donor substring %p and acceptor substring %p, and amb_length %d\n",
new,sensedir,donor,acceptor,amb_length));
+#if 0
assert(Substring_match_length_orig(donor) + Substring_match_length_orig(acceptor) + amb_length == querylength);
+#endif
new->deletion = (char *) NULL;
new->querylength_adj = new->querylength = querylength;
@@ -4109,6 +4186,8 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->amb_knowni_acceptor = Intlist_to_array_out(&ignore,amb_knowni_acceptor);
new->amb_nmismatches_donor = Intlist_to_array_out(&ignore,amb_nmismatches_donor);
new->amb_nmismatches_acceptor = Intlist_to_array_out(&ignore,amb_nmismatches_acceptor);
+ new->amb_probs_donor = Doublelist_to_array_out(&ignore,amb_probs_donor);
+ new->amb_probs_acceptor = Doublelist_to_array_out(&ignore,amb_probs_acceptor);
if (sensedir == SENSE_FORWARD) {
@@ -4118,19 +4197,21 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = true;
new->start_amb_length = amb_length;
- new->start_amb_prob = 5.0;
+ new->start_amb_prob = amb_prob;
new->start_ambcoords = new->ambcoords_donor;
new->start_nambcoords = new->nambcoords_donor;
new->start_amb_knowni = new->amb_knowni_donor;
new->start_amb_nmismatches = new->amb_nmismatches_donor;
+ new->start_amb_probs = new->amb_probs_donor;
new->end_ambiguous_p = false;
new->end_amb_length = 0;
- new->end_amb_prob = 5.0;
+ new->end_amb_prob = 0.0;
new->end_ambcoords = NULL;
new->end_nambcoords = 0;
new->end_amb_knowni = NULL;
new->end_amb_nmismatches = NULL;
+ new->end_amb_probs = NULL;
} else if (acceptor == NULL) {
new->genomicstart = Substring_genomicstart(donor);
@@ -4138,19 +4219,21 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->end_ambiguous_p = true;
new->end_amb_length = amb_length;
- new->end_amb_prob = 5.0;
+ new->end_amb_prob = amb_prob;
new->end_ambcoords = new->ambcoords_acceptor;
new->end_nambcoords = new->nambcoords_acceptor;
new->end_amb_knowni = new->amb_knowni_acceptor;
new->end_amb_nmismatches = new->amb_nmismatches_acceptor;
+ new->end_amb_probs = new->amb_probs_acceptor;
new->start_ambiguous_p = false;
new->start_amb_length = 0;
- new->start_amb_prob = 5.0;
+ new->start_amb_prob = 0.0;
new->start_ambcoords = NULL;
new->start_nambcoords = 0;
new->start_amb_knowni = NULL;
new->start_amb_nmismatches = NULL;
+ new->start_amb_probs = NULL;
} else {
new->genomicstart = Substring_genomicstart(donor);
@@ -4158,11 +4241,12 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = new->end_ambiguous_p = false;
new->start_amb_length = new->end_amb_length = 0;
- new->start_amb_prob = new->end_amb_prob = 5.0;
+ new->start_amb_prob = new->end_amb_prob = 0.0;
new->start_ambcoords = new->end_ambcoords = NULL;
new->start_nambcoords = new->end_nambcoords = 0;
new->start_amb_knowni = new->end_amb_knowni = NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = NULL;
+ new->start_amb_probs = new->end_amb_probs = NULL;
}
} else {
@@ -4172,19 +4256,21 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->end_ambiguous_p = true;
new->end_amb_length = amb_length;
- new->end_amb_prob = 5.0;
+ new->end_amb_prob = amb_prob;
new->end_ambcoords = new->ambcoords_donor;
new->end_nambcoords = new->nambcoords_donor;
new->end_amb_knowni = new->amb_knowni_donor;
new->end_amb_nmismatches = new->amb_nmismatches_donor;
+ new->end_amb_probs = new->amb_probs_donor;
new->start_ambiguous_p = false;
new->start_amb_length = 0;
- new->start_amb_prob = 5.0;
+ new->start_amb_prob = 0.0;
new->start_ambcoords = NULL;
new->start_nambcoords = 0;
new->start_amb_knowni = NULL;
new->start_amb_nmismatches = NULL;
+ new->start_amb_probs = NULL;
} else if (acceptor == NULL) {
new->genomicstart = Substring_genomicstart(donor);
@@ -4192,31 +4278,34 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = true;
new->start_amb_length = amb_length;
- new->start_amb_prob = 5.0;
+ new->start_amb_prob = amb_prob;
new->start_ambcoords = new->ambcoords_acceptor;
new->start_nambcoords = new->nambcoords_acceptor;
new->start_amb_knowni = new->amb_knowni_acceptor;
new->start_amb_nmismatches = new->amb_nmismatches_acceptor;
+ new->start_amb_probs = new->amb_probs_acceptor;
new->end_ambiguous_p = false;
new->end_amb_length = 0;
- new->end_amb_prob = 5.0;
+ new->end_amb_prob = 0.0;
new->end_ambcoords = NULL;
new->end_nambcoords = 0;
new->end_amb_knowni = NULL;
new->end_amb_nmismatches = NULL;
+ new->end_amb_probs = NULL;
} else {
new->genomicstart = Substring_genomicstart(acceptor);
new->genomicend = Substring_genomicend(donor);
new->start_amb_length = new->end_amb_length = 0;
- new->start_amb_prob = new->end_amb_prob = 5.0;
+ new->start_amb_prob = new->end_amb_prob = 0.0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
new->start_ambcoords = new->end_ambcoords = NULL;
new->start_nambcoords = new->end_nambcoords = 0;
new->start_amb_knowni = new->end_amb_knowni = NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = NULL;
+ new->start_amb_probs = new->end_amb_probs = NULL;
}
}
@@ -4230,8 +4319,8 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->high = new->genomicstart;
}
- debug0(printf(" hittype is %s, genomicpos %u..%u\n",
- hittype_string(new->hittype),new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
+ debug0(printf(" hittype is %s, plusp %d, genomicpos %u..%u\n",
+ hittype_string(new->hittype),new->plusp,new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
debug0(printf("start_ambiguous_p %d (%d starts), end_ambiguous_p %d (%d ends)\n",
new->start_ambiguous_p,new->start_nambcoords,new->end_ambiguous_p,new->end_nambcoords));
#ifdef DEBUG0
@@ -4258,6 +4347,8 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->nchimera_known = Substring_nchimera_known(donor) + Substring_nchimera_known(acceptor);
new->nchimera_novel = Substring_nchimera_novel(donor) + Substring_nchimera_novel(acceptor);
+#if 0
+ /* Adversely affects comparison based on nchimera_known */
if (new->start_ambiguous_p == true && favor_ambiguous_p == true) {
new->nchimera_known++;
/* new->nchimera_novel--; */
@@ -4266,9 +4357,10 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->nchimera_known++;
/* new->nchimera_novel--; */
}
+#endif
- if (new->chrnum == 0 || (donor != NULL && acceptor != NULL && shortdistancep == false
- /* && merge_samechr_p == false*/)) {
+ if (new->chrnum == 0) {
+ /* Previously also did this for (donor != NULL && acceptor != NULL && shortdistancep == false), but this led to the wrong chrpos for SAM output */
/* Checking for merge_samechr_p leads to wrong mappingstart and mappingend for running GMAP */
/* Always want the original query end */
if (first_read_p == true) {
@@ -4332,6 +4424,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->plusp = Substring_plusp(substring_for_concordance);
} else {
+ /* Ordinary splice */
new->effective_chrnum = new->chrnum;
new->other_chrnum = 0;
@@ -4481,7 +4574,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
/* Donor ----(A distance)---- [A Shortexon D] ----(D distance)---- Acceptor */
T
Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T acceptor, Substring_T shortexon,
- int amb_length_donor, int amb_length_acceptor,
+ int amb_length_donor, int amb_length_acceptor, double amb_prob_donor, double amb_prob_acceptor,
#ifdef LARGE_GENOMES
Uint8list_T ambcoords_donor, Uint8list_T ambcoords_acceptor,
#else
@@ -4489,6 +4582,7 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
#endif
Intlist_T amb_knowni_donor, Intlist_T amb_knowni_acceptor,
Intlist_T amb_nmismatches_donor, Intlist_T amb_nmismatches_acceptor,
+ Doublelist_T amb_probs_donor, Doublelist_T amb_probs_acceptor,
bool copy_donor_p, bool copy_acceptor_p, bool copy_shortexon_p,
int splicing_penalty, int querylength, int sensedir, bool sarrayp) {
T new;
@@ -4588,6 +4682,8 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->amb_length_donor = amb_length_donor;
new->amb_length_acceptor = amb_length_acceptor;
+ new->amb_prob_donor = amb_prob_donor;
+ new->amb_prob_acceptor = amb_prob_acceptor;
#ifdef LARGE_GENOMES
new->ambcoords_donor = Uint8list_to_array_out(&new->nambcoords_donor,ambcoords_donor);
@@ -4601,6 +4697,8 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->amb_knowni_acceptor = Intlist_to_array_out(&ignore,amb_knowni_acceptor);
new->amb_nmismatches_donor = Intlist_to_array_out(&ignore,amb_nmismatches_donor);
new->amb_nmismatches_acceptor = Intlist_to_array_out(&ignore,amb_nmismatches_acceptor);
+ new->amb_probs_donor = Doublelist_to_array_out(&ignore,amb_probs_donor);
+ new->amb_probs_acceptor = Doublelist_to_array_out(&ignore,amb_probs_acceptor);
if (sensedir == SENSE_FORWARD) {
@@ -4608,18 +4706,20 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->genomicend = (acceptor != NULL ? Substring_genomicend(acceptor) : Substring_genomicend(shortexon));
new->start_amb_length = new->amb_length_donor;
- new->start_amb_prob = 5.0;
+ new->start_amb_prob = new->amb_prob_donor;
new->start_ambcoords = new->ambcoords_donor;
new->start_nambcoords = new->nambcoords_donor;
new->start_amb_knowni = new->amb_knowni_donor;
new->start_amb_nmismatches = new->amb_nmismatches_donor;
+ new->start_amb_probs = new->amb_probs_donor;
new->end_amb_length = new->amb_length_acceptor;
- new->end_amb_prob = 5.0;
+ new->end_amb_prob = new->amb_prob_acceptor;
new->end_ambcoords = new->ambcoords_acceptor;
new->end_nambcoords = new->nambcoords_acceptor;
new->end_amb_knowni = new->amb_knowni_acceptor;
new->end_amb_nmismatches = new->amb_nmismatches_acceptor;
+ new->end_amb_probs = new->amb_probs_acceptor;
new->start_ambiguous_p = (ambcoords_donor != NULL) ? true : false;
new->end_ambiguous_p = (ambcoords_acceptor != NULL) ? true : false;
@@ -4629,18 +4729,20 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->genomicend = (donor != NULL ? Substring_genomicend(donor) : Substring_genomicend(shortexon));
new->start_amb_length = new->amb_length_acceptor;
- new->start_amb_prob = 5.0;
+ new->start_amb_prob = new->amb_prob_acceptor;
new->start_ambcoords = new->ambcoords_acceptor;
new->start_nambcoords = new->nambcoords_acceptor;
new->start_amb_knowni = new->amb_knowni_acceptor;
new->start_amb_nmismatches = new->amb_nmismatches_acceptor;
+ new->start_amb_probs = new->amb_probs_acceptor;
new->end_amb_length = new->amb_length_donor;
- new->end_amb_prob = 5.0;
+ new->end_amb_prob = new->amb_prob_donor;
new->end_ambcoords = new->ambcoords_donor;
new->end_nambcoords = new->nambcoords_donor;
new->end_amb_knowni = new->amb_knowni_donor;
new->end_amb_nmismatches = new->amb_nmismatches_donor;
+ new->end_amb_probs = new->amb_probs_donor;
new->start_ambiguous_p = (ambcoords_acceptor != NULL) ? true : false;
new->end_ambiguous_p = (ambcoords_donor != NULL) ? true : false;
@@ -4667,6 +4769,8 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->nchimera_known = Substring_nchimera_known(shortexon) + Substring_nchimera_known(donor) + Substring_nchimera_known(acceptor);
new->nchimera_novel = Substring_nchimera_novel(shortexon) + Substring_nchimera_novel(donor) + Substring_nchimera_novel(acceptor);
+#if 0
+ /* Adversely affects comparison based on nchimera_known */
if (new->start_ambiguous_p == true && favor_ambiguous_p == true) {
new->nchimera_known++;
/* new->nchimera_novel--; */
@@ -4675,7 +4779,7 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->nchimera_known++;
/* new->nchimera_novel--; */
}
-
+#endif
new->effective_chrnum = new->chrnum;
new->other_chrnum = 0;
@@ -4996,7 +5100,7 @@ Stage3end_new_terminal (int querystart, int queryend, Univcoord_T left, Compress
new->tally = -1L;
new->start_amb_length = new->end_amb_length = 0;
- new->start_amb_prob = new->end_amb_prob = 5.0;
+ new->start_amb_prob = new->end_amb_prob = 0.0;
new->amb_length_donor = new->amb_length_acceptor = 0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
@@ -5006,6 +5110,8 @@ Stage3end_new_terminal (int querystart, int queryend, Univcoord_T left, Compress
new->amb_knowni_donor = new->amb_knowni_acceptor = (int *) NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = (int *) NULL;
new->amb_nmismatches_donor = new->amb_nmismatches_acceptor = (int *) NULL;
+ new->start_amb_probs = new->end_amb_probs = (double *) NULL;
+ new->amb_probs_donor = new->amb_probs_acceptor = (double *) NULL;
new->start_nambcoords = new->end_nambcoords = 0;
new->nambcoords_donor = new->nambcoords_acceptor = 0;
new->nchimera_known = 0;
@@ -5281,6 +5387,8 @@ ATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAG
new->amb_knowni_donor = new->amb_knowni_acceptor = (int *) NULL;
new->start_amb_nmismatches = new->end_amb_nmismatches = (int *) NULL;
new->amb_nmismatches_donor = new->amb_nmismatches_acceptor = (int *) NULL;
+ new->start_amb_probs = new->end_amb_probs = (double *) NULL;
+ new->amb_probs_donor = new->amb_probs_acceptor = (double *) NULL;
new->start_nambcoords = new->end_nambcoords = 0;
new->nambcoords_donor = new->nambcoords_acceptor = 0;
new->nchimera_known = 0;
@@ -6909,19 +7017,13 @@ AGTGATGAATCCAAGAGGCGTTTCTATAAGAATTGGCATAAATCTAAGAAGAAGGCCCACCTGATGGAGATCCAG */
List_T
-Stage3end_filter_terminals (List_T hits, int querylength) {
+Stage3end_reject_trimlengths (List_T hits) {
List_T filtered = NULL, p;
T hit;
- int minlength;
-
- minlength = 2*querylength/3;
- if (terminal_output_minlength > minlength) {
- minlength = terminal_output_minlength;
- }
for (p = hits; p != NULL; p = p->rest) {
hit = (T) p->first;
- if (hit->hittype == TERMINAL && Stage3end_terminal_length(hit) < minlength) {
+ if (hit->trim_left + hit->trim_right >= reject_trimlength) {
Stage3end_free(&hit);
} else {
filtered = List_push(filtered,(void *) hit);
@@ -6939,6 +7041,12 @@ hit_sort_cmp (const void *a, const void *b) {
Stage3end_T x = * (Stage3end_T *) a;
Stage3end_T y = * (Stage3end_T *) b;
+ debug7(printf("Comparing %s: #%d:%u..%u, alias %d, nmatches %d, score %d with %s: #%d:%u..%u, alias %d, nmatches %d, score %d\n",
+ hittype_string(x->hittype),x->chrnum,x->genomicstart-x->chroffset,x->genomicend-x->chroffset,
+ x->alias,x->nmatches,x->score,
+ hittype_string(y->hittype),y->chrnum,y->genomicstart-y->chroffset,y->genomicend-y->chroffset,
+ y->alias,y->nmatches,y->score));
+
if (x->plusp > y->plusp) {
return -1;
} else if (y->plusp > x->plusp) {
@@ -6951,13 +7059,17 @@ hit_sort_cmp (const void *a, const void *b) {
return +1;
#else
} else if (x->low < y->low) {
+ debug7(printf("Returning -1 for low\n"));
return -1;
} else if (y->low < x->low) {
+ debug7(printf("Returning +1 for low\n"));
return +1;
} else if (x->high < y->high) {
+ debug7(printf("Returning -1 for high\n"));
return -1;
} else if (y->high < x->high) {
+ debug7(printf("Returning +1 for high\n"));
return +1;
#endif
@@ -6977,6 +7089,7 @@ hit_sort_cmp (const void *a, const void *b) {
return -1;
} else if (y->nmatches > x->nmatches) {
return +1;
+#if 0
} else if (x->nmatches_posttrim > y->nmatches_posttrim) {
return -1;
} else if (y->nmatches_posttrim > x->nmatches_posttrim) {
@@ -6985,6 +7098,7 @@ hit_sort_cmp (const void *a, const void *b) {
return -1;
} else if (y->nchimera_novel < x->nchimera_novel) {
return +1;
+#endif
} else if (x->nchimera_known > y->nchimera_known) {
return -1;
} else if (y->nchimera_known > x->nchimera_known) {
@@ -7051,11 +7165,12 @@ hit_equiv_cmp (Stage3end_T x, Stage3end_T y) {
return -1;
} else if (y->nchimera_novel < x->nchimera_novel) {
return +1;
+#endif
+
} else if (x->nchimera_known > y->nchimera_known) {
return -1;
} else if (y->nchimera_known > x->nchimera_known) {
return +1;
-#endif
#if 0
} else if (x->indel_low < y->indel_low) {
@@ -7123,43 +7238,48 @@ hit_goodness_cmp (bool *equalp, Stage3end_T hit,
}
#endif
-#if 0
if (hit->nmatches < best_hit->nmatches) {
debug7(printf(" => %d loses by nmatches\n",k));
return -1;
} else if (hit->nmatches > best_hit->nmatches) {
debug7(printf(" => %d wins by nmatches\n",k));
return +1;
- }
-#endif
- if (hit->nmatches_posttrim < best_hit->nmatches_posttrim) {
+#if 0
+ /* Causes ambiguous splices to lose to a definitive splice, which could be wrong */
+ } else if (hit->nmatches_posttrim < best_hit->nmatches_posttrim) {
debug7(printf(" => %d loses by nmatches (post-trim)\n",k));
return -1;
} else if (hit->nmatches_posttrim > best_hit->nmatches_posttrim) {
debug7(printf(" => %d wins by nmatches (post-trim)\n",k));
return +1;
+#endif
+#if 0
} else if (hit->nchimera_novel > best_hit->nchimera_novel) {
debug7(printf(" => %d loses by nchimera_novel\n",k));
return -1;
} else if (hit->nchimera_novel < best_hit->nchimera_novel) {
debug7(printf(" => %d wins by nchimera_novel\n",k));
return +1;
+#endif
} else if (hit->nchimera_known < best_hit->nchimera_known) {
- debug7(printf(" => %d loses by nchimera_known\n",k));
+ debug7(printf(" => %d loses by nchimera_known %d < %d\n",
+ k,hit->nchimera_known,best_hit->nchimera_known));
return -1;
} else if (hit->nchimera_known > best_hit->nchimera_known) {
debug7(printf(" => %d wins by nchimera_known\n",k));
return +1;
+#if 0
} else if (hit->hittype > best_hit->hittype) {
debug7(printf(" => %d loses by hittype\n",k));
return -1;
} else if (hit->hittype < best_hit->hittype) {
debug7(printf(" => %d wins by hittype\n",k));
return +1;
+#endif
} else if (hit->nindels > best_hit->nindels) {
debug7(printf(" => %d loses by nindels\n",k));
@@ -7265,7 +7385,7 @@ Stage3end_remove_overlaps (List_T hitlist, bool finalp) {
debug7(printf("Entered Stage3end_remove_overlaps with %d hits: %s\n",
- n,finalp == true ? "FINAL" : "not final"));
+ List_length(hitlist),finalp == true ? "FINAL" : "not final"));
if ((n = List_length(hitlist)) == 0) {
return NULL;
} else {
@@ -9278,6 +9398,8 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
Uintlist_T ambcoords;
#endif
Intlist_T amb_knowni, amb_nmismatches;
+ Doublelist_T amb_probs;
+ double prob_shortend;
*unresolved_amb_length = 0;
@@ -9497,10 +9619,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
acceptor_splicecoord = (*hit5)->ambcoords_acceptor[bingoi5];
acceptor_knowni = (*hit5)->amb_knowni_acceptor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_acceptor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_acceptor[bingoi5];
segment_left = acceptor_splicecoord - splice_pos;
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,/*left*/segment_left,query5_compress_fwd,
+ /*prob*/prob_shortend,/*left*/segment_left,query5_compress_fwd,
querylength5,/*plusp*/true,genestrand,/*first_read_p*/true,/*sensep*/true,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -9515,15 +9638,19 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_donor,old->nambcoords_donor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_donor,old->nambcoords_donor);
+ amb_probs = Doublelist_from_array(old->amb_probs_donor,old->nambcoords_donor);
*hit5 = Stage3end_new_shortexon(&ignore_found_score,/*donor*/old->substringD,acceptor,shortexon,
old->amb_length_donor,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/Doublelist_max(amb_probs),/*amb_prob_acceptor*/0.0,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength5,/*sensedir*/SENSE_FORWARD,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -9548,10 +9675,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
donor_splicecoord = (*hit5)->ambcoords_donor[bingoi5];
donor_knowni = (*hit5)->amb_knowni_donor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_donor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_donor[bingoi5];
segment_left = donor_splicecoord - splice_pos;
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,/*left*/segment_left,query5_compress_fwd,
+ /*prob*/prob_shortend,/*left*/segment_left,query5_compress_fwd,
querylength5,/*plusp*/true,genestrand,/*first_read_p*/true,/*sensep*/false,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -9566,15 +9694,19 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_acceptor,old->nambcoords_acceptor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_acceptor,old->nambcoords_acceptor);
+ amb_probs = Doublelist_from_array(old->amb_probs_acceptor,old->nambcoords_acceptor);
*hit5 = Stage3end_new_shortexon(&ignore_found_score,donor,/*acceptor*/old->substringA,shortexon,
/*amb_length_donor*/0,old->amb_length_acceptor,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/Doublelist_max(amb_probs),
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
/*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength5,/*sensedir*/SENSE_ANTI,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -9606,10 +9738,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
acceptor_splicecoord = (*hit5)->ambcoords_acceptor[bingoi5];
acceptor_knowni = (*hit5)->amb_knowni_acceptor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_acceptor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_acceptor[bingoi5];
segment_left = acceptor_splicecoord - splice_pos;
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,/*left*/segment_left,query5_compress_fwd,
+ /*prob*/prob_shortend,/*left*/segment_left,query5_compress_fwd,
querylength5,/*plusp*/true,genestrand,/*first_read_p*/true,/*sensep*/true,
Substring_chrnum(donor),Substring_chroffset(donor),
Substring_chrhigh(donor),Substring_chrlength(donor))) != NULL) {
@@ -9619,10 +9752,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
old = *hit5;
*hit5 = Stage3end_new_splice(&ignore_found_score,Substring_nmismatches_whole(donor),/*nmismatches_acceptor*/nmismatches_shortend,
donor,acceptor,/*distance*/acceptor_splicecoord - donor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,/*sensedir*/SENSE_FORWARD,
/*sarrayp*/false);
if (*private5p == true) {
@@ -9642,10 +9776,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
donor_splicecoord = (*hit5)->ambcoords_donor[bingoi5];
donor_knowni = (*hit5)->amb_knowni_donor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_donor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_donor[bingoi5];
segment_left = donor_splicecoord - splice_pos;
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,/*left*/segment_left,query5_compress_fwd,
+ /*prob*/prob_shortend,/*left*/segment_left,query5_compress_fwd,
querylength5,/*plusp*/true,genestrand,/*first_read_p*/true,/*sensep*/false,
Substring_chrnum(acceptor),Substring_chroffset(acceptor),
Substring_chrhigh(acceptor),Substring_chrlength(acceptor))) != NULL) {
@@ -9655,10 +9790,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
old = *hit5;
*hit5 = Stage3end_new_splice(&ignore_found_score,/*nmismatches_donor*/nmismatches_shortend,Substring_nmismatches_whole(acceptor),
donor,acceptor,/*distance*/donor_splicecoord - acceptor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,/*sensedir*/SENSE_ANTI,
/*sarrayp*/false);
if (*private5p == true) {
@@ -9687,10 +9823,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
acceptor_splicecoord = (*hit3)->ambcoords_acceptor[bingoi3];
acceptor_knowni = (*hit3)->amb_knowni_acceptor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_acceptor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_acceptor[bingoi3];
segment_left = acceptor_splicecoord - splice_pos;
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_fwd,
+ /*prob*/prob_shortend,segment_left,query3_compress_fwd,
querylength3,/*plusp*/true,genestrand,/*first_read_p*/false,/*sensep*/false,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -9705,15 +9842,19 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_donor,old->nambcoords_donor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_donor,old->nambcoords_donor);
+ amb_probs = Doublelist_from_array(old->amb_probs_donor,old->nambcoords_donor);
*hit3 = Stage3end_new_shortexon(&ignore_found_score,/*donor*/old->substringD,acceptor,shortexon,
old->amb_length_donor,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/Doublelist_max(amb_probs),/*amb_prob_acceptor*/0.0,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength3,/*sensedir*/SENSE_ANTI,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -9738,10 +9879,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
donor_splicecoord = (*hit3)->ambcoords_donor[bingoi3];
donor_knowni = (*hit3)->amb_knowni_donor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_donor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_donor[bingoi3];
segment_left = donor_splicecoord - splice_pos;
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_fwd,
+ /*prob*/prob_shortend,segment_left,query3_compress_fwd,
querylength3,/*plusp*/true,genestrand,/*first_read_p*/false,/*sensep*/true,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -9756,15 +9898,19 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_acceptor,old->nambcoords_acceptor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_acceptor,old->nambcoords_acceptor);
+ amb_probs = Doublelist_from_array(old->amb_probs_acceptor,old->nambcoords_acceptor);
*hit3 = Stage3end_new_shortexon(&ignore_found_score,donor,/*acceptor*/old->substringA,shortexon,
/*amb_length_donor*/0,old->amb_length_acceptor,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/Doublelist_max(amb_probs),
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
/*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength3,/*sensedir*/SENSE_FORWARD,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -9796,10 +9942,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
acceptor_splicecoord = (*hit3)->ambcoords_acceptor[bingoi3];
acceptor_knowni = (*hit3)->amb_knowni_acceptor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_acceptor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_acceptor[bingoi3];
segment_left = acceptor_splicecoord - splice_pos;
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_fwd,
+ /*prob*/prob_shortend,segment_left,query3_compress_fwd,
querylength3,/*plusp*/true,genestrand,/*first_read_p*/false,/*sensep*/false,
Substring_chrnum(donor),Substring_chroffset(donor),
Substring_chrhigh(donor),Substring_chrlength(donor))) != NULL) {
@@ -9809,10 +9956,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
old = *hit3;
*hit3 = Stage3end_new_splice(&ignore_found_score,Substring_nmismatches_whole(donor),/*nmismatches_acceptor*/nmismatches_shortend,
donor,acceptor,/*distance*/donor_splicecoord - acceptor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false);
if (*private3p == true) {
@@ -9832,10 +9980,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
donor_splicecoord = (*hit3)->ambcoords_donor[bingoi3];
donor_knowni = (*hit3)->amb_knowni_donor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_donor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_donor[bingoi3];
segment_left = donor_splicecoord - splice_pos;
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_fwd,
+ /*prob*/prob_shortend,segment_left,query3_compress_fwd,
querylength3,/*plusp*/true,genestrand,/*first_read_p*/false,/*sensep*/true,
Substring_chrnum(acceptor),Substring_chroffset(acceptor),
Substring_chrhigh(acceptor),Substring_chrlength(acceptor))) != NULL) {
@@ -9847,10 +9996,11 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, T *hit5, T *hi
old = *hit3;
*hit3 = Stage3end_new_splice(&ignore_found_score,/*nmismatches_donor*/nmismatches_shortend,Substring_nmismatches_whole(acceptor),
donor,acceptor,/*distance*/acceptor_splicecoord - donor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false);
if (*private3p == true) {
@@ -9897,6 +10047,8 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
Uintlist_T ambcoords;
#endif
Intlist_T amb_knowni, amb_nmismatches;
+ Doublelist_T amb_probs;
+ double prob_shortend;
*unresolved_amb_length = 0;
@@ -10117,11 +10269,12 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
acceptor_splicecoord = (*hit5)->ambcoords_acceptor[bingoi5];
acceptor_knowni = (*hit5)->amb_knowni_acceptor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_acceptor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_acceptor[bingoi5];
segment_left = acceptor_splicecoord - (querylength5 - splice_pos);
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,
querylength5 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query5_compress_rev,
+ /*prob*/prob_shortend,segment_left,query5_compress_rev,
querylength5,/*plusp*/false,genestrand,/*first_read_p*/true,/*sensep*/true,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -10136,15 +10289,19 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_donor,old->nambcoords_donor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_donor,old->nambcoords_donor);
+ amb_probs = Doublelist_from_array(old->amb_probs_donor,old->nambcoords_donor);
*hit5 = Stage3end_new_shortexon(&ignore_found_score,/*donor*/old->substringD,acceptor,shortexon,
old->amb_length_donor,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/Doublelist_max(amb_probs),/*amb_prob_acceptor*/0.0,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength5,/*sensedir*/SENSE_FORWARD,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -10169,10 +10326,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
donor_splicecoord = (*hit5)->ambcoords_donor[bingoi5];
donor_knowni = (*hit5)->amb_knowni_donor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_donor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_donor[bingoi5];
segment_left = donor_splicecoord - (querylength5 - splice_pos);
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,querylength5 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query5_compress_rev,
+ /*prob*/prob_shortend,segment_left,query5_compress_rev,
querylength5,/*plusp*/false,genestrand,/*first_read_p*/true,/*sensep*/false,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -10187,15 +10345,19 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_acceptor,old->nambcoords_acceptor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_acceptor,old->nambcoords_acceptor);
+ amb_probs = Doublelist_from_array(old->amb_probs_acceptor,old->nambcoords_acceptor);
*hit5 = Stage3end_new_shortexon(&ignore_found_score,donor,/*acceptor*/old->substringA,shortexon,
/*amb_length_donor*/0,old->amb_length_acceptor,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/Doublelist_max(amb_probs),
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
/*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength5,/*sensedir*/SENSE_ANTI,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -10226,10 +10388,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
acceptor_splicecoord = (*hit5)->ambcoords_acceptor[bingoi5];
acceptor_knowni = (*hit5)->amb_knowni_acceptor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_acceptor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_acceptor[bingoi5];
segment_left = acceptor_splicecoord - (querylength5 - splice_pos);
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,querylength5 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query5_compress_rev,
+ /*prob*/prob_shortend,segment_left,query5_compress_rev,
querylength5,/*plusp*/false,genestrand,/*first_read_p*/true,/*sensep*/true,
Substring_chrnum(donor),Substring_chroffset(donor),
Substring_chrhigh(donor),Substring_chrlength(donor))) != NULL) {
@@ -10239,10 +10402,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
old = *hit5;
*hit5 = Stage3end_new_splice(&ignore_found_score,Substring_nmismatches_whole(donor),/*nmismatches_acceptor*/nmismatches_shortend,
donor,acceptor,/*distance*/donor_splicecoord - acceptor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false);
if (*private5p == true) {
@@ -10262,11 +10426,12 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
donor_splicecoord = (*hit5)->ambcoords_donor[bingoi5];
donor_knowni = (*hit5)->amb_knowni_donor[bingoi5];
nmismatches_shortend = (*hit5)->amb_nmismatches_donor[bingoi5];
+ prob_shortend = (*hit5)->amb_probs_donor[bingoi5];
segment_left = donor_splicecoord - (querylength5 - splice_pos);
/* BUG HERE */
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,querylength5 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query5_compress_rev,
+ /*prob*/prob_shortend,segment_left,query5_compress_rev,
querylength5,/*plusp*/false,genestrand,/*first_read_p*/true,/*sensep*/false,
Substring_chrnum(acceptor),Substring_chroffset(acceptor),
Substring_chrhigh(acceptor),Substring_chrlength(acceptor))) != NULL) {
@@ -10276,10 +10441,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
old = *hit5;
*hit5 = Stage3end_new_splice(&ignore_found_score,/*nmismatches_donor*/nmismatches_shortend,Substring_nmismatches_whole(acceptor),
donor,acceptor,/*distance*/acceptor_splicecoord - donor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength5,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false);
if (*private5p == true) {
@@ -10308,10 +10474,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
acceptor_splicecoord = (*hit3)->ambcoords_acceptor[bingoi3];
acceptor_knowni = (*hit3)->amb_knowni_acceptor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_acceptor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_acceptor[bingoi3];
segment_left = acceptor_splicecoord - (querylength3 - splice_pos);
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,querylength3 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_rev,
+ /*prob*/prob_shortend,segment_left,query3_compress_rev,
querylength3,/*plusp*/false,genestrand,/*first_read_p*/false,/*sensep*/false,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -10326,15 +10493,19 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_donor,old->nambcoords_donor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_donor,old->nambcoords_donor);
+ amb_probs = Doublelist_from_array(old->amb_probs_donor,old->nambcoords_donor);
*hit3 = Stage3end_new_shortexon(&ignore_found_score,/*donor*/old->substringD,acceptor,shortexon,
old->amb_length_donor,/*amb_length_acceptor*/0,
+ /*amb_prob_donor*/Doublelist_max(amb_probs),/*amb_prob_acceptor*/0.0,
ambcoords,/*ambcoords_acceptor*/NULL,
amb_knowni,/*amb_knowni_acceptor*/NULL,
amb_nmismatches,/*amb_nmismatches_acceptor*/NULL,
+ amb_probs,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength3,/*sensedir*/SENSE_ANTI,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -10359,10 +10530,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
donor_splicecoord = (*hit3)->ambcoords_donor[bingoi3];
donor_knowni = (*hit3)->amb_knowni_donor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_donor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_donor[bingoi3];
segment_left = donor_splicecoord - (querylength3 - splice_pos);
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,querylength3 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_rev,
+ /*prob*/prob_shortend,segment_left,query3_compress_rev,
querylength3,/*plusp*/false,genestrand,/*first_read_p*/false,/*sensep*/true,
Substring_chrnum(shortexon),Substring_chroffset(shortexon),
Substring_chrhigh(shortexon),Substring_chrlength(shortexon))) != NULL) {
@@ -10377,15 +10549,19 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
#endif
amb_knowni = Intlist_from_array(old->amb_knowni_acceptor,old->nambcoords_acceptor);
amb_nmismatches = Intlist_from_array(old->amb_nmismatches_acceptor,old->nambcoords_acceptor);
+ amb_probs = Doublelist_from_array(old->amb_probs_acceptor,old->nambcoords_acceptor);
*hit3 = Stage3end_new_shortexon(&ignore_found_score,donor,/*acceptor*/old->substringA,shortexon,
/*amb_length_donor*/0,old->amb_length_acceptor,
+ /*amb_prob_donor*/0.0,/*amb_prob_acceptor*/Doublelist_max(amb_probs),
/*ambcoords_donor*/NULL,ambcoords,
/*amb_knowni_donor*/NULL,amb_knowni,
/*amb_nmismatches_donor*/NULL,amb_nmismatches,
+ /*amb_probs_donor*/NULL,amb_probs,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*copy_shortexon_p*/true,
localsplicing_penalty,querylength3,/*sensedir*/SENSE_FORWARD,
/*sarrayp*/false);
+ Doublelist_free(&amb_probs);
Intlist_free(&amb_nmismatches);
Intlist_free(&amb_knowni);
#ifdef LARGE_GENOMES
@@ -10416,10 +10592,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
acceptor_splicecoord = (*hit3)->ambcoords_acceptor[bingoi3];
acceptor_knowni = (*hit3)->amb_knowni_acceptor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_acceptor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_acceptor[bingoi3];
segment_left = acceptor_splicecoord - (querylength3 - splice_pos);
if ((acceptor = Substring_new_acceptor(acceptor_splicecoord,acceptor_knowni,querylength3 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_rev,
+ /*prob*/prob_shortend,segment_left,query3_compress_rev,
querylength3,/*plusp*/false,genestrand,/*first_read_p*/false,/*sensep*/false,
Substring_chrnum(donor),Substring_chroffset(donor),
Substring_chrhigh(donor),Substring_chrlength(donor))) != NULL) {
@@ -10429,10 +10606,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
old = *hit3;
*hit3 = Stage3end_new_splice(&ignore_found_score,Substring_nmismatches_whole(donor),/*nmismatches_acceptor*/nmismatches_shortend,
donor,acceptor,/*distance*/acceptor_splicecoord - donor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
/*sensedir*/SENSE_ANTI,/*sarrayp*/false);
if (*private3p == true) {
@@ -10452,10 +10630,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
donor_splicecoord = (*hit3)->ambcoords_donor[bingoi3];
donor_knowni = (*hit3)->amb_knowni_donor[bingoi3];
nmismatches_shortend = (*hit3)->amb_nmismatches_donor[bingoi3];
+ prob_shortend = (*hit3)->amb_probs_donor[bingoi3];
segment_left = donor_splicecoord - (querylength3 - splice_pos);
if ((donor = Substring_new_donor(donor_splicecoord,donor_knowni,querylength3 - splice_pos,nmismatches_shortend,
- /*prob*/2.0,segment_left,query3_compress_rev,
+ /*prob*/prob_shortend,segment_left,query3_compress_rev,
querylength3,/*plusp*/false,genestrand,/*first_read_p*/false,/*sensep*/true,
Substring_chrnum(acceptor),Substring_chroffset(acceptor),
Substring_chrhigh(acceptor),Substring_chrlength(acceptor))) != NULL) {
@@ -10465,10 +10644,11 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, T *hit5, T *h
old = *hit3;
*hit3 = Stage3end_new_splice(&ignore_found_score,/*nmismatches_donor*/nmismatches_shortend,Substring_nmismatches_whole(acceptor),
donor,acceptor,/*distance*/donor_splicecoord - acceptor_splicecoord,
- /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,querylength3,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_knowni_donor*/NULL,/*amb_knowni_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
/*sensedir*/SENSE_FORWARD,/*sarrayp*/false);
if (*private3p == true) {
@@ -10534,9 +10714,8 @@ Stage3pair_new (T hit5, T hit3, Univcoord_T *splicesites,
debug10(printf("\nStage3pair_new called with pairtype %s and chrnum %d, %d (effective %d, %d)\n",
Pairtype_string(pairtype),hit5->chrnum,hit3->chrnum,hit5->effective_chrnum,hit3->effective_chrnum));
- if (hit5->hittype == TERMINAL && Stage3end_terminal_length(hit5) < terminal_output_minlength) {
- debug10(printf("5' terminal length %d is not long enough (< %d)\n",
- Stage3end_terminal_length(hit5),terminal_output_minlength));
+ if (hit5->hittype == TERMINAL && hit5->trim_left + hit5->trim_right >= reject_trimlength) {
+ debug10(printf("5' rejected by trimlength %d + %d\n",hit5->trim_left,hit5->trim_right));
if (private5p == true) {
Stage3end_free(&hit5);
}
@@ -10544,9 +10723,9 @@ Stage3pair_new (T hit5, T hit3, Univcoord_T *splicesites,
Stage3end_free(&hit3);
}
return (Stage3pair_T) NULL;
- } else if (hit3->hittype == TERMINAL && Stage3end_terminal_length(hit3) < terminal_output_minlength) {
- debug10(printf("3' terminal length %d is not long enough (< %d)\n",
- Stage3end_terminal_length(hit3),terminal_output_minlength));
+
+ } else if (hit3->hittype == TERMINAL && hit3->trim_left + hit3->trim_right >= reject_trimlength) {
+ debug10(printf("3' rejected by trimlength %d + %d\n",hit3->trim_left,hit3->trim_right));
if (private5p == true) {
Stage3end_free(&hit5);
}
@@ -10817,18 +10996,20 @@ Stage3pair_new (T hit5, T hit3, Univcoord_T *splicesites,
if (hit5->sensedir == SENSE_FORWARD) {
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_whole(hit5->substring1),
/*nmismatches_acceptor*/0,/*donor*/hit5->substring1,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,
/*sensedir*/hit5->sensedir,hit5->sarrayp);
} else if (hit5->sensedir == SENSE_ANTI) {
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
/*nmismatches_acceptor*/Substring_nmismatches_whole(hit5->substring1),/*donor*/NULL,
/*acceptor*/hit5->substring1,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,
/*sensedir*/hit5->sensedir,hit5->sarrayp);
} else {
@@ -10848,17 +11029,19 @@ Stage3pair_new (T hit5, T hit3, Univcoord_T *splicesites,
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
/*nmismatches_acceptor*/Substring_nmismatches_whole(hit3->substring2),/*donor*/NULL,
/*acceptor*/hit3->substring2,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,
/*sensedir*/hit3->sensedir,hit3->sarrayp);
} else if (hit3->sensedir == SENSE_ANTI) {
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_whole(hit3->substring2),
/*nmismatches_acceptor*/0,/*donor*/hit3->substring2,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,
/*sensedir*/hit3->sensedir,hit3->sarrayp);
} else {
@@ -10942,18 +11125,20 @@ Stage3pair_new (T hit5, T hit3, Univcoord_T *splicesites,
if (hit5->sensedir == SENSE_FORWARD) {
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_whole(hit5->substring1),
/*nmismatches_acceptor*/0,/*donor*/hit5->substring1,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
/*sensedir*/hit5->sensedir,hit5->sarrayp);
} else if (hit5->sensedir == SENSE_ANTI) {
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
/*nmismatches_acceptor*/Substring_nmismatches_whole(hit5->substring1),/*donor*/NULL,
/*acceptor*/hit5->substring1,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
/*sensedir*/hit5->sensedir,hit5->sarrayp);
} else {
@@ -10973,17 +11158,19 @@ Stage3pair_new (T hit5, T hit3, Univcoord_T *splicesites,
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
/*nmismatches_acceptor*/Substring_nmismatches_whole(hit3->substring2),/*donor*/NULL,
/*acceptor*/hit3->substring2,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
/*sensedir*/hit3->sensedir,hit3->sarrayp);
} else if (hit3->sensedir == SENSE_ANTI) {
copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_whole(hit3->substring2),
/*nmismatches_acceptor*/0,/*donor*/hit3->substring2,/*acceptor*/NULL,/*distance*/0U,
- /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,
+ /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
/*ambcoords_donor*/NULL,/*ambcoords_acceptor*/NULL,
/*amb_nmismatches_donor*/NULL,/*amb_nmismatches_acceptor*/NULL,
+ /*amb_probs_donor*/NULL,/*amb_probs_acceptor*/NULL,
/*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
/*sensedir*/hit3->sensedir,hit3->sarrayp);
} else {
@@ -11276,6 +11463,23 @@ hitpair_sort_cmp (const void *a, const void *b) {
Univcoord_T x_hit3_high, x_hit3_low, y_hit3_high, y_hit3_low;
Univcoord_T x_low, x_high, y_low, y_high;
+ debug8(printf(" Comparing (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), alias %d|%d, nmatches: %d (%d posttrim), indel_low %d and %d\n",
+ Pairtype_string(x->pairtype),hittype_string(x->hit5->hittype),
+ hittype_string(x->hit3->hittype),x,
+ x->hit5->low - x->hit5->chroffset,x->hit5->high - x->hit5->chroffset,
+ x->hit3->low - x->hit3->chroffset,x->hit3->high - x->hit3->chroffset,
+ x->dir,x->hit5->alias,x->hit3->alias,x->nmatches,x->nmatches_posttrim,
+ x->hit5->indel_low,x->hit3->indel_low));
+
+ debug8(printf(" with (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), alias %d|%d, nmatches: %d (%d posttrim), indel_low %d and %d\n",
+ Pairtype_string(y->pairtype),hittype_string(y->hit5->hittype),
+ hittype_string(y->hit3->hittype),y,
+ y->hit5->low - y->hit5->chroffset,y->hit5->high - y->hit5->chroffset,
+ y->hit3->low - y->hit3->chroffset,y->hit3->high - y->hit3->chroffset,
+ y->dir,y->hit5->alias,y->hit3->alias,y->nmatches,y->nmatches_posttrim,
+ y->hit5->indel_low,y->hit3->indel_low));
+
+
x_hit5_low = normalize_coord(x->hit5->low,x->hit5->alias,x->hit5->chrlength);
x_hit5_high = normalize_coord(x->hit5->high,x->hit5->alias,x->hit5->chrlength);
@@ -11320,7 +11524,23 @@ hitpair_sort_cmp (const void *a, const void *b) {
return -1;
} else if (y->hit3->high < x->hit3->low) {
return +1;
-#elif 0
+#else
+ /* low to high pattern needed for finding overlaps */
+ } else if (x_low < y_low) {
+ debug8(printf("Returning -1 for low\n"));
+ return -1;
+ } else if (y_low < x_low) {
+ debug8(printf("Returning +1 for low\n"));
+ return +1;
+
+ } else if (x_high > y_high) {
+ debug8(printf("Returning -1 for high\n"));
+ return -1;
+ } else if (y_high > x_high) {
+ debug8(printf("Returning +1 for high\n"));
+ return +1;
+
+ /* Need to check inside ends to avoid declaring unequal hitpairs equal */
} else if (x_hit5_low < y_hit5_low) {
return -1;
} else if (y_hit5_low < x_hit5_low) {
@@ -11340,18 +11560,6 @@ hitpair_sort_cmp (const void *a, const void *b) {
return -1;
} else if (y_hit3_high < x_hit3_high) {
return +1;
-#else
- /* low to high pattern needed for finding overlaps */
- } else if (x_low < y_low) {
- return -1;
- } else if (y_low < x_low) {
- return +1;
-
- } else if (x_high > y_high) {
- return -1;
- } else if (y_high > x_high) {
- return +1;
-
#endif
@@ -11373,6 +11581,7 @@ hitpair_sort_cmp (const void *a, const void *b) {
return -1;
} else if (y->nmatches > x->nmatches) {
return +1;
+#if 0
} else if (x->nmatches_posttrim > y->nmatches_posttrim) {
return -1;
} else if (y->nmatches_posttrim > x->nmatches_posttrim) {
@@ -11381,6 +11590,7 @@ hitpair_sort_cmp (const void *a, const void *b) {
return -1;
} else if (y->nchimera_novel < x->nchimera_novel) {
return +1;
+#endif
} else if (x->nchimera_known > y->nchimera_known) {
return -1;
} else if (y->nchimera_known > x->nchimera_known) {
@@ -11499,11 +11709,11 @@ hitpair_equiv_cmp (Stage3pair_T x, Stage3pair_T y) {
return -1;
} else if (y->nchimera_novel < x->nchimera_novel) {
return +1;
+#endif
} else if (x->nchimera_known > y->nchimera_known) {
return -1;
} else if (y->nchimera_known > x->nchimera_known) {
return +1;
-#endif
} else if (x->sense_consistent_p == true && y->sense_consistent_p == false) {
return -1;
@@ -11930,7 +12140,6 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
#endif
-#if 0
if (hitpair->nmatches < best_hitpair->nmatches) {
/* k is worse */
debug8(printf(" => loses by nmatches\n"));
@@ -11939,10 +12148,9 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
/* k is better */
debug8(printf(" => wins by nmatches\n"));
return +1;
- }
-#endif
- if (hitpair->nmatches_posttrim < best_hitpair->nmatches_posttrim) {
+#if 0
+ } else if (hitpair->nmatches_posttrim < best_hitpair->nmatches_posttrim) {
/* k is worse */
debug8(printf(" => loses by nmatches_posttrim\n"));
return -1;
@@ -11950,7 +12158,9 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
/* k is better */
debug8(printf(" => wins by nmatches_posttrim\n"));
return +1;
+#endif
+#if 0
} else if (hitpair->nchimera_novel > best_hitpair->nchimera_novel) {
/* k is worse */
debug8(printf(" => loses by nchimera_novel\n"));
@@ -11959,12 +12169,14 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
/* k is better */
debug8(printf(" => wins by nchimera_novel\n"));
return +1;
+#endif
/* Favoring nchimera_known helps before outerlength favors known
splices over novel ones */
} else if (hitpair->nchimera_known < best_hitpair->nchimera_known) {
/* k is worse */
- debug8(printf(" => loses by nchimera_known\n"));
+ debug8(printf(" => loses by nchimera_known: %d < %d\n",
+ hitpair->nchimera_known,best_hitpair->nchimera_known));
return -1;
} else if (hitpair->nchimera_known > best_hitpair->nchimera_known) {
/* k is better */
@@ -11982,6 +12194,7 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
return +1;
#endif
+#if 0
} else if (hitpair->hit5->hittype > best_hitpair->hit5->hittype &&
hitpair->hit3->hittype >= best_hitpair->hit3->hittype) {
/* k is worse */
@@ -12005,6 +12218,7 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
/* k is better */
debug8(printf(" => wins by hittype\n"));
return +1;
+#endif
} else if (finalp == false) {
debug8(printf(" => indistinguishable\n"));
@@ -12024,6 +12238,7 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
return +1;
#endif
+#if 0
/* Previously favored longer insert lengths to give more compact
splices. However, we now accept splices first that give
expected pairlength */
@@ -12035,6 +12250,7 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
/* k is better */
debug8(printf(" => wins by insertlength_expected_sign\n"));
return +1;
+#endif
/* Next we look at splice probability */
} else {
@@ -12105,6 +12321,7 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
return +1;
} else {
+#if 0
if (hitpair->insertlength_expected_sign >= 0 && best_hitpair->insertlength_expected_sign >= 0) {
/* Both insert lengths are short, so favor shorter insert length */
debug8(printf(" => short insertlengths"));
@@ -12118,19 +12335,19 @@ hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
debug8(printf(" => wins by insertlength\n"));
return +1;
}
+ }
+#endif
- } else {
- /* Both insert lengths are long, so favor longer insert length to give more compact splices */
- debug8(printf(" => long insertlengths"));
- if (hitpair->insertlength < best_hitpair->insertlength) {
- /* k is worse */
- debug8(printf(" => loses by insertlength\n"));
- return -1;
- } else if (hitpair->insertlength > best_hitpair->insertlength) {
- /* k is better */
- debug8(printf(" => wins by insertlength\n"));
- return +1;
- }
+ /* Both insert lengths are long, so favor longer insert length to give more compact splices */
+ debug8(printf(" => long insertlengths"));
+ if (hitpair->insertlength < best_hitpair->insertlength) {
+ /* k is worse */
+ debug8(printf(" => loses by insertlength\n"));
+ return -1;
+ } else if (hitpair->insertlength > best_hitpair->insertlength) {
+ /* k is better */
+ debug8(printf(" => wins by insertlength\n"));
+ return +1;
}
debug8(printf(" => equal\n"));
@@ -13778,6 +13995,8 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
if (hit5->start_amb_length > 0 || hit5->end_amb_length > 0 ||
hit3->start_amb_length > 0 || hit3->end_amb_length > 0) {
/* Don't use ambiguous splices to update found_score*/
+ hitpairs = List_push(hitpairs,(void *) stage3pair);
+ (*nconcordant)++;
} else if (pairscore < new_found_score) {
new_found_score = pairscore;
diff --git a/src/stage3hr.h b/src/stage3hr.h
index 253276d..b1cb8e9 100644
--- a/src/stage3hr.h
+++ b/src/stage3hr.h
@@ -1,4 +1,4 @@
-/* $Id: stage3hr.h 154023 2014-11-25 03:45:18Z twu $ */
+/* $Id: stage3hr.h 154778 2014-12-06 03:32:33Z twu $ */
#ifndef STAGE3HR_INCLUDED
#define STAGE3HR_INCLUDED
@@ -8,6 +8,7 @@
#include "chrnum.h"
#include "genomicpos.h"
#include "intlist.h"
+#include "doublelist.h"
#include "iit-read-univ.h"
#include "iit-read.h"
#include "shortread.h"
@@ -36,7 +37,7 @@ Stage3hr_setup (bool invert_first_p_in, bool invert_second_p_in,
IIT_T genes_iit_in, int *genes_divint_crosstable_in,
IIT_T tally_iit_in, int *tally_divint_crosstable_in,
IIT_T runlength_iit_in, int *runlength_divint_crosstable_in,
- int terminal_output_minlength_in, bool distances_observed_p, int pairmax_in,
+ int reject_trimlength_in, bool distances_observed_p, int pairmax_in,
Chrpos_T expected_pairlength, Chrpos_T pairlength_deviation,
int localsplicing_penalty_in, int indel_penalty_middle_in,
int antistranded_penalty_in, bool favor_multiexon_p_in,
@@ -219,6 +220,15 @@ extern int
Stage3end_amb_length_start (T this);
extern int
Stage3end_amb_length_end (T this);
+extern Univcoord_T *
+Stage3end_start_ambcoords (T this);
+extern Univcoord_T *
+Stage3end_end_ambcoords (T this);
+extern int
+Stage3end_start_nambcoords (T this);
+extern int
+Stage3end_end_nambcoords (T this);
+
extern bool
Stage3end_gmap_triedp (T this);
@@ -231,7 +241,7 @@ Stage3end_gmap_queryend (T this);
extern int
Stage3end_terminal_trim (T this);
extern int
-Stage3end_terminal_length (T this);
+Stage3end_trimlength (T this);
extern bool
Stage3end_contains_known_splicesite (T this);
extern bool
@@ -322,7 +332,7 @@ Stage3end_new_terminal (int querystart, int queryend, Univcoord_T left, Compress
extern T
Stage3end_new_splice (int *found_score, int donor_nmismatches, int acceptor_nmismatches,
Substring_T donor, Substring_T acceptor, Chrpos_T distance,
- bool shortdistancep, int splicing_penalty, int querylength, int amb_length,
+ bool shortdistancep, int splicing_penalty, int querylength, int amb_length, double amb_prob,
#ifdef LARGE_GENOMES
Uint8list_T ambcoords_donor, Uint8list_T ambcoords_acceptor,
#else
@@ -330,11 +340,12 @@ Stage3end_new_splice (int *found_score, int donor_nmismatches, int acceptor_nmis
#endif
Intlist_T amb_knowni_donor, Intlist_T amb_knowni_acceptor,
Intlist_T amb_nmismatches_donor, Intlist_T amb_nmismatches_acceptor,
+ Doublelist_T amb_probs_donor, Doublelist_T amb_probs_acceptor,
bool copy_donor_p, bool copy_acceptor_p,
bool first_read_p, int sensedir, bool sarrayp);
extern T
Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T acceptor, Substring_T shortexon,
- int amb_length_donor, int amb_length_acceptor,
+ int amb_length_donor, int amb_length_acceptor, double amb_prob_donor, double amb_prob_acceptor,
#ifdef LARGE_GENOMES
Uint8list_T ambcoords_donor, Uint8list_T ambcoords_acceptor,
#else
@@ -342,6 +353,7 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
#endif
Intlist_T amb_knowni_donor, Intlist_T amb_knowni_acceptor,
Intlist_T amb_nmismatches_donor, Intlist_T amb_nmismatches_acceptor,
+ Doublelist_T amb_probs_donor, Doublelist_T amb_probs_acceptor,
bool copy_donor_p, bool copy_acceptor_p, bool copy_shortexon_p,
int splicing_penalty, int querylength, int sensedir, bool sarrayp);
@@ -387,7 +399,7 @@ Stage3end_remove_circular_alias (List_T hitlist);
extern List_T
Stage3end_remove_duplicates (List_T hitlist);
extern List_T
-Stage3end_filter_terminals (List_T hits, int querylength);
+Stage3end_reject_trimlengths (List_T hits);
extern List_T
Stage3end_remove_overlaps (List_T hitlist, bool finalp);
extern List_T
diff --git a/src/substring.c b/src/substring.c
index acd84ae..3de8a72 100644
--- a/src/substring.c
+++ b/src/substring.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: substring.c 154023 2014-11-25 03:45:18Z twu $";
+static char rcsid[] = "$Id: substring.c 154591 2014-12-04 02:00:32Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -159,6 +159,7 @@ static bool output_sam_p;
static Mode_T mode;
static double genomelength; /* For BLAST E-value */
+static int reject_trimlength;
char *
@@ -1454,7 +1455,8 @@ Substring_setup (bool print_nsnpdiffs_p_in, bool print_snplabels_p_in,
IIT_T splicesites_iit_in, int *splicesites_divint_crosstable_in,
int donor_typeint_in, int acceptor_typeint_in, int trim_mismatch_score_in,
bool novelsplicingp_in, bool knownsplicingp_in,
- bool output_sam_p_in, Mode_T mode_in, Univcoord_T genomelength_in) {
+ bool output_sam_p_in, Mode_T mode_in, Univcoord_T genomelength_in,
+ int reject_trimlength_in) {
print_nsnpdiffs_p = print_nsnpdiffs_p_in;
print_snplabels_p = print_snplabels_p_in;
show_refdiff_p = show_refdiff_p_in;
@@ -1480,6 +1482,7 @@ Substring_setup (bool print_nsnpdiffs_p_in, bool print_snplabels_p_in,
mode = mode_in;
genomelength = (double) genomelength_in;
+ reject_trimlength = reject_trimlength_in;
return;
}
@@ -1604,6 +1607,7 @@ Substring_new (int nmismatches_whole, Chrnum_T chrnum, Univcoord_T chroffset,
int aligndiff;
int nmatches;
double prob1, prob2;
+ int nonterminal_trim = 0;
/* General test for goodness over original region */
@@ -1680,102 +1684,139 @@ Substring_new (int nmismatches_whole, Chrnum_T chrnum, Univcoord_T chroffset,
/* Assign new->nmismatches_whole */
new->nmismatches_whole = nmismatches_whole;
+ /* Initialize these so Substring_free knows what to do */
+ new->genomic_bothdiff = (char *) NULL;
+ new->genomic_refdiff = (char *) NULL;
/* Do trimming */
debug8(printf("trim_left_p %d, trim_right_p %d\n",trim_left_p,trim_right_p));
if (trim_left_p == false) {
new->trim_left = 0;
- new->trim_left_splicep = false;
- } else {
- if (new->start_endtype == TERM) {
- new->trim_left = trim_left_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
- /*trim_mismatch_score*/-3);
- } else {
- new->trim_left = trim_left_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
- trim_mismatch_score);
- }
- new->querystart += new->trim_left;
- if (plusp == true) {
- new->alignstart_trim += new->trim_left;
- prob1 = Maxent_hr_acceptor_prob(left + new->trim_left,chroffset);
- prob2 = Maxent_hr_antidonor_prob(left + new->trim_left,chroffset);
- /* fprintf(stderr,"At %u, acceptor prob %f, antidonor prob %f\n",left+new->trim_left,prob1,prob2); */
- } else {
- new->alignstart_trim -= new->trim_left;
+ } else if (new->start_endtype == TERM) {
+ /* Accept true terminals generated by GSNAP procedure */
+ new->trim_left = trim_left_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
+ /*trim_mismatch_score*/-3);
- prob1 = Maxent_hr_donor_prob(left + querylength - new->trim_left,chroffset);
- prob2 = Maxent_hr_antiacceptor_prob(left + querylength - new->trim_left,chroffset);
- /* fprintf(stderr,"At %u, donor prob %f, antiacceptor prob %f\n",left + querylength - new->trim_left,prob1,prob2); */
- }
- if (novelsplicingp == false) {
- new->trim_left_splicep = false;
- } else if (prob1 > 0.90 || prob2 > 0.90) {
- new->trim_left_splicep = true;
- } else {
- new->trim_left_splicep = false;
- }
+ } else {
+ new->trim_left = trim_left_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
+ trim_mismatch_score);
+ nonterminal_trim += new->trim_left;
}
-
if (trim_right_p == false) {
new->trim_right = 0;
- new->trim_right_splicep = false;
- } else {
- if (new->end_endtype == TERM) {
- new->trim_right = trim_right_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
- /*trim_mismatch_score*/-3);
- } else {
- new->trim_right = trim_right_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
- trim_mismatch_score);
- }
- new->queryend -= new->trim_right;
- if (plusp == true) {
- new->alignend_trim -= new->trim_right;
- prob1 = Maxent_hr_donor_prob(left + querylength - new->trim_right,chroffset);
- prob2 = Maxent_hr_antiacceptor_prob(left + querylength - new->trim_right,chroffset);
- /* fprintf(stderr,"At %u, donor prob %f, antiacceptor prob %f\n",left + querylength - new->trim_right,prob1,prob2); */
- } else {
- new->alignend_trim += new->trim_right;
+ } else if (new->end_endtype == TERM) {
+ /* Accept true terminals generated by GSNAP procedure */
+ new->trim_right = trim_right_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
+ /*trim_mismatch_score*/-3);
- prob1 = Maxent_hr_acceptor_prob(left + new->trim_right,chroffset);
- prob2 = Maxent_hr_antidonor_prob(left + new->trim_right,chroffset);
- /* fprintf(stderr,"At %u, acceptor prob %f, antidonor prob %f\n",left+new->trim_right,prob1,prob2); */
- }
- if (novelsplicingp == false) {
- new->trim_right_splicep = false;
- } else if (prob1 > 0.90 || prob2 > 0.90) {
- new->trim_right_splicep = true;
- } else {
- new->trim_right_splicep = false;
- }
+ } else {
+ new->trim_right = trim_right_end(query_compress,left,querystart,queryend,querylength,plusp,genestrand,first_read_p,
+ trim_mismatch_score);
+ nonterminal_trim += new->trim_right;
}
-
- /* Initialize these so Substring_free knows what to do */
- new->genomic_bothdiff = (char *) NULL;
- new->genomic_refdiff = (char *) NULL;
-
-
- /* Check for minlength. Needed to avoid nonsensical terminal alignments */
- if (new->queryend - new->querystart <= minlength) {
- debug8(printf("queryend %d - querystart %d <= minlength %d, so returning NULL\n",
- new->queryend,new->querystart,minlength));
+ debug8(printf("Nonterminal trim %d vs reject_trimlength %d\n",nonterminal_trim,reject_trimlength));
+ if (nonterminal_trim >= reject_trimlength) {
+ /* Reject non-terminal alignments (including those by sarray search) with excessive trim */
+ /* Keep true terminals for now in case they help lead to GMAP alignments */
+ debug8(printf("Nonterminal trims %d exceeds reject_trimlength %d, so returning NULL\n",
+ nonterminal_trim,reject_trimlength));
Substring_free(&new);
-
return (T) NULL;
+ } else {
+ new->querystart += new->trim_left;
+ new->queryend -= new->trim_right;
+ /* Check for minlength. Needed to avoid nonsensical terminal alignments */
+ if (new->queryend - new->querystart <= minlength) {
+ debug8(printf("queryend %d - querystart %d <= minlength %d, so returning NULL\n",
+ new->queryend,new->querystart,minlength));
+ Substring_free(&new);
+ return (T) NULL;
+ }
}
-
- /* nmatches: Counts matches over whole region including trims */
+ /* ? Should we spend the time to determine trim_left_splicep and
+ trim_right_splicep, especially since trimming may not be perfect */
if (plusp == true) {
+ /* nmatches: Counts matches over whole region including trims */
new->nmatches = (new->alignend - new->alignstart) - new->nmismatches_whole;
+
+ new->alignstart_trim += new->trim_left;
+ new->alignend_trim -= new->trim_right;
+
+ if (novelsplicingp == false) {
+ new->trim_left_splicep = new->trim_right_splicep = false;
+ } else {
+ if (new->trim_left == 0) {
+ new->trim_left_splicep = false;
+ } else {
+ prob1 = Maxent_hr_acceptor_prob(left + new->trim_left,chroffset);
+ prob2 = Maxent_hr_antidonor_prob(left + new->trim_left,chroffset);
+ /* fprintf(stderr,"At %u, acceptor prob %f, antidonor prob %f\n",left+new->trim_left,prob1,prob2); */
+ if (prob1 > 0.90 || prob2 > 0.90) {
+ new->trim_left_splicep = true;
+ } else {
+ new->trim_left_splicep = false;
+ }
+ }
+
+ if (new->trim_right == 0) {
+ new->trim_right_splicep = false;
+ } else {
+ prob1 = Maxent_hr_donor_prob(left + querylength - new->trim_right,chroffset);
+ prob2 = Maxent_hr_antiacceptor_prob(left + querylength - new->trim_right,chroffset);
+ /* fprintf(stderr,"At %u, donor prob %f, antiacceptor prob %f\n",left + querylength - new->trim_right,prob1,prob2); */
+ if (prob1 > 0.90 || prob2 > 0.90) {
+ new->trim_right_splicep = true;
+ } else {
+ new->trim_right_splicep = false;
+ }
+ }
+ }
+
} else {
+ /* nmatches: Counts matches over whole region including trims */
new->nmatches = (new->alignstart - new->alignend) - new->nmismatches_whole;
+
+ new->alignstart_trim -= new->trim_left;
+ new->alignend_trim += new->trim_right;
+
+ if (novelsplicingp == false) {
+ new->trim_left_splicep = new->trim_right_splicep = false;
+ } else {
+ if (new->trim_left == 0) {
+ new->trim_left_splicep = false;
+ } else {
+ prob1 = Maxent_hr_donor_prob(left + querylength - new->trim_left,chroffset);
+ prob2 = Maxent_hr_antiacceptor_prob(left + querylength - new->trim_left,chroffset);
+ /* fprintf(stderr,"At %u, donor prob %f, antiacceptor prob %f\n",left + querylength - new->trim_left,prob1,prob2); */
+ if (prob1 > 0.90 || prob2 > 0.90) {
+ new->trim_left_splicep = true;
+ } else {
+ new->trim_left_splicep = false;
+ }
+ }
+
+ if (new->trim_right == 0) {
+ new->trim_right_splicep = false;
+ } else {
+ prob1 = Maxent_hr_acceptor_prob(left + new->trim_right,chroffset);
+ prob2 = Maxent_hr_antidonor_prob(left + new->trim_right,chroffset);
+ /* fprintf(stderr,"At %u, acceptor prob %f, antidonor prob %f\n",left+new->trim_right,prob1,prob2); */
+ if (prob1 > 0.90 || prob2 > 0.90) {
+ new->trim_right_splicep = true;
+ } else {
+ new->trim_right_splicep = false;
+ }
+ }
+ }
}
+
/* nmismatches_bothdiff: Counts matches of trimmed region */
if (new->trim_left == 0 && new->trim_right == 0) {
new->nmismatches_bothdiff = nmismatches_whole;
@@ -1796,7 +1837,6 @@ Substring_new (int nmismatches_whole, Chrnum_T chrnum, Univcoord_T chroffset,
debug2(printf("Substring fails general test for goodness with %d matches and %d mismatches\n",
nmatches,new->nmismatches_bothdiff));
Substring_free(&new);
-
return (T) NULL;
}
diff --git a/src/substring.h b/src/substring.h
index 6b10158..3878c72 100644
--- a/src/substring.h
+++ b/src/substring.h
@@ -1,4 +1,4 @@
-/* $Id: substring.h 154023 2014-11-25 03:45:18Z twu $ */
+/* $Id: substring.h 154591 2014-12-04 02:00:32Z twu $ */
#ifndef SUBSTRING_INCLUDED
#define SUBSTRING_INCLUDED
@@ -27,7 +27,8 @@ Substring_setup (bool print_nsnpdiffs_p_in, bool print_snplabels_p_in,
IIT_T splicesites_iit_in, int *splicesites_divint_crosstable_in,
int donor_typeint_in, int acceptor_typeint_in, int trim_mismatch_score_in,
bool novelsplicingp_in, bool knownsplicingp_in,
- bool output_sam_p_in, Mode_T mode_in, Univcoord_T genomelength_in);
+ bool output_sam_p_in, Mode_T mode_in, Univcoord_T genomelength_in,
+ int reject_trimlength_in);
#define T Substring_T
typedef struct T *T;
diff --git a/src/uniqscan.c b/src/uniqscan.c
index 5fb7c85..4a4e39b 100644
--- a/src/uniqscan.c
+++ b/src/uniqscan.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: uniqscan.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: uniqscan.c 158355 2015-02-10 19:08:45Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -451,7 +451,7 @@ uniqueness_scan (bool from_right_p) {
/*barcode_length*/0,/*invertp*/0,/*copy_acc_p*/false,/*skipp*/false);
stage3array = Stage1_single_read(&npaths,&first_absmq,&second_absmq,
queryseq1,indexdb,indexdb2,indexdb_size_threshold,
- genome,floors_array,user_maxlevel_float,subopt_levels,
+ genome,floors_array,user_maxlevel_float,
indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,/*distantsplicing_penalty*/100,min_shortend,
@@ -495,7 +495,7 @@ uniqueness_scan (bool from_right_p) {
/*barcode_length*/0,/*invertp*/0,/*copy_acc_p*/false,/*skipp*/false);
stage3array = Stage1_single_read(&npaths,&first_absmq,&second_absmq,
queryseq1,indexdb,indexdb2,indexdb_size_threshold,
- genome,floors_array,user_maxlevel_float,subopt_levels,
+ genome,floors_array,user_maxlevel_float,
indel_penalty_middle,indel_penalty_end,
allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
localsplicing_penalty,/*distantsplicing_penalty*/100,min_shortend,
@@ -1200,7 +1200,7 @@ main (int argc, char *argv[]) {
genomealt,mode,/*maxpaths_search*/10,/*terminal_threshold*/5,/*terminal_output_minlength*/0,
splicesites,splicetypes,splicedists,nsplicesites,
novelsplicingp,knownsplicingp,distances_observed_p,
- max_middle_insertions,max_middle_deletions,
+ subopt_levels,max_middle_insertions,max_middle_deletions,
shortsplicedist,shortsplicedist_known,shortsplicedist_novelend,min_intronlength,
min_distantsplicing_end_matches,min_distantsplicing_identity,
nullgap,maxpeelback,maxpeelback_distalmedial,
@@ -1213,7 +1213,8 @@ main (int argc, char *argv[]) {
splicing_iit,splicing_divint_crosstable,
donor_typeint,acceptor_typeint,trim_mismatch_score,
novelsplicingp,knownsplicingp,/*output_sam_p*/false,mode,
- Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/false));
+ Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/false),
+ /*reject_trimlength*/1000);
Dynprog_single_setup(/*homopolymerp*/false);
Dynprog_genome_setup(novelsplicingp,splicing_iit,splicing_divint_crosstable,
donor_typeint,acceptor_typeint);
@@ -1223,7 +1224,7 @@ main (int argc, char *argv[]) {
Stage2_setup(/*splicingp*/novelsplicingp == true || knownsplicingp == true,/*cross_species_p*/false,
suboptimal_score_start,suboptimal_score_end,
mode,/*snps_p*/snps_iit ? true : false);
- Pair_setup(trim_mismatch_score,trim_indel_score,/*sam_insert_0M_p*/false,
+ Pair_setup(trim_mismatch_score,trim_indel_score,/*gff3_separators_p*/false,/*sam_insert_0M_p*/false,
/*force_xs_direction_p*/false,/*md_lowercase_variant_p*/false,
/*snps_p*/snps_iit ? true : false,
Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/false),
diff --git a/util/gff3_genes.pl.in b/util/gff3_genes.pl.in
index fa7bef8..eb0fe73 100644
--- a/util/gff3_genes.pl.in
+++ b/util/gff3_genes.pl.in
@@ -87,10 +87,7 @@ while (defined($line = <>)) {
if (!defined($gene_name) || $gene_name !~ /\S/) {
$gene_name = $last_transcript_id;
}
- if (!defined($chr) || $chr !~ /\S/) {
- $chr = $fields[0];
- }
-
+ $chr = $fields[0];
@exons = ();
@CDS_regions = ();
diff --git a/util/gff3_introns.pl.in b/util/gff3_introns.pl.in
index 1e83593..73699e3 100755
--- a/util/gff3_introns.pl.in
+++ b/util/gff3_introns.pl.in
@@ -73,10 +73,7 @@ if (defined($opt_d)) {
if (!defined($gene_name) || $gene_name !~ /\S/) {
$gene_name = $last_transcript_id;
}
- if (!defined($chr) || $chr !~ /\S/) {
- $chr = $fields[0];
- }
-
+ $chr = $fields[0];
@exons = ();
@CDS_regions = ();
diff --git a/util/gff3_splicesites.pl.in b/util/gff3_splicesites.pl.in
index e626b84..93ba453 100755
--- a/util/gff3_splicesites.pl.in
+++ b/util/gff3_splicesites.pl.in
@@ -73,10 +73,7 @@ if (defined($opt_d)) {
if (!defined($gene_name) || $gene_name !~ /\S/) {
$gene_name = $last_transcript_id;
}
- if (!defined($chr) || $chr !~ /\S/) {
- $chr = $fields[0];
- }
-
+ $chr = $fields[0];
@exons = ();
@CDS_regions = ();
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git
More information about the debian-med-commit
mailing list