[med-svn] [gmap] 01/04: Imported Upstream version 2014-07-28
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Thu Aug 7 13:49:53 UTC 2014
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository gmap.
commit 464532893758675fc4beaf9b7ad075f8ec3cf87c
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Wed Jul 30 09:52:56 2014 +0200
Imported Upstream version 2014-07-28
---
ChangeLog | 85 ++++
VERSION | 2 +-
configure | 24 +-
src/atoiindex.c | 18 +-
src/cmetindex.c | 18 +-
src/dynprog.h | 10 +-
src/dynprog_genome.c | 246 +++++-----
src/dynprog_simd.c | 271 +++++++++--
src/gmap.c | 5 +-
src/pair.c | 46 +-
src/pair.h | 7 +-
src/pairpool.c | 8 +-
src/stage1hr.c | 10 +-
src/stage2.c | 69 ++-
src/stage3.c | 529 ++++++++++++++-------
src/stage3.h | 3 +-
src/stage3hr.c | 1099 ++++++++++++-------------------------------
src/stage3hr.h | 10 +-
util/gff3_genes.pl.in | 47 +-
util/gff3_introns.pl.in | 30 +-
util/gff3_splicesites.pl.in | 34 +-
21 files changed, 1370 insertions(+), 1201 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index a4a2339..d97adf4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,88 @@
+2014-07-29 twu
+
+ * ltmain.sh: Updated from 2.2.6 to 2.2.6b
+
+ * config.sub: Updated from 2008 version to 2009 version
+
+ * config.guess: Updated from 2008 version to 2009 version
+
+ * INSTALL: Updated from 2007 version to 2009 version
+
+ * archive.html, index.html: Made changes for new version
+
+ * gff3_genes.pl.in, gff3_introns.pl.in, gff3_splicesites.pl.in: Handling
+ GFF3 files that have both exon and CDS fields
+
+2014-07-22 twu
+
+ * stage3hr.c: Restored random behavior for equivalent alignments
+
+ * VERSION: Updated version number
+
+ * atoiindex.c, cmetindex.c: Making correct calls to
+ Sarray_discriminating_chars
+
+ * dynprog_simd.c: Added code for having no initial gap penalties, to be used
+ for bam_indelfix
+
+2014-07-19 twu
+
+ * dynprog_simd.c: Improvements to debugging procedures to handle 3-digit
+ indices
+
+2014-07-17 twu
+
+ * gmap.c: Using new interface to Stage3_compute
+
+ * stage3hr.h: Added interfaces for Stage3end_shortexonA_distance and
+ Stage3end_shortexonD_distance
+
+ * stage3hr.c: Added hook for amb_prob in Stage3end_new_gmap. Removed
+ penalty for ambig end lengths if amb_prob > 0.9. Fixed pointer advances
+ in removing bad superstretches.
+
+ * stage3.c: Made fixes to choice of cdna_direction: using presence/absence
+ of intron types, rather than number, and decreased binomial threshold for
+ alignments around intron. Fixed handling of multiple start or end paths
+ by joining them at the outset.
+
+ * stage2.c: Allowing for multiple end cells from each rootposition
+
+ * dynprog_genome.c: Altered decision-making between best alignment and
+ probability-based alignment
+
+ * dynprog_simd.c: Added debugging statements
+
+ * dynprog.h: Made open gap penalties more uniform for different defect rates
+
+ * pair.c: Fixed calculation of nmatches at end for Pair_trim_ends
+
+2014-07-16 twu
+
+ * pairpool.c: Enhanced debugging statement
+
+ * stage3.h: Stage3_compute now returns ambig_prob_5 and ambig_prob_3
+
+ * stage3.c: For ambiguous ends, no longer calling clean_pairs_end5,
+ clean_path_end3, trim_end5_exon_indels, or trim_end3_exon_indels
+
+ * stage1hr.c: Passing amb_prob_5 and amb_prob_3 to Stage3end_new_gmap
+
+ * pair.c, pair.h: In Pair_trim_ends, not trimming ambiguous ends
+
+2014-07-15 twu
+
+ * stage3.c: In trim_end5_exon_indels and trim_end3_exon_indels, counting
+ trimmed ends as mismatches and handling large indels differently from
+ small indels. Added hooks for ambig_prob.
+
+ * pair.c, pair.h, stage3hr.c: Eliminating ambig_end_nmatches from
+ consideration in Pair_nmismatches_region
+
+ * stage3hr.c: Restored eventrim algorithm from revision 140363, which had
+ only a single eventrim calculation and not separate calculations for the
+ two ends.
+
2014-07-04 twu
* VERSION: Updated version number
diff --git a/VERSION b/VERSION
index af5e428..a046c08 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2014-07-04
\ No newline at end of file
+2014-07-28
\ No newline at end of file
diff --git a/configure b/configure
index c6ad8ce..a28e5dc 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.63 for gmap 2014-07-04.
+# Generated by GNU Autoconf 2.63 for gmap 2014-07-28.
#
# Report bugs to <Thomas Wu <twu at gene.com>>.
#
@@ -745,8 +745,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='gmap'
PACKAGE_TARNAME='gmap'
-PACKAGE_VERSION='2014-07-04'
-PACKAGE_STRING='gmap 2014-07-04'
+PACKAGE_VERSION='2014-07-28'
+PACKAGE_STRING='gmap 2014-07-28'
PACKAGE_BUGREPORT='Thomas Wu <twu at gene.com>'
ac_unique_file="src/gmap.c"
@@ -1501,7 +1501,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures gmap 2014-07-04 to adapt to many kinds of systems.
+\`configure' configures gmap 2014-07-28 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1572,7 +1572,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of gmap 2014-07-04:";;
+ short | recursive ) echo "Configuration of gmap 2014-07-28:";;
esac
cat <<\_ACEOF
@@ -1695,7 +1695,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-gmap configure 2014-07-04
+gmap configure 2014-07-28
generated by GNU Autoconf 2.63
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1709,7 +1709,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by gmap $as_me 2014-07-04, which was
+It was created by gmap $as_me 2014-07-28, which was
generated by GNU Autoconf 2.63. Invocation command line was
$ $0 $@
@@ -2079,8 +2079,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
{ $as_echo "$as_me:$LINENO: checking package version" >&5
$as_echo_n "checking package version... " >&6; }
-{ $as_echo "$as_me:$LINENO: result: 2014-07-04" >&5
-$as_echo "2014-07-04" >&6; }
+{ $as_echo "$as_me:$LINENO: result: 2014-07-28" >&5
+$as_echo "2014-07-28" >&6; }
### Read defaults
@@ -3979,7 +3979,7 @@ fi
# Define the identity of the package.
PACKAGE=gmap
- VERSION=2014-07-04
+ VERSION=2014-07-28
cat >>confdefs.h <<_ACEOF
@@ -23689,7 +23689,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by gmap $as_me 2014-07-04, which was
+This file was extended by gmap $as_me 2014-07-28, which was
generated by GNU Autoconf 2.63. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -23752,7 +23752,7 @@ Report bugs to <bug-autoconf at gnu.org>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_version="\\
-gmap config.status 2014-07-04
+gmap config.status 2014-07-28
configured by $0, generated by GNU Autoconf 2.63,
with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
diff --git a/src/atoiindex.c b/src/atoiindex.c
index d677b53..914f03d 100644
--- a/src/atoiindex.c
+++ b/src/atoiindex.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: atoiindex.c 133760 2014-04-20 05:16:56Z twu $";
+static char rcsid[] = "$Id: atoiindex.c 142098 2014-07-22 03:11:00Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -988,7 +988,6 @@ main (int argc, char *argv[]) {
FREE(indexijptrsfile);
SA = (UINT4 *) Access_mmap(&sa_fd,&sa_len,sarrayfile,sizeof(UINT4),/*randomp*/false);
- FREE(sarrayfile);
#if 0
/* Not needed if we already have gbuffer */
@@ -1001,6 +1000,8 @@ main (int argc, char *argv[]) {
#endif
lcp = Sarray_compute_lcp_from_genome(SA,gbuffer,/*n*/genomelength);
+ munmap((void *) SA,sa_len);
+ close(sa_fd);
FREE(gbuffer);
#if 0
Genome_free(&genomebits);
@@ -1034,11 +1035,10 @@ main (int argc, char *argv[]) {
FREE(lcpexcfile);
/* Compute discriminating chars (DC) array */
- discrim_chars = Sarray_discriminating_chars(&nbytes,SA,genomecomp,lcp_bytes,lcp_guide,
+ discrim_chars = Sarray_discriminating_chars(&nbytes,sarrayfile,genomecomp,lcp_bytes,lcp_guide,
lcp_exceptions,/*guide_interval*/1024,/*n*/genomelength,
AG_CHARTABLE);
- munmap((void *) SA,sa_len);
- close(sa_fd);
+ FREE(sarrayfile);
fprintf(stderr,"Building child array\n");
/* Compute child array (relative values) */
@@ -1129,7 +1129,6 @@ main (int argc, char *argv[]) {
FREE(indexijptrsfile);
SA = (UINT4 *) Access_mmap(&sa_fd,&sa_len,sarrayfile,sizeof(UINT4),/*randomp*/false);
- FREE(sarrayfile);
#if 0
/* Not needed if we already have gbuffer */
@@ -1142,6 +1141,8 @@ main (int argc, char *argv[]) {
#endif
lcp = Sarray_compute_lcp_from_genome(SA,gbuffer,/*n*/genomelength);
+ munmap((void *) SA,sa_len);
+ close(sa_fd);
FREE(gbuffer);
#if 0
Genome_free(&genomebits);
@@ -1175,11 +1176,10 @@ main (int argc, char *argv[]) {
FREE(lcpexcfile);
/* Compute discriminating chars (DC) array */
- discrim_chars = Sarray_discriminating_chars(&nbytes,SA,genomecomp,lcp_bytes,lcp_guide,
+ discrim_chars = Sarray_discriminating_chars(&nbytes,sarrayfile,genomecomp,lcp_bytes,lcp_guide,
lcp_exceptions,/*guide_interval*/1024,/*n*/genomelength,
TC_CHARTABLE);
- munmap((void *) SA,sa_len);
- close(sa_fd);
+ FREE(sarrayfile);
fprintf(stderr,"Building child array\n");
/* Compute child array (relative values) */
diff --git a/src/cmetindex.c b/src/cmetindex.c
index 728b057..4c7b850 100644
--- a/src/cmetindex.c
+++ b/src/cmetindex.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: cmetindex.c 133760 2014-04-20 05:16:56Z twu $";
+static char rcsid[] = "$Id: cmetindex.c 142098 2014-07-22 03:11:00Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -991,7 +991,6 @@ main (int argc, char *argv[]) {
FREE(indexijptrsfile);
SA = (UINT4 *) Access_mmap(&sa_fd,&sa_len,sarrayfile,sizeof(UINT4),/*randomp*/false);
- FREE(sarrayfile);
#if 0
/* Not needed if we already have gbuffer */
@@ -1004,6 +1003,8 @@ main (int argc, char *argv[]) {
#endif
lcp = Sarray_compute_lcp_from_genome(SA,gbuffer,/*n*/genomelength);
+ munmap((void *) SA,sa_len);
+ close(sa_fd);
FREE(gbuffer);
#if 0
Genome_free(&genomebits);
@@ -1037,11 +1038,10 @@ main (int argc, char *argv[]) {
FREE(lcpexcfile);
/* Compute discriminating chars (DC) array */
- discrim_chars = Sarray_discriminating_chars(&nbytes,SA,genomecomp,lcp_bytes,lcp_guide,
+ discrim_chars = Sarray_discriminating_chars(&nbytes,sarrayfile,genomecomp,lcp_bytes,lcp_guide,
lcp_exceptions,/*guide_interval*/1024,/*n*/genomelength,
CT_CHARTABLE);
- munmap((void *) SA,sa_len);
- close(sa_fd);
+ FREE(sarrayfile);
fprintf(stderr,"Building child array\n");
/* Compute child array (relative values) */
@@ -1131,7 +1131,6 @@ main (int argc, char *argv[]) {
FREE(indexijptrsfile);
SA = (UINT4 *) Access_mmap(&sa_fd,&sa_len,sarrayfile,sizeof(UINT4),/*randomp*/false);
- FREE(sarrayfile);
#if 0
/* Not needed if we already have gbuffer */
@@ -1144,6 +1143,8 @@ main (int argc, char *argv[]) {
#endif
lcp = Sarray_compute_lcp_from_genome(SA,gbuffer,/*n*/genomelength);
+ munmap((void *) SA,sa_len);
+ close(sa_fd);
FREE(gbuffer);
#if 0
Genome_free(&genomebits);
@@ -1177,11 +1178,10 @@ main (int argc, char *argv[]) {
FREE(lcpexcfile);
/* Compute discriminating chars (DC) array */
- discrim_chars = Sarray_discriminating_chars(&nbytes,SA,genomecomp,lcp_bytes,lcp_guide,
+ discrim_chars = Sarray_discriminating_chars(&nbytes,sarrayfile,genomecomp,lcp_bytes,lcp_guide,
lcp_exceptions,/*guide_interval*/1024,/*n*/genomelength,
GA_CHARTABLE);
- munmap((void *) SA,sa_len);
- close(sa_fd);
+ FREE(sarrayfile);
fprintf(stderr,"Building child array\n");
/* Compute child array (relative values) */
diff --git a/src/dynprog.h b/src/dynprog.h
index 70a2933..4077f0a 100644
--- a/src/dynprog.h
+++ b/src/dynprog.h
@@ -1,4 +1,4 @@
-/* $Id: dynprog.h 138110 2014-06-04 19:34:22Z twu $ */
+/* $Id: dynprog.h 141804 2014-07-17 02:20:36Z twu $ */
#ifndef DYNPROG_INCLUDED
#define DYNPROG_INCLUDED
@@ -31,16 +31,16 @@ typedef enum {HIGHQ, MEDQ, LOWQ, ENDQ} Mismatchtype_T;
#define DEFECT_MEDQ 0.014
#define SINGLE_OPEN_HIGHQ -8
-#define SINGLE_OPEN_MEDQ -6
-#define SINGLE_OPEN_LOWQ -4
+#define SINGLE_OPEN_MEDQ -7
+#define SINGLE_OPEN_LOWQ -6
#define SINGLE_EXTEND_HIGHQ -3
#define SINGLE_EXTEND_MEDQ -2
#define SINGLE_EXTEND_LOWQ -1
#define PAIRED_OPEN_HIGHQ -8
-#define PAIRED_OPEN_MEDQ -6
-#define PAIRED_OPEN_LOWQ -4
+#define PAIRED_OPEN_MEDQ -7
+#define PAIRED_OPEN_LOWQ -6
#define PAIRED_EXTEND_HIGHQ -3
#define PAIRED_EXTEND_MEDQ -2
diff --git a/src/dynprog_genome.c b/src/dynprog_genome.c
index 5f6de4f..d66a7c8 100644
--- a/src/dynprog_genome.c
+++ b/src/dynprog_genome.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: dynprog_genome.c 138118 2014-06-04 20:28:58Z twu $";
+static char rcsid[] = "$Id: dynprog_genome.c 141806 2014-07-17 02:41:31Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -29,6 +29,7 @@ static char rcsid[] = "$Id: dynprog_genome.c 138118 2014-06-04 20:28:58Z twu $";
#include "complement.h"
#include "maxent.h"
#include "maxent_hr.h"
+#include "dynprog.h" /* For parameters */
#include "dynprog_simd.h"
@@ -83,6 +84,7 @@ static char rcsid[] = "$Id: dynprog_genome.c 138118 2014-06-04 20:28:58Z twu $";
#define PROB_CEILING 0.85
#define PROB_FLOOR 0.75
+#define PROB_BAD 0.50
/* Prefer alternate intron to other non-canonicals, but don't
introduce mismatches or gaps to identify */
@@ -1018,8 +1020,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -1027,6 +1030,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -1108,8 +1114,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -1117,6 +1124,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -1176,8 +1186,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -1185,6 +1196,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -1262,8 +1276,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -1271,6 +1286,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -1330,8 +1348,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -1339,6 +1358,9 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -1382,36 +1404,24 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
}
- debug(printf("bestscore %d (bestprob %f) vs bestscore_with_prob %d (bestprob_trunc %f)\n",
- bestscore,bestprob,bestscore_with_prob,bestprob_trunc));
- if (left_probabilities[bestcL_with_prob] < PROB_CEILING || right_probabilities[bestcR_with_prob] < PROB_CEILING) {
- /* Use score without prob */
+ debug(printf("SIMD 8. bestscore %d (bestprob %f) vs bestscore_with_prob %d (bestprob_trunc %f, actually %f and %f)\n",
+ bestscore,bestprob,bestscore_with_prob,bestprob_trunc,left_probabilities[bestcL_with_prob],right_probabilities[bestcR_with_prob]));
+ if (bestprob > 2*PROB_CEILING) {
+ /* Probability is good with best alignment, so take that */
+ debug(printf("Best alignment has good probability\n"));
+ } else if (left_probabilities[bestcL_with_prob] < PROB_CEILING && right_probabilities[bestcR_with_prob] < PROB_CEILING) {
+ /* Probability-based solution is bad, so use alignment */
+ debug(printf("Probability-based solution is bad\n"));
+ } else if (bestscore_with_prob < bestscore - 9) {
+ debug(printf("Probability-based solution requires very bad alignment\n"));
} else {
- if (defect_rate < DEFECT_HIGHQ) {
- if (bestscore_with_prob > bestscore - 15) {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
-
- } else if (defect_rate < DEFECT_MEDQ) {
- if (bestscore_with_prob > bestscore - 25) {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
-
- } else {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
+ /* Best alignment yields bad probability, and probability-based alignment yields good probability, so switch */
+ debug(printf("Switch to probability-based solution\n"));
+ *bestcL = bestcL_with_prob;
+ *bestcR = bestcR_with_prob;
+ *bestrL = bestrL_with_prob;
+ *bestrR = bestrR_with_prob;
+ bestscore = bestscore_with_prob;
}
scoreI = intron_score(&introntype,leftdi[*bestcL],rightdi[*bestcR],
@@ -1958,8 +1968,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -1967,6 +1978,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2048,8 +2062,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -2057,6 +2072,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2116,8 +2134,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -2125,6 +2144,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2202,8 +2224,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -2211,6 +2234,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2270,8 +2296,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -2279,6 +2306,9 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2322,36 +2352,24 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
}
- debug(printf("bestscore %d (bestprob %f) vs bestscore_with_prob %d (bestprob_trunc %f)\n",
- bestscore,bestprob,bestscore_with_prob,bestprob_trunc));
- if (left_probabilities[bestcL_with_prob] < PROB_CEILING || right_probabilities[bestcR_with_prob] < PROB_CEILING) {
- /* Use score without prob */
+ debug(printf("SIMD 16. bestscore %d (bestprob %f) vs bestscore_with_prob %d (bestprob_trunc %f, actually %f and %f)\n",
+ bestscore,bestprob,bestscore_with_prob,bestprob_trunc,left_probabilities[bestcL_with_prob],right_probabilities[bestcR_with_prob]));
+ if (bestprob > 2*PROB_CEILING) {
+ /* Probability is good with best alignment, so take that */
+ debug(printf("Best alignment has good probability\n"));
+ } else if (left_probabilities[bestcL_with_prob] < PROB_CEILING && right_probabilities[bestcR_with_prob] < PROB_CEILING) {
+ /* Probability-based solution is bad, so use alignment */
+ debug(printf("Probability-based solution is bad\n"));
+ } else if (bestscore_with_prob < bestscore - 9) {
+ debug(printf("Probability-based solution requires very bad alignment\n"));
} else {
- if (defect_rate < DEFECT_HIGHQ) {
- if (bestscore_with_prob > bestscore - 15) {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
-
- } else if (defect_rate < DEFECT_MEDQ) {
- if (bestscore_with_prob > bestscore - 25) {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
-
- } else {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
+ /* Best alignment yields bad probability, and probability-based alignment yields good probability, so switch */
+ debug(printf("Switch to probability-based solution\n"));
+ *bestcL = bestcL_with_prob;
+ *bestcR = bestcR_with_prob;
+ *bestrL = bestrL_with_prob;
+ *bestrR = bestrR_with_prob;
+ bestscore = bestscore_with_prob;
}
scoreI = intron_score(&introntype,leftdi[*bestcL],rightdi[*bestcR],
@@ -2748,8 +2766,9 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -2757,6 +2776,9 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2837,8 +2859,9 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -2846,6 +2869,9 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2922,8 +2948,9 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
cdna_direction,canonical_reward,finalp);
if ((score = scoreL + scoreI + scoreR) > bestscore) {
- debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore)\n",
- cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR));
+ debug3(printf("No prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
bestscore = score;
*bestrL = rL;
*bestrR = rR;
@@ -2931,6 +2958,9 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
*bestcR = cR;
bestprob = probL + probR;
} else if (score == bestscore && probL + probR > bestprob) {
+ debug3(printf("Improved prob: At %d left to %d right, score is (%d)+(%d)+(%d) = %d (bestscore, prob %f + %f)\n",
+ cL,cR,scoreL,scoreI,scoreR,scoreL+scoreI+scoreR,probL,probR));
+ debug3(printf("probL %f, probR %f\n",left_probabilities[cL],right_probabilities[cR]));
*bestrL = rL;
*bestrR = rR;
*bestcL = cL;
@@ -2974,36 +3004,24 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
}
- debug(printf("bestscore %d (bestprob %f) vs bestscore_with_prob %d (bestprob_trunc %f)\n",
- bestscore,bestprob,bestscore_with_prob,bestprob_trunc));
- if (left_probabilities[bestcL_with_prob] < PROB_CEILING || right_probabilities[bestcR_with_prob] < PROB_CEILING) {
- /* Use score without prob */
+ debug(printf("Non-SIMD. bestscore %d (bestprob %f) vs bestscore_with_prob %d (bestprob_trunc %f, actually %f and %f)\n",
+ bestscore,bestprob,bestscore_with_prob,bestprob_trunc,left_probabilities[bestcL_with_prob],right_probabilities[bestcR_with_prob]));
+ if (bestprob > 2*PROB_CEILING) {
+ /* Probability is good with best alignment, so take that */
+ debug(printf("Best alignment has good probability\n"));
+ } else if (left_probabilities[bestcL_with_prob] < PROB_CEILING && right_probabilities[bestcR_with_prob] < PROB_CEILING) {
+ /* Probability-based solution is bad, so use alignment */
+ debug(printf("Probability-based solution is bad\n"));
+ } else if (bestscore_with_prob < bestscore - 9) {
+ debug(printf("Probability-based solution requires very bad alignment\n"));
} else {
- if (defect_rate < DEFECT_HIGHQ) {
- if (bestscore_with_prob > bestscore - 15) {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
-
- } else if (defect_rate < DEFECT_MEDQ) {
- if (bestscore_with_prob > bestscore - 25) {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
-
- } else {
- *bestcL = bestcL_with_prob;
- *bestcR = bestcR_with_prob;
- *bestrL = bestrL_with_prob;
- *bestrR = bestrR_with_prob;
- bestscore = bestscore_with_prob;
- }
+ /* Best alignment yields bad probability, and probability-based alignment yields good probability, so switch */
+ debug(printf("Switch to probability-based solution\n"));
+ *bestcL = bestcL_with_prob;
+ *bestcR = bestcR_with_prob;
+ *bestrL = bestrL_with_prob;
+ *bestrR = bestrR_with_prob;
+ bestscore = bestscore_with_prob;
}
scoreI = intron_score(&introntype,leftdi[*bestcL],rightdi[*bestcR],
diff --git a/src/dynprog_simd.c b/src/dynprog_simd.c
index 0cca861..d02e082 100644
--- a/src/dynprog_simd.c
+++ b/src/dynprog_simd.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: dynprog_simd.c 140650 2014-07-04 01:15:48Z twu $";
+static char rcsid[] = "$Id: dynprog_simd.c 142097 2014-07-22 03:10:37Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -21,6 +21,8 @@ static char rcsid[] = "$Id: dynprog_simd.c 140650 2014-07-04 01:15:48Z twu $";
#include "assert.h"
+/* #define NO_INITIAL_GAP_PENALTY 1 -- Needed for bam_indelfix */
+
#define LAZY_INDEL 1 /* Don't advance to next coordinate on final indel, since could go over chromosome bounds. */
/* Row 0 and column 0 directions */
@@ -281,39 +283,83 @@ Matrix16_print (Score16_T **matrix, int rlength, int glength, char *rsequence,
_mm_lfence();
/* j */
- printf(" "); /* For i */
+ if (rlength >= 100) {
+ printf(" ");
+ } else {
+ printf(" "); /* For i */
+ }
printf(" ");
- for (j = 0; j <= glength; ++j) {
- printf(" %2d ",j);
+ if (glength >= 100) {
+ for (j = 0; j <= glength; ++j) {
+ printf(" %3d ",j);
+ }
+ } else {
+ for (j = 0; j <= glength; ++j) {
+ printf(" %2d ",j);
+ }
}
printf("\n");
if (gsequence) {
- printf(" "); /* For i */
+ if (rlength >= 100) {
+ printf(" ");
+ } else {
+ printf(" "); /* For i */
+ }
printf(" ");
- for (j = 0; j <= glength; ++j) {
- if (j == 0) {
- printf(" ");
- } else {
- printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
+ if (glength >= 100) {
+ for (j = 0; j <= glength; ++j) {
+ if (j == 0) {
+ printf(" ");
+ } else {
+ printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
+ }
+ }
+ } else {
+ for (j = 0; j <= glength; ++j) {
+ if (j == 0) {
+ printf(" ");
+ } else {
+ printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
+ }
}
}
printf("\n");
}
if (gsequencealt != gsequence) {
- printf(" "); /* For i */
+ if (rlength >= 100) {
+ printf(" ");
+ } else {
+ printf(" "); /* For i */
+ }
printf(" ");
- for (j = 0; j <= glength; ++j) {
- if (j == 0) {
- printf(" ");
- } else {
- g = revp ? gsequence[-j+1] : gsequence[j-1];
- g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
- if (g == g_alt) {
- printf(" %c ",' ');
+ if (glength >= 100) {
+ for (j = 0; j <= glength; ++j) {
+ if (j == 0) {
+ printf(" ");
} else {
- printf(" %c ",g_alt);
+ g = revp ? gsequence[-j+1] : gsequence[j-1];
+ g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
+ if (g == g_alt) {
+ printf(" %c ",' ');
+ } else {
+ printf(" %c ",g_alt);
+ }
+ }
+ }
+ } else {
+ for (j = 0; j <= glength; ++j) {
+ if (j == 0) {
+ printf(" ");
+ } else {
+ g = revp ? gsequence[-j+1] : gsequence[j-1];
+ g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
+ if (g == g_alt) {
+ printf(" %c ",' ');
+ } else {
+ printf(" %c ",g_alt);
+ }
}
}
}
@@ -321,21 +367,39 @@ Matrix16_print (Score16_T **matrix, int rlength, int glength, char *rsequence,
}
for (i = 0; i <= rlength; ++i) {
- printf("%2d ",i);
+ if (rlength >= 100) {
+ printf("%3d ",i);
+ } else {
+ printf("%2d ",i);
+ }
if (i == 0) {
printf(" ");
} else {
printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
}
- for (j = 0; j <= glength; ++j) {
- if (j < i - lband) {
- printf(" . ");
- } else if (j > i + uband) {
- printf(" . ");
- } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
- printf("%3d ",NEG_INFINITY_DISPLAY);
- } else {
- printf("%3d ",matrix[j][i]);
+ if (glength >= 100) {
+ for (j = 0; j <= glength; ++j) {
+ if (j < i - lband) {
+ printf(" . ");
+ } else if (j > i + uband) {
+ printf(" . ");
+ } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
+ printf(" %3d ",NEG_INFINITY_DISPLAY);
+ } else {
+ printf(" %3d ",matrix[j][i]);
+ }
+ }
+ } else {
+ for (j = 0; j <= glength; ++j) {
+ if (j < i - lband) {
+ printf(" . ");
+ } else if (j > i + uband) {
+ printf(" . ");
+ } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
+ printf("%3d ",NEG_INFINITY_DISPLAY);
+ } else {
+ printf("%3d ",matrix[j][i]);
+ }
}
}
printf("\n");
@@ -651,7 +715,7 @@ Directions16_print (Direction16_T **directions_nogap, Direction16_T **directions
printf(" "); /* For i */
printf(" ");
for (j = 0; j <= glength; ++j) {
- printf(" %2d ",j);
+ printf(" %3d ",j);
}
printf("\n");
@@ -1368,6 +1432,7 @@ Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Ega
#endif
+ debug2(printf("Dynprog_simd_8. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
debug15(printf("Dynprog_simd_8. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
rlength_ceil = (int) ((rlength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
@@ -1380,7 +1445,7 @@ Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Ega
all_128 = _mm_set1_epi8(128);
#endif
- debug(printf("compute_scores_simd_8_bycols (upper): "));
+ debug(printf("Dynprog_simd_8: "));
debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
debug(printf("Query length rounded up to %d\n",rlength_ceil));
@@ -1442,17 +1507,30 @@ Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Ega
/* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
#ifdef HAVE_SSE4_1
pairscores_col0[0] = (Score8_T) 0;
- /* Initializion just to lband causes errors in dir_horiz for Egap */
+ /* Initialization just to lband causes errors in dir_horiz for Egap */
+#ifdef NO_INITIAL_GAP_PENALTY
+ for (r = 1; r < lband_ceil; r++) {
+ pairscores_col0[r] = (Score8_T) 0;
+ }
+#else
for (r = 1; r < lband_ceil; r++) {
pairscores_col0[r] = (Score8_T) NEG_INFINITY_8;
}
+#endif
#else
pairscores_col0[0] = (Score8_T) 0+128;
- /* Initializion just to lband causes errors in dir_horiz for Egap */
+ /* Initialization just to lband causes errors in dir_horiz for Egap */
+#ifdef NO_INITIAL_GAP_PENALTY
+ for (r = 1; r < lband_ceil; r++) {
+ pairscores_col0[r] = (Score8_T) 0+128;
+ }
+#else
for (r = 1; r < lband_ceil; r++) {
pairscores_col0[r] = (Score8_T) NEG_INFINITY_8+128;
}
#endif
+#endif
+
/* Row 0 */
r = 0; na1 = 'N';
@@ -1506,8 +1584,13 @@ Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Ega
/* dir_horiz tests if E >= H. To fill in first column of each
row block with non-diags, make E == H. */
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi8(-extend);
+ H_nogap_r = _mm_set1_epi8(-open-extend);
+#else
E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
+#endif
if ((c = rlo - lband) < 0) {
c = 0;
@@ -1530,7 +1613,11 @@ Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Ega
pairscores_alt_ptr = pairscores[na2_alt];
if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi8(0);
+#else
X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR);
} else {
/* second or greater block of 8 */
@@ -1690,8 +1777,13 @@ Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Ega
/* dir_horiz tests if E > H. To fill in first column of each
row block with non-diags, make E > H. */
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi8(-extend);
+ H_nogap_r = _mm_set1_epi8(-open-extend-1);
+#else
E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
+#endif
if ((c = rlo - lband) < 0) {
c = 0;
@@ -1714,7 +1806,11 @@ Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Ega
pairscores_alt_ptr = pairscores[na2_alt];
if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi8(0);
+#else
X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR);
} else {
/* second or greater block of 8 */
@@ -1992,6 +2088,7 @@ Dynprog_simd_8_upper (Direction8_T ***directions_nogap, Direction8_T ***directio
#endif
+ debug2(printf("Dynprog_simd_8_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
debug15(printf("Dynprog_simd_8_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
rlength_ceil = (int) ((rlength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
@@ -2112,8 +2209,13 @@ Dynprog_simd_8_upper (Direction8_T ***directions_nogap, Direction8_T ***directio
row block with non-diags, could make E == H. But irrelevant,
because these are below the diagonal. */
E_mask = _mm_set1_epi8(1);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi8(-extend);
+ H_nogap_r = _mm_set1_epi8(-open-extend);
+#else
E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
+#endif
for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
score_column = matrix[c];
@@ -2130,7 +2232,11 @@ Dynprog_simd_8_upper (Direction8_T ***directions_nogap, Direction8_T ***directio
if (c == 0) {
X_prev_nogap = _mm_set1_epi8(0);
} else if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi8(0);
+#else
X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR);
} else {
/* second or greater block of 8 */
@@ -2231,8 +2337,13 @@ Dynprog_simd_8_upper (Direction8_T ***directions_nogap, Direction8_T ***directio
row block with non-diags, could make E > H. But irrelevant,
because these are below the diagonal. */
E_mask = _mm_set1_epi8(1);
- E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi8(-extend);
+ H_nogap_r = _mm_set1_epi8(-open-extend-1);
+#else
+ E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
+#endif
for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
score_column = matrix[c];
@@ -2249,7 +2360,11 @@ Dynprog_simd_8_upper (Direction8_T ***directions_nogap, Direction8_T ***directio
if (c == 0) {
X_prev_nogap = _mm_set1_epi8(0);
} else if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi8(0);
+#else
X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR);
} else {
/* second or greater block of 8 */
@@ -2433,6 +2548,7 @@ Dynprog_simd_8_lower (Direction8_T ***directions_nogap, Direction8_T ***directio
#endif
+ debug2(printf("Dynprog_simd_8_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
debug15(printf("Dynprog_simd_8_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
glength_ceil = (int) ((glength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
@@ -2592,8 +2708,13 @@ Dynprog_simd_8_lower (Direction8_T ***directions_nogap, Direction8_T ***directio
/* dir_vert tests if E >= H. To fill in first row of each
column block with non-diags, make E == H. */
E_mask = _mm_set1_epi8(1);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_c_gap = _mm_set1_epi8(-extend);
+ H_nogap_c = _mm_set1_epi8(-open-extend);
+#else
E_c_gap = _mm_set1_epi8(NEG_INFINITY_8);
H_nogap_c = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
+#endif
for (r = clo; r <= chigh + lband && r <= rlength; r++) {
score_column = matrix[r];
@@ -2608,7 +2729,11 @@ Dynprog_simd_8_lower (Direction8_T ***directions_nogap, Direction8_T ***directio
if (r == 0) {
X_prev_nogap = _mm_set1_epi8(0);
} else if (clo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi8(0);
+#else
X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR);
} else {
/* second or greater block of 8 */
@@ -2707,8 +2832,13 @@ Dynprog_simd_8_lower (Direction8_T ***directions_nogap, Direction8_T ***directio
/* dir_vert tests if E > H. To fill in first row of each
column block with non-diags, make E > H. */
E_mask = _mm_set1_epi8(1);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_c_gap = _mm_set1_epi8(-extend);
+ H_nogap_c = _mm_set1_epi8(-open-extend-1);
+#else
E_c_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
H_nogap_c = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
+#endif
for (r = clo; r <= chigh + lband && r <= rlength; r++) {
score_column = matrix[r];
@@ -2723,7 +2853,11 @@ Dynprog_simd_8_lower (Direction8_T ***directions_nogap, Direction8_T ***directio
if (r == 0) {
X_prev_nogap = _mm_set1_epi8(0);
} else if (clo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi8(0);
+#else
X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR);
} else {
/* second or greater block of 8 */
@@ -2900,6 +3034,7 @@ Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_
#endif
+ debug2(printf("Dynprog_simd_16. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
debug15(printf("Dynprog_simd_16. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
rlength_ceil = (int) ((rlength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
@@ -2964,10 +3099,16 @@ Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_
pairscores_col0[0] = (Score16_T) 0;
- /* Initializion just to lband causes errors in dir_horiz for Egap */
+ /* Initialization just to lband causes errors in dir_horiz for Egap */
+#ifdef NO_INITIAL_GAP_PENALTY
+ for (r = 1; r < lband_ceil; r++) {
+ pairscores_col0[r] = (Score16_T) 0;
+ }
+#else
for (r = 1; r < lband_ceil; r++) {
pairscores_col0[r] = (Score16_T) NEG_INFINITY_16;
}
+#endif
r = 0; na1 = 'N';
pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
@@ -3020,8 +3161,13 @@ Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_
/* dir_horiz tests if E >= H. To fill in first column of each
row block with non-diags, make E == H. */
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi16(-extend);
+ H_nogap_r = _mm_set1_epi16(-open-extend);
+#else
E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
+#endif
if ((c = rlo - lband) < 0) {
c = 0;
@@ -3044,7 +3190,11 @@ Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_
pairscores_alt_ptr = pairscores[na2_alt];
if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi16(0);
+#else
X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT);
} else {
/* second or greater block of 16 */
@@ -3184,8 +3334,13 @@ Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_
/* dir_horiz tests if E > H. To fill in first column of each
row block with non-diags, make E > H. */
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi16(-extend);
+ H_nogap_r = _mm_set1_epi16(-open-extend-1);
+#else
E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
+#endif
if ((c = rlo - lband) < 0) {
c = 0;
@@ -3208,7 +3363,11 @@ Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_
pairscores_alt_ptr = pairscores[na2_alt];
if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi16(0);
+#else
X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT);
} else {
/* second or greater block of 16 */
@@ -3435,6 +3594,8 @@ Dynprog_simd_16_upper (Direction16_T ***directions_nogap, Direction16_T ***direc
char na2_single;
#endif
+
+ debug2(printf("Dynprog_simd_16_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
debug15(printf("Dynprog_simd_16_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
rlength_ceil = (int) ((rlength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
@@ -3543,8 +3704,13 @@ Dynprog_simd_16_upper (Direction16_T ***directions_nogap, Direction16_T ***direc
row block with non-diags, could make E == H. But irrelevant,
because these are above the diagonal. */
E_mask = _mm_set1_epi16(1);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi16(-extend);
+ H_nogap_r = _mm_set1_epi16(-open-extend);
+#else
E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
+#endif
for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
score_column = matrix[c];
@@ -3561,7 +3727,11 @@ Dynprog_simd_16_upper (Direction16_T ***directions_nogap, Direction16_T ***direc
if (c == 0) {
X_prev_nogap = _mm_set1_epi16(0);
} else if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi16(0);
+#else
X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT);
} else {
/* second or greater block of 16 */
@@ -3633,8 +3803,13 @@ Dynprog_simd_16_upper (Direction16_T ***directions_nogap, Direction16_T ***direc
row block with non-diags, could make E > H. But irrelevant,
because these are above the diagonal. */
E_mask = _mm_set1_epi16(1);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_r_gap = _mm_set1_epi16(-extend);
+ H_nogap_r = _mm_set1_epi16(-open-extend-1);
+#else
E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
+#endif
for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
score_column = matrix[c];
@@ -3651,7 +3826,11 @@ Dynprog_simd_16_upper (Direction16_T ***directions_nogap, Direction16_T ***direc
if (c == 0) {
X_prev_nogap = _mm_set1_epi16(0);
} else if (rlo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi16(0);
+#else
X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT);
} else {
/* second or greater block of 16 */
@@ -3801,6 +3980,8 @@ Dynprog_simd_16_lower (Direction16_T ***directions_nogap, Direction16_T ***direc
char na2_single;
#endif
+
+ debug2(printf("Dynprog_simd_16_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
debug15(printf("Dynprog_simd_16_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
glength_ceil = (int) ((glength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
@@ -3940,8 +4121,13 @@ Dynprog_simd_16_lower (Direction16_T ***directions_nogap, Direction16_T ***direc
/* dir_vert tests if E >= H. To fill in first row of each
column block with non-diags, make E == H. */
E_mask = _mm_set1_epi16(1);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_c_gap = _mm_set1_epi16(-extend);
+ H_nogap_c = _mm_set1_epi16(-open-extend);
+#else
E_c_gap = _mm_set1_epi16(NEG_INFINITY_16);
H_nogap_c = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
+#endif
for (r = clo; r <= chigh + lband && r <= rlength; r++) {
score_column = matrix[r];
@@ -3956,7 +4142,11 @@ Dynprog_simd_16_lower (Direction16_T ***directions_nogap, Direction16_T ***direc
if (r == 0) {
X_prev_nogap = _mm_set1_epi16(0);
} else if (clo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi16(0);
+#else
X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT);
} else {
/* second or greater block of 16 */
@@ -4026,8 +4216,13 @@ Dynprog_simd_16_lower (Direction16_T ***directions_nogap, Direction16_T ***direc
/* dir_vert tests if E > H. To fill in first row of each
column block with non-diags, make E > H. */
E_mask = _mm_set1_epi16(1);
+#ifdef NO_INITIAL_GAP_PENALTY
+ E_c_gap = _mm_set1_epi16(-extend);
+ H_nogap_c = _mm_set1_epi16(-open-extend-1);
+#else
E_c_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
H_nogap_c = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
+#endif
for (r = clo; r <= chigh + lband && r <= rlength; r++) {
score_column = matrix[r];
@@ -4042,7 +4237,11 @@ Dynprog_simd_16_lower (Direction16_T ***directions_nogap, Direction16_T ***direc
if (r == 0) {
X_prev_nogap = _mm_set1_epi16(0);
} else if (clo == 0) {
+#ifdef NO_INITIAL_GAP_PENALTY
+ X_prev_nogap = _mm_set1_epi16(0);
+#else
X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
+#endif
X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT);
} else {
/* second or greater block of 16 */
diff --git a/src/gmap.c b/src/gmap.c
index 5fce38e..a5a4bdf 100644
--- a/src/gmap.c
+++ b/src/gmap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gmap.c 140368 2014-07-02 00:56:33Z twu $";
+static char rcsid[] = "$Id: gmap.c 141811 2014-07-17 03:06:17Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -942,6 +942,7 @@ update_stage3list (List_T stage3list, bool lowidentityp, Sequence_T queryseq,
int sensedir;
int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
+ double ambig_prob_5, ambig_prob_3;
double defect_rate, min_splice_prob;
double stage3_runtime;
int subseq_offset;
@@ -1002,7 +1003,7 @@ update_stage3list (List_T stage3list, bool lowidentityp, Sequence_T queryseq,
&matches,&nmatches_posttrim,&max_match_length,
&ambig_end_length_5,&ambig_end_length_3,
&ambig_splicetype_5,&ambig_splicetype_3,
- &unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
+ &ambig_prob_5,&ambig_prob_3,&unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
&ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,stage2,
#ifdef PMAP
/*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
diff --git a/src/pair.c b/src/pair.c
index 09e0f19..39dcf83 100644
--- a/src/pair.c
+++ b/src/pair.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: pair.c 135239 2014-05-06 16:12:15Z twu $";
+static char rcsid[] = "$Id: pair.c 141803 2014-07-17 02:12:57Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -6110,22 +6110,27 @@ Pair_array_nmatches_posttrim (struct T *pairarray, int npairs, int pos5, int pos
int
Pair_nmismatches_region (int *nindelbreaks, struct T *pairs, int npairs,
- int trim_left, int trim_right, int querylength) {
+ int trim_left, int trim_right, int start_amb_nmatches, int end_amb_nmatches,
+ int querylength) {
int nmismatches = 0;
bool in_intron = false, indelp = false;
int i = 0;
T this;
*nindelbreaks = 0;
+
+ /* Handle GMAP alignments that are not extended to the end */
+ this = &(pairs[0]);
+ if (this->querypos - start_amb_nmatches < trim_left) {
+ /* Skip */
+ } else {
+ nmismatches += (this->querypos - start_amb_nmatches) - trim_left;
+ }
+
while (i < npairs) {
this = &(pairs[i]);
- if (this->querypos < trim_left) {
- /* Skip */
- } else if (this->querypos >= querylength - trim_right) {
- /* Skip */
- } else if (this->comp == MISMATCH_COMP) {
- nmismatches++;
- } else if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
+ if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
+ /* Count indelbreaks, even if outside of trimmed region */
if (this->genome == ' ') {
/* INSERTION */
while (i < npairs && this->genome == ' ') {
@@ -6144,10 +6149,25 @@ Pair_nmismatches_region (int *nindelbreaks, struct T *pairs, int npairs,
i--;
(*nindelbreaks) += 1;
}
+
+ } else if (this->querypos < trim_left) {
+ /* Skip for counting mismatches */
+ } else if (this->querypos >= querylength - trim_right) {
+ /* Skip for counting mismatches */
+ } else if (this->comp == MISMATCH_COMP) {
+ nmismatches++;
}
i++;
}
+ /* Handle GMAP alignments that are not extended to the end */
+ this = &(pairs[npairs-1]);
+ if (this->querypos + end_amb_nmatches >= (querylength - 1) - trim_right) {
+ /* Skip */
+ } else {
+ nmismatches += (querylength - 1 - trim_right) - (this->querypos + end_amb_nmatches);
+ }
+
return nmismatches;
}
@@ -7868,7 +7888,7 @@ Pair_end_bound (int *cdna_direction, List_T pairs, int breakpoint) {
List_T
-Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs) {
+Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs, int ambig_end_length_5, int ambig_end_length_3) {
List_T trimmed = NULL;
int trim_right = 0, trim_left = -1; /* Needs to be -1 to avoid trimming when pairs is NULL */
int bestscore, score;
@@ -7939,6 +7959,9 @@ Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs) {
if (this == NULL) {
fprintf(stderr,"check for trim_right yields this == NULL\n");
abort();
+ } else if (ambig_end_length_3 > 0) {
+ debug8(printf("Not disturbing ambiguous end on right\n"));
+ trim_right = 0;
} else if (this->protectedp == true) {
debug8(printf("Protected against trim_right\n"));
trim_right = 0;
@@ -8005,6 +8028,9 @@ Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs) {
if (this == NULL) {
fprintf(stderr,"check for trim_left yields this == NULL\n");
abort();
+ } else if (ambig_end_length_5 > 0) {
+ debug8(printf("Not disturbing ambiguous end on left\n"));
+ trim_left = pairi - 1;
} else if (this->protectedp == true) {
debug8(printf("Protected against trim_left\n"));
trim_left = pairi - 1;
diff --git a/src/pair.h b/src/pair.h
index d48a6a1..ffed08a 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -1,4 +1,4 @@
-/* $Id: pair.h 133832 2014-04-21 21:34:20Z twu $ */
+/* $Id: pair.h 141662 2014-07-16 01:30:15Z twu $ */
#ifndef PAIR_INCLUDED
#define PAIR_INCLUDED
@@ -241,7 +241,8 @@ extern int
Pair_array_nmatches_posttrim (struct T *pairs, int npairs, int pos5, int pos3);
extern int
Pair_nmismatches_region (int *nindelbreaks, struct T *pairs, int npairs,
- int trim_left, int trim_right, int querylength);
+ int trim_left, int trim_right, int start_amb_nmatches, int end_amb_nmatches,
+ int querylength);
extern void
Pair_fracidentity_simple (int *matches, int *unknowns, int *mismatches, List_T pairs);
@@ -305,7 +306,7 @@ Pair_end_bound (int *cdna_direction, List_T pairs, int breakpoint);
extern List_T
-Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs);
+Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs, int ambig_end_length_5, int ambig_end_length_3);
#ifdef GSNAP
extern float
diff --git a/src/pairpool.c b/src/pairpool.c
index ed02043..3afa19e 100644
--- a/src/pairpool.c
+++ b/src/pairpool.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: pairpool.c 136057 2014-05-13 20:14:15Z twu $";
+static char rcsid[] = "$Id: pairpool.c 141801 2014-07-16 23:29:17Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -289,11 +289,11 @@ Pairpool_push_copy (List_T list, T this, Pair_T orig) {
memcpy(pair,orig,sizeof(struct Pair_T));
debug(
- printf("Copying %p: %d %d %c %c %c\n",
- pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
+ printf("Copying %p <= %p: %d %d %c %c %c\n",
+ pair,orig,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
);
- if (this->listcellctr >= this->nlistcells) {
+ if (this->listcellctr >= this->nlistcells) {
this->listcellptr = add_new_listcellchunk(this);
} else if ((this->listcellctr % CHUNKSIZE) == 0) {
for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
diff --git a/src/stage1hr.c b/src/stage1hr.c
index 9b1d09c..99bbaed 100644
--- a/src/stage1hr.c
+++ b/src/stage1hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage1hr.c 140368 2014-07-02 00:56:33Z twu $";
+static char rcsid[] = "$Id: stage1hr.c 141663 2014-07-16 01:30:53Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -11663,6 +11663,7 @@ run_gmap (bool *good_start_p, bool *good_end_p, History_T gmap_history,
int matches, unknowns, mismatches, qopens, qindels, topens, tindels;
int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
+ double ambig_prob_5, ambig_prob_3;
int ncanonical, nsemicanonical, nnoncanonical;
int maxintronlen_bound;
@@ -11749,6 +11750,7 @@ run_gmap (bool *good_start_p, bool *good_end_p, History_T gmap_history,
&matches,&nmatches_posttrim,&max_match_length,
&ambig_end_length_5,&ambig_end_length_3,
&ambig_splicetype_5,&ambig_splicetype_3,
+ &ambig_prob_5,&ambig_prob_3,
&unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
&ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,stage2,
#ifdef END_KNOWNSPLICING_SHORTCUT
@@ -11802,7 +11804,8 @@ run_gmap (bool *good_start_p, bool *good_end_p, History_T gmap_history,
/*plusterm*/querylength - 1 - Pair_querypos(&(pairarray[npairs-1])),chrhigh);
if ((hit = Stage3end_new_gmap(nmismatches_whole,nmatches_posttrim,max_match_length,
ambig_end_length_5,ambig_end_length_3,
- ambig_splicetype_5,ambig_splicetype_3,min_splice_prob,
+ ambig_splicetype_5,ambig_splicetype_3,
+ ambig_prob_5,ambig_prob_3,min_splice_prob,
pairarray,npairs,nsegments,nintrons,nindelbreaks,
/*left*/start,/*genomiclength*/end - start + 1,
/*plusp*/watsonp,genestrand,first_read_p,
@@ -11847,7 +11850,8 @@ run_gmap (bool *good_start_p, bool *good_end_p, History_T gmap_history,
/*minusterm*/querylength - 1 - Pair_querypos(&(pairarray[npairs-1])),chroffset);
if ((hit = Stage3end_new_gmap(nmismatches_whole,nmatches_posttrim,max_match_length,
ambig_end_length_5,ambig_end_length_3,
- ambig_splicetype_5,ambig_splicetype_3,min_splice_prob,
+ ambig_splicetype_5,ambig_splicetype_3,
+ ambig_prob_5,ambig_prob_3,min_splice_prob,
pairarray,npairs,nsegments,nintrons,nindelbreaks,
/*left*/end,/*genomiclength*/start - end + 1,
/*plusp*/watsonp,genestrand,first_read_p,
diff --git a/src/stage2.c b/src/stage2.c
index fc5626d..8564117 100644
--- a/src/stage2.c
+++ b/src/stage2.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage2.c 137652 2014-05-30 16:49:12Z twu $";
+static char rcsid[] = "$Id: stage2.c 141807 2014-07-17 02:42:06Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -198,7 +198,7 @@ Stage2_setup (bool splicingp_in, bool cross_species_p,
/* Dynamic programming */
/* Can also define debug9(x) as: if (querypos == XX) {x;} */
#ifdef DEBUG9
-#define debug9(x) if (querypos == 39) {x;}
+#define debug9(x) if (querypos == 67) {x;}
#else
#define debug9(x)
#endif
@@ -2273,7 +2273,7 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
List_T celllist = NULL;
int querypos, hit;
int rootposition, last_rootposition;
- int threshold_score;
+ int threshold_score, best_score_for_root;
int ngood, ncells, i, k;
if (bestscore > 2*suboptimal_score_end) {
@@ -2327,14 +2327,25 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
k = 0;
last_rootposition = -1;
+ best_score_for_root = -1;
for (i = 0; i < ncells; i++) {
- if (cells[i]->rootposition == last_rootposition) {
- /* Cell_free(&(cells[i])); -- no need with cellpool */
- } else {
+ if (cells[i]->rootposition != last_rootposition) {
debug11(printf("Pushing rootposition %d, trace #%d, score %d, pos %d, hit %d\n",
cells[i]->rootposition,cells[i]->tracei,cells[i]->score,cells[i]->querypos,cells[i]->hit));
sorted[k++] = cells[i];
last_rootposition = cells[i]->rootposition;
+ best_score_for_root = cells[i]->score;
+
+ } else if (cells[i]->querypos == best_score_for_root) {
+ debug11(printf("Equivalent cell for rootposition %d, trace #%d, score %d, pos %d, hit %d\n",
+ cells[i]->rootposition,cells[i]->tracei,cells[i]->score,cells[i]->querypos,cells[i]->hit));
+ sorted[k++] = cells[i];
+ /* last_rootposition = cells[i]->rootposition;*/
+ /* best_score_for_root = cells[i]->score; */
+
+ } else {
+ /* Cell_free(&(cells[i])); -- no need with cellpool */
+
}
}
debug11(printf("\n"));
@@ -2356,6 +2367,7 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
List_T celllist = NULL;
int querypos, hit;
int rootposition, last_rootposition;
+ int best_score_for_root;
int ngood, ncells, i, k;
ncells = 0;
@@ -2371,6 +2383,7 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
}
}
+ debug12(printf("Have %d cells\n",ncells));
if (ncells == 0) {
*nunique = 0;
return (Cell_T *) NULL;
@@ -2390,14 +2403,26 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
k = 0;
last_rootposition = -1;
+ best_score_for_root = -1;
for (i = 0; i < ncells; i++) {
- if (cells[i]->rootposition == last_rootposition) {
- /* Cell_free(&(cells[i])); -- no need with cellpool */
- } else {
+ if (cells[i]->rootposition != last_rootposition) {
+ /* Take best cell at this rootposition */
debug11(printf("Pushing rootposition %d, score %d, pos %d, hit %d\n",
cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
sorted[k++] = cells[i];
last_rootposition = cells[i]->rootposition;
+ best_score_for_root = cells[i]->score;
+
+ } else if (cells[i]->score == best_score_for_root) {
+ /* Take equivalent cell for this rootposition */
+ debug11(printf("Pushing equivalent end for rootposition %d, score %d, pos %d, hit %d\n",
+ cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
+ sorted[k++] = cells[i];
+ /* last_rootposition = cells[i]->rootposition; */
+ /* best_score_for_root = cells[i]->score; */
+
+ } else {
+ /* Cell_free(&(cells[i])); -- no need with cellpool */
}
}
debug11(printf("\n"));
@@ -2419,7 +2444,7 @@ Linkmatrix_get_cells_both (int *nunique, struct Link_T **links, int querystart,
List_T celllist = NULL;
int querypos, hit;
int rootposition, last_rootposition;
- int threshold_score;
+ int threshold_score, best_score_for_root;
int ngood, ncells, i, k;
if (bestscore > 2*suboptimal_score_end) {
@@ -2469,6 +2494,7 @@ Linkmatrix_get_cells_both (int *nunique, struct Link_T **links, int querystart,
}
}
+ debug12(printf("Have %d cells\n",ncells));
if (ncells == 0) {
*nunique = 0;
return (Cell_T *) NULL;
@@ -2488,14 +2514,26 @@ Linkmatrix_get_cells_both (int *nunique, struct Link_T **links, int querystart,
k = 0;
last_rootposition = -1;
+ best_score_for_root = -1;
for (i = 0; i < ncells; i++) {
- if (cells[i]->rootposition == last_rootposition) {
- /* Cell_free(&(cells[i])); -- no need with cellpool */
- } else {
+ if (cells[i]->rootposition != last_rootposition) {
+ /* Take best cell at this rootposition */
debug11(printf("rootposition %d, score %d, pos %d, hit %d\n",
cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
sorted[k++] = cells[i];
last_rootposition = cells[i]->rootposition;
+ best_score_for_root = cells[i]->score;
+
+ } else if (cells[i]->score == best_score_for_root) {
+ /* Take equivalent end cell for this rootposition */
+ debug11(printf("equivalent end for rootposition %d, score %d, pos %d, hit %d\n",
+ cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
+ sorted[k++] = cells[i];
+ /* last_rootposition = cells[i]->rootposition; */
+ /* best_score_for_root = cells[i]->score; */
+
+ } else {
+ /* Cell_free(&(cells[i])); -- no need with cellpool */
}
}
debug11(printf("\n"));
@@ -3038,6 +3076,7 @@ traceback_one (int querypos, int hit, struct Link_T **links, Chrpos_T **mappings
}
debug3(printf("%d %d %d %d 3\n",prev_querypos,prevhit,querypos,hit));
}
+ debug0(printf("Done\n\n"));
return path;
}
@@ -3123,6 +3162,7 @@ traceback_one_snps (int querypos, int hit, struct Link_T **links, Chrpos_T **map
}
debug3(printf("%d %d %d %d 3\n",prev_querypos,prevhit,querypos,hit));
}
+ debug0(printf("Done\n\n"));
return path;
}
@@ -4793,7 +4833,7 @@ Stage2_compute (int *stage2_source, int *stage2_indexsize,
/* fprintf(stderr,"%d starts\n",List_length(start_paths)); */
if (List_length(start_paths) == 1) {
path = (List_T) List_head(start_paths);
- debug5(printf("Converting single start\n"));
+ debug5(printf("Converting single start\n"));
if (snps_p == true) {
pairs = convert_to_nucleotides_snps(path,
#ifndef PMAP
@@ -4813,6 +4853,7 @@ Stage2_compute (int *stage2_source, int *stage2_indexsize,
middle = Pairpool_join_end5(middle,path,pairpool,/*copy_end_p*/false);
debug0(printf("ATTACHING SINGLE START TO MIDDLE\n"));
debug0(Pair_dump_list(middle,true));
+
} else {
debug0(i = 0);
for (q = start_paths; q != NULL; q = List_next(q)) {
diff --git a/src/stage3.c b/src/stage3.c
index dfae4ce..bb8dad9 100644
--- a/src/stage3.c
+++ b/src/stage3.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3.c 140368 2014-07-02 00:56:33Z twu $";
+static char rcsid[] = "$Id: stage3.c 141808 2014-07-17 02:44:58Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -2630,74 +2630,78 @@ remove_adjacent_ins_del (bool *foundp, List_T pairs) {
/* Called only by GMAP, because nucleotide matches in PMAP have several ambiguous matches. */
static List_T
-clean_path_end3 (List_T path) {
+clean_path_end3 (List_T path, int ambig_end_length_3) {
Pair_T lastpair;
- debug(printf("clean_path_end3\n"));
- /* Remove any remaining nonmatches, gaps, or indels at 3' end */
- if (path != NULL) {
- lastpair = path->first;
- while (lastpair->gapp || (lastpair->comp != MATCH_COMP && lastpair->comp != DYNPROG_MATCH_COMP && lastpair->comp != AMBIGUOUS_COMP)) {
- debug(printf("Removing nonmatch at 3' end: "));
- debug(Pair_dump_one(lastpair,/*zerobasedp*/true));
- debug(printf("\n"));
- path = Pairpool_pop(path,&lastpair);
- if (path == NULL) {
- return NULL;
- } else {
- lastpair = path->first;
+ if (ambig_end_length_3 == 0) {
+ debug(printf("clean_path_end3\n"));
+ /* Remove any remaining nonmatches, gaps, or indels at 3' end */
+ if (path != NULL) {
+ lastpair = path->first;
+ while (lastpair->gapp || (lastpair->comp != MATCH_COMP && lastpair->comp != DYNPROG_MATCH_COMP && lastpair->comp != AMBIGUOUS_COMP)) {
+ debug(printf("Removing nonmatch at 3' end: "));
+ debug(Pair_dump_one(lastpair,/*zerobasedp*/true));
+ debug(printf("\n"));
+ path = Pairpool_pop(path,&lastpair);
+ if (path == NULL) {
+ return NULL;
+ } else {
+ lastpair = path->first;
+ }
}
}
- }
#ifdef PMAP
- while (path != NULL) {
- lastpair = path->first;
- if (lastpair->querypos % 3 == 2) {
- return path;
- } else {
- debug(printf("PMAP popping querypos %d to get to codon boundary\n",lastpair->querypos));
- path = Pairpool_pop(path,&lastpair);
+ while (path != NULL) {
+ lastpair = path->first;
+ if (lastpair->querypos % 3 == 2) {
+ return path;
+ } else {
+ debug(printf("PMAP popping querypos %d to get to codon boundary\n",lastpair->querypos));
+ path = Pairpool_pop(path,&lastpair);
+ }
}
- }
#endif
+ }
return path;
}
static List_T
-clean_pairs_end5 (List_T pairs) {
+clean_pairs_end5 (List_T pairs, int ambig_end_length_5) {
Pair_T firstpair;
- debug(printf("clean_pairs_end5\n"));
- /* Remove any remaining nonmatches, gaps, or indels at 5' end */
- if (pairs != NULL) {
- firstpair = pairs->first;
- while (firstpair->gapp || (firstpair->comp != MATCH_COMP && firstpair->comp != DYNPROG_MATCH_COMP && firstpair->comp != AMBIGUOUS_COMP)) {
- debug(printf("Removing nonmatch at 5' end: "));
- debug(Pair_dump_one(firstpair,/*zerobasedp*/true));
- debug(printf("\n"));
- pairs = Pairpool_pop(pairs,&firstpair);
- if (pairs == NULL) {
- return NULL;
- } else {
- firstpair = pairs->first;
+ if (ambig_end_length_5 == 0) {
+ debug(printf("clean_pairs_end5\n"));
+ /* Remove any remaining nonmatches, gaps, or indels at 5' end */
+ if (pairs != NULL) {
+ firstpair = pairs->first;
+ while (firstpair->gapp || (firstpair->comp != MATCH_COMP && firstpair->comp != DYNPROG_MATCH_COMP && firstpair->comp != AMBIGUOUS_COMP)) {
+ debug(printf("Removing nonmatch at 5' end: "));
+ debug(Pair_dump_one(firstpair,/*zerobasedp*/true));
+ debug(printf("\n"));
+ pairs = Pairpool_pop(pairs,&firstpair);
+ if (pairs == NULL) {
+ return NULL;
+ } else {
+ firstpair = pairs->first;
+ }
}
}
- }
#ifdef PMAP
- while (pairs != NULL) {
- firstpair = pairs->first;
- if (firstpair->querypos % 3 == 0) {
- return pairs;
- } else {
- debug(printf("PMAP popping querypos %d to get to codon boundary\n",firstpair->querypos));
- pairs = Pairpool_pop(pairs,&firstpair);
+ while (pairs != NULL) {
+ firstpair = pairs->first;
+ if (firstpair->querypos % 3 == 0) {
+ return pairs;
+ } else {
+ debug(printf("PMAP popping querypos %d to get to codon boundary\n",firstpair->querypos));
+ pairs = Pairpool_pop(pairs,&firstpair);
+ }
}
- }
#endif
+ }
return pairs;
}
@@ -3486,18 +3490,18 @@ exon_length_3 (List_T path) {
/* Also handles case where novelsplicingp == false */
/* pairs -> pairs */
static List_T
-trim_end5_exon_indels (bool *trim5p, int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
- List_T pairs, int paired_favor_mode, int zero_offset,
- int querylength, int watsonp, int cdna_direction, int maxintronlen
+trim_end5_exon_indels (bool *trim5p, int ambig_end_length, List_T pairs, int paired_favor_mode, int zero_offset,
+ int querylength, int watsonp, int cdna_direction, int maxintronlen
#ifdef WASTE
- , Pairpool_T pairpool
+ , Pairpool_T pairpool
#endif
- ) {
+ ) {
List_T path, exon, pairptr, p;
Pair_T pair, medial, indel = NULL, splice = NULL;
- int nmatches = 0, nmismatches = -1 /* because of the gap */, i;
+ int nmatches = 0, nmismatches /* = -1 because of the gap */, i;
bool nearindelp = false, nearmismatchp = false, is_canonical;
double medial_prob;
+ int nindels;
debug3(printf("Starting trim_end5_exon_indels\n"));
@@ -3505,8 +3509,19 @@ trim_end5_exon_indels (bool *trim5p, int *ambig_end_length_5, Splicetype_T *ambi
if (pairs == NULL) {
*trim5p = false;
return (List_T) NULL;
+ } else if (ambig_end_length > 0) {
+ /* Don't mess with ambiguous end */
+ *trim5p = false;
+ return pairs;
} else {
pair = pairs->first;
+ debug3(printf("querystart %d\n",pair->querypos));
+ /* Normally expect pair->querypos to be 0, and want to start with -1 because of the gap */
+ if (pair->querypos <= ambig_end_length) {
+ nmismatches = -1;
+ } else {
+ nmismatches = (pair->querypos - ambig_end_length) - 1;
+ }
}
exon = (List_T) NULL;
@@ -3532,8 +3547,10 @@ trim_end5_exon_indels (bool *trim5p, int *ambig_end_length_5, Splicetype_T *ambi
/* indel = pair; */
p = pairs;
+ nindels = 1;
while (p != NULL && ((Pair_T) p->first)->comp == INDEL_COMP) {
p = List_next(p);
+ nindels++;
}
for ( i = 0; p != NULL && i < NEARBY_INDEL; p = List_next(p), i++) {
@@ -3617,20 +3634,37 @@ trim_end5_exon_indels (bool *trim5p, int *ambig_end_length_5, Splicetype_T *ambi
*trim5p = true;
} else if (splice == NULL) {
+ debug3(printf("nindels %d\n",nindels));
if (nmatches < min_indel_end_matches) {
debug3(printf("Not enough matches %d < %d, so trimming it\n",nmatches,min_indel_end_matches));
path = (List_T) NULL;
*trim5p = true;
- } else if (nmatches - nmismatches > 2) {
- debug3(printf("More matches than mismatches, so keeping it\n"));
- path = exon; /* exon already has the indel */
- *trim5p = false;
+ } else if (nindels > 3) {
+ /* Large indel */
+ if (nmatches - nmismatches > nindels) {
+ debug3(printf("Large indel: More matches than mismatches, so keeping it\n"));
+ path = exon; /* exon already has the indel */
+ *trim5p = false;
+
+ } else {
+ debug3(printf("Large indel: Trimming it\n"));
+ path = (List_T) NULL;
+ *trim5p = true;
+ }
} else {
- debug3(printf("Trimming it\n"));
- path = (List_T) NULL;
- *trim5p = true;
+ /* Small indel */
+ if (nmatches - nmismatches > 2) {
+ debug3(printf("Small indel: More matches than mismatches, so keeping it\n"));
+ path = exon; /* exon already has the indel */
+ *trim5p = false;
+
+ } else {
+ debug3(printf("Small indel: Trimming it\n"));
+ path = (List_T) NULL;
+ *trim5p = true;
+ }
}
} else {
@@ -3683,7 +3717,7 @@ trim_end5_exon_indels (bool *trim5p, int *ambig_end_length_5, Splicetype_T *ambi
#endif
pairs = List_reverse(path);
- pairs = clean_pairs_end5(pairs);
+ pairs = clean_pairs_end5(pairs,ambig_end_length);
debug3(printf("End of trim_end5_exon_indels: length = %d\n",List_length(pairs)));
debug3(Pair_dump_list(pairs,true));
@@ -3695,8 +3729,7 @@ trim_end5_exon_indels (bool *trim5p, int *ambig_end_length_5, Splicetype_T *ambi
/* Also handles case where novelsplicingp == false */
/* path -> path */
static List_T
-trim_end3_exon_indels (bool *trim3p, int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
- List_T path, int paired_favor_mode, int zero_offset,
+trim_end3_exon_indels (bool *trim3p, int ambig_end_length, List_T path, int paired_favor_mode, int zero_offset,
int querylength, bool watsonp, int cdna_direction,
int maxintronlen
#ifdef WASTE
@@ -3705,19 +3738,30 @@ trim_end3_exon_indels (bool *trim3p, int *ambig_end_length_3, Splicetype_T *ambi
) {
List_T pairs, exon, pairptr, p;
Pair_T pair, medial, indel = NULL, splice = NULL;
- int nmatches = 0, nmismatches = -1 /* because of the gap */, i;
+ int nmatches = 0, nmismatches /* = -1 because of the gap */, i;
bool nearindelp = false, nearmismatchp = false, is_canonical;
double medial_prob;
-
+ int nindels;
debug3(printf("Starting trim_end3_exon_indels\n"));
/* Handle last exon */
if (path == NULL) {
- /* *trim3p = false; */
+ *trim3p = false;
return (List_T) NULL;
+ } else if (ambig_end_length > 0) {
+ /* Don't mess with ambiguous end */
+ *trim3p = false;
+ return path;
} else {
pair = path->first;
+ debug3(printf("queryend %d\n",pair->querypos));
+ /* Normally expect pair->querypos to be 0, and want to start with -1 because of the gap */
+ if (pair->querypos >= (querylength - 1) - ambig_end_length) {
+ nmismatches = -1;
+ } else {
+ nmismatches = (querylength - 1) - ambig_end_length - pair->querypos - 1;
+ }
}
exon = (List_T) NULL;
@@ -3743,8 +3787,10 @@ trim_end3_exon_indels (bool *trim3p, int *ambig_end_length_3, Splicetype_T *ambi
/* indel = pair; */
p = path;
+ nindels = 1;
while (p != NULL && ((Pair_T) p->first)->comp == INDEL_COMP) {
p = List_next(p);
+ nindels++;
}
for ( i = 0; p != NULL && i < NEARBY_INDEL; p = List_next(p), i++) {
@@ -3828,20 +3874,37 @@ trim_end3_exon_indels (bool *trim3p, int *ambig_end_length_3, Splicetype_T *ambi
*trim3p = true;
} else if (splice == NULL) {
+ debug3(printf("nindels %d\n",nindels));
if (nmatches < min_indel_end_matches) {
debug3(printf("Not enough matches %d < %d, so trimming it\n",nmatches,min_indel_end_matches));
pairs = (List_T) NULL;
*trim3p = true;
- } else if (nmatches - nmismatches > 2) {
- debug3(printf("More matches than mismatches, so keeping it\n"));
- pairs = exon; /* exon already has the indel */
- *trim3p = false;
+ } else if (nindels > 3) {
+ /* Large indel */
+ if (nmatches - nmismatches > nindels) {
+ debug3(printf("Large indel: More matches than mismatches, so keeping it\n"));
+ pairs = exon; /* exon already has the indel */
+ *trim3p = false;
+
+ } else {
+ debug3(printf("Large indel: Trimming it\n"));
+ pairs = (List_T) NULL;
+ *trim3p = true;
+ }
} else {
- debug3(printf("Trimming it\n"));
- pairs = (List_T) NULL;
- *trim3p = true;
+ /* Small indel */
+ if (nmatches - nmismatches > 2) {
+ debug3(printf("Small indel: More matches than mismatches, so keeping it\n"));
+ pairs = exon; /* exon already has the indel */
+ *trim3p = false;
+
+ } else {
+ debug3(printf("Small indel: Trimming it\n"));
+ pairs = (List_T) NULL;
+ *trim3p = true;
+ }
}
} else {
@@ -3893,7 +3956,7 @@ trim_end3_exon_indels (bool *trim3p, int *ambig_end_length_3, Splicetype_T *ambi
#endif
path = List_reverse(pairs);
- path = clean_path_end3(path);
+ path = clean_path_end3(path,ambig_end_length);
debug3(printf("End of trim_noncanonical_end3_exons: length = %d\n",List_length(path)));
debug3(Pair_dump_list(path,true));
@@ -4493,15 +4556,17 @@ pick_cdna_direction (int *winning_cdna_direction, int *sensedir,
int indel_alignment_score_fwd, int indel_alignment_score_rev,
#endif
int alignment_score_fwd, int alignment_score_rev, int sense_filter) {
+#if 0
int canonical_score_fwd, canonical_score_rev;
+#endif
if (pairs_fwd) {
- canonical_score_fwd = ncanonical_fwd - nbadintrons_fwd + nsemicanonical_fwd - nnoncanonical_fwd;
+ /* canonical_score_fwd = ncanonical_fwd - nbadintrons_fwd + nsemicanonical_fwd - nnoncanonical_fwd; */
debug11(printf("ncanonical_fwd %d, nbadintrons_fwd %d, nsemicanonical_fwd %d, nnoncanonical_fwd %d\n",
ncanonical_fwd,nbadintrons_fwd,nsemicanonical_fwd,nnoncanonical_fwd));
}
if (pairs_rev) {
- canonical_score_rev = ncanonical_rev - nbadintrons_rev + nsemicanonical_rev - nnoncanonical_rev;
+ /* canonical_score_rev = ncanonical_rev - nbadintrons_rev + nsemicanonical_rev - nnoncanonical_rev; */
debug11(printf("ncanonical_rev %d, nbadintrons_rev %d, nsemicanonical_rev %d, nnoncanonical_rev %d\n",
ncanonical_rev,nbadintrons_rev,nsemicanonical_rev,nnoncanonical_rev));
}
@@ -4567,16 +4632,17 @@ pick_cdna_direction (int *winning_cdna_direction, int *sensedir,
/* intronscores reveal a clear sensedir */
*winning_cdna_direction = -1;
- } else if (ncanonical_fwd > ncanonical_rev && nnoncanonical_fwd <= nnoncanonical_rev) {
+ } else if (ncanonical_fwd > 0 && ncanonical_rev == 0) {
debug11(printf("ncanonical_fwd %d && ncanonical_rev %d, so fwd wins\n",
ncanonical_fwd,ncanonical_rev));
*winning_cdna_direction = +1;
- } else if (ncanonical_rev > ncanonical_fwd && nnoncanonical_rev <= nnoncanonical_fwd) {
+ } else if (ncanonical_rev > 0 && ncanonical_fwd == 0) {
debug11(printf("ncanonical_fwd %d && ncanonical_rev %d, so rev wins\n",
ncanonical_fwd,ncanonical_rev));
*winning_cdna_direction = -1;
+#if 0
} else if (canonical_score_fwd > canonical_score_rev + 1) {
debug11(printf("canonical_score_fwd %d > canonical_score_rev %d + 1, so fwd wins\n",
canonical_score_fwd,canonical_score_rev));
@@ -4586,6 +4652,7 @@ pick_cdna_direction (int *winning_cdna_direction, int *sensedir,
debug11(printf("canonical_score_rev %d > canonical_score_fwd %d + 1, so rev wins\n",
canonical_score_rev,canonical_score_fwd));
*winning_cdna_direction = -1;
+#endif
#if 0
} else if (alignment_score_fwd > alignment_score_rev + SCORE_SIGDIFF) {
@@ -4599,16 +4666,26 @@ pick_cdna_direction (int *winning_cdna_direction, int *sensedir,
*winning_cdna_direction = -1;
#endif
- } else if (nnoncanonical_fwd < nnoncanonical_rev) {
+ } else if (nnoncanonical_fwd == 0 && nnoncanonical_rev > 0) {
debug11(printf("nnoncanonical_fwd %d < nnoncanonical_rev %d, so fwd wins\n",
nnoncanonical_fwd,nnoncanonical_rev));
*winning_cdna_direction = +1;
- } else if (nnoncanonical_rev < nnoncanonical_fwd) {
+ } else if (nnoncanonical_rev == 0 && nnoncanonical_fwd > 0) {
debug11(printf("nnoncanonical_rev %d < nnoncanonical_fwd %d, so rev wins\n",
nnoncanonical_rev,nnoncanonical_fwd));
*winning_cdna_direction = -1;
+ } else if (nbadintrons_fwd == 0 && nbadintrons_rev > 0) {
+ debug11(printf("nbadintrons_fwd %d < nbadintrons_rev %d, so fwd wins\n",
+ nbadintrons_fwd,nbadintrons_rev));
+ *winning_cdna_direction = +1;
+
+ } else if (nbadintrons_rev == 0 && nbadintrons_fwd > 0) {
+ debug11(printf("nbadintrons_rev %d < nbadintrons_fwd %d, so rev wins\n",
+ nbadintrons_rev,nbadintrons_fwd));
+ *winning_cdna_direction = -1;
+
} else if (avg_donor_score_fwd + avg_acceptor_score_fwd > avg_donor_score_rev + avg_acceptor_score_rev + PROB_SIGDIFF) {
debug11(printf("intronscores fwd %f+%f > intronscores rev %f+%f, so fwd wins\n",
avg_donor_score_fwd,avg_acceptor_score_fwd,avg_donor_score_rev,avg_acceptor_score_rev));
@@ -7801,8 +7878,8 @@ good_end_intron_p (Pair_T gappair, int cdna_direction) {
static List_T
distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_minor,
- int *finalscore, int *ambig_end_length, Splicetype_T *ambig_splicetype, List_T *pairs,
- int leftquerypos, int leftgenomepos, Pair_T rightpair,
+ int *finalscore, int *ambig_end_length, Splicetype_T *ambig_splicetype, double *ambig_prob,
+ List_T *pairs, int leftquerypos, int leftgenomepos, Pair_T rightpair,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
#ifdef GSNAP
@@ -7880,6 +7957,7 @@ distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
if (0 /* genomedp3_medialgap > genomiclength */) {
debug(printf("Not feasible to do medial gap\n"));
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
*pairs = Pairpool_transfer(*pairs,endgappairs);
*chop_exon_p = false;
@@ -7907,6 +7985,9 @@ distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
+ if (*ambig_end_length > 0) {
+ *ambig_prob = 2.0;
+ }
} else {
continuous_gappairs_medialgap = Dynprog_end5_gap(&(*dynprogindex_minor),&(*finalscore),
&nmatches,&nmismatches,&nopens,&nindels,dynprog,
@@ -7915,6 +7996,7 @@ distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
chroffset,chrhigh,cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate,/*endalign*/QUERYEND_INDELS);
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
}
continuous_goodness_medialgap = nmatches + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
@@ -7924,6 +8006,7 @@ distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
if (continuous_goodness_distalgap > continuous_goodness_medialgap) {
debug(printf("Continuous distal wins: %d > %d\n",continuous_goodness_distalgap,continuous_goodness_medialgap));
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
#if 0
debug(printf("Before transferring endgappairs, here is pairs:\n"));
@@ -7957,7 +8040,7 @@ distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
static List_T
extend_ending5 (bool *knownsplicep, int *dynprogindex_minor,
- int *finalscore, int *ambig_end_length, Splicetype_T *ambig_splicetype,
+ int *finalscore, int *ambig_end_length, Splicetype_T *ambig_splicetype, double *ambig_prob,
List_T *pairs, int leftquerypos, int leftgenomepos, Pair_T rightpair,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
@@ -8032,6 +8115,9 @@ extend_ending5 (bool *knownsplicep, int *dynprogindex_minor,
#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
+ if (*ambig_end_length > 0) {
+ *ambig_prob = 2.0;
+ }
} else {
continuous_gappairs_distalgap = Dynprog_end5_gap(&(*dynprogindex_minor),&(*finalscore),
&nmatches,&nmismatches,&nopens,&nindels,dynprog,
@@ -8040,6 +8126,7 @@ extend_ending5 (bool *knownsplicep, int *dynprogindex_minor,
chroffset,chrhigh,cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate,endalign);
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
*knownsplicep = false;
}
@@ -8070,8 +8157,8 @@ extend_ending5 (bool *knownsplicep, int *dynprogindex_minor,
static List_T
distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_minor,
- int *finalscore, int *ambig_end_length, Splicetype_T *ambig_splicetype, List_T *path,
- Pair_T leftpair, int rightquerypos, int querylength,
+ int *finalscore, int *ambig_end_length, Splicetype_T *ambig_splicetype, double *ambig_prob,
+ List_T *path, Pair_T leftpair, int rightquerypos, int querylength,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
#ifdef GSNAP
@@ -8155,6 +8242,7 @@ distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
if (genomedp5_medialgap < 0) {
debug(printf("Not feasible to do medial gap\n"));
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
*path = Pairpool_transfer(*path,endgappairs);
*chop_exon_p = false;
@@ -8181,6 +8269,9 @@ distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
+ if (*ambig_end_length > 0) {
+ *ambig_prob = 2.0;
+ }
} else {
continuous_gappairs_medialgap = Dynprog_end3_gap(&(*dynprogindex_minor),&(*finalscore),
&nmatches,&nmismatches,&nopens,&nindels,dynprog,
@@ -8189,6 +8280,7 @@ distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
chroffset,chrhigh,cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate,/*endalign*/QUERYEND_INDELS);
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
}
continuous_goodness_medialgap = nmatches + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
@@ -8198,6 +8290,7 @@ distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
if (continuous_goodness_distalgap > continuous_goodness_medialgap) {
debug(printf("Continuous distal wins: %d > %d\n",continuous_goodness_distalgap,continuous_goodness_medialgap));
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
#if 0
debug(printf("Before transferring endgappairs, here is path:\n"));
@@ -8231,7 +8324,7 @@ distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_m
static List_T
extend_ending3 (bool *knownsplicep, int *dynprogindex_minor, int *finalscore,
- int *ambig_end_length, Splicetype_T *ambig_splicetype,
+ int *ambig_end_length, Splicetype_T *ambig_splicetype, double *ambig_prob,
List_T *path, Pair_T leftpair, int rightquerypos,
int querylength, Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
@@ -8306,6 +8399,9 @@ extend_ending3 (bool *knownsplicep, int *dynprogindex_minor, int *finalscore,
#endif
cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate);
+ if (*ambig_end_length > 0) {
+ *ambig_prob = 2.0;
+ }
} else {
continuous_gappairs_distalgap = Dynprog_end3_gap(&(*dynprogindex_minor),&(*finalscore),
&nmatches,&nmismatches,&nopens,&nindels,dynprog,
@@ -8314,6 +8410,7 @@ extend_ending3 (bool *knownsplicep, int *dynprogindex_minor, int *finalscore,
chroffset,chrhigh,cdna_direction,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate,endalign);
*ambig_end_length = 0;
+ *ambig_prob = 0.0;
*knownsplicep = false;
}
@@ -8618,7 +8715,7 @@ build_dual_breaks (bool *dual_break_p, int *dynprogindex_minor, int *dynproginde
<- querydpspan ->
*/
static List_T
-build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
+build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, double *ambig_prob_3,
bool *chop_exon_p, int *dynprogindex_minor,
List_T path, Univcoord_T chroffset, Univcoord_T chrhigh, int querylength,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
@@ -8639,16 +8736,27 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
int queryjump, rightquerypos;
int finalscore;
+#if 0
if (*ambig_end_length_3 > 0) {
debug(printf("ambig_end_length_3 is %d, so returning path\n",*ambig_end_length_3));
return path;
} else {
path = clean_path_end3_gap_indels(path);
}
+#else
+ /* Always want to clean indels at end */
+ path = clean_path_end3_gap_indels(path);
+ if (*ambig_end_length_3 > 0) {
+ debug(printf("ambig_end_length_3 is %d, so returning path\n",*ambig_end_length_3));
+ return path;
+ }
+#endif
+
*knownsplicep = false;
if (path == NULL) {
*ambig_end_length_3 = 0;
+ *ambig_prob_3 = 0.0;
return (List_T) NULL;
} else {
leftpair = path->first;
@@ -8657,6 +8765,7 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
cdna_direction,leftpair->querypos,querylength,leftpair->genomepos));
if (leftpair->querypos < 0) {
*ambig_end_length_3 = 0;
+ *ambig_prob_3 = 0.0;
return (List_T) NULL;
/* abort(); */
}
@@ -8680,8 +8789,8 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
debug(printf("Running extend_ending3\n"));
*chop_exon_p = false;
gappairs = extend_ending3(&(*knownsplicep),&(*dynprogindex_minor),&finalscore,
- &(*ambig_end_length_3),&(*ambig_splicetype_3),&path,
- leftpair,rightquerypos,querylength,
+ &(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
+ &path,leftpair,rightquerypos,querylength,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
@@ -8696,7 +8805,7 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
abort();
debug(printf("Running distalmedial_ending3\n"));
gappairs = distalmedial_ending3(&(*knownsplicep),&(*chop_exon_p),&(*dynprogindex_minor),
- &finalscore,&(*ambig_end_length_3),&(*ambig_splicetype_3),
+ &finalscore,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&path,leftpair,rightquerypos,querylength,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
#ifdef GSNAP
@@ -8733,7 +8842,7 @@ build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambi
*/
static List_T
-build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
+build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, double *ambig_prob_5,
bool *chop_exon_p, int *dynprogindex_minor, List_T pairs,
Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
@@ -8754,16 +8863,27 @@ build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *amb
int finalscore;
/* int genomejump */
+#if 0
if (*ambig_end_length_5 > 0) {
debug(printf("ambig_end_length_5 is %d, so returning pairs\n",*ambig_end_length_5));
return pairs;
} else {
pairs = clean_pairs_end5_gap_indels(pairs);
}
+#else
+ /* Always want to clean indels at end */
+ pairs = clean_pairs_end5_gap_indels(pairs);
+ if (*ambig_end_length_5 > 0) {
+ debug(printf("ambig_end_length_5 is %d, so returning pairs\n",*ambig_end_length_5));
+ return pairs;
+ }
+#endif
+
*knownsplicep = false;
if (pairs == NULL) {
*ambig_end_length_5 = 0;
+ *ambig_prob_5 = 0.0;
return (List_T) NULL;
} else {
rightpair = pairs->first;
@@ -8772,6 +8892,7 @@ build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *amb
cdna_direction,-1,rightpair->querypos,-1));
if (rightpair->querypos < 0) {
*ambig_end_length_5 = 0;
+ *ambig_prob_5 = 0.0;
return (List_T) NULL;
/* abort(); */
}
@@ -8791,7 +8912,7 @@ build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *amb
debug(printf("Running extend_ending5\n"));
*chop_exon_p = false;
gappairs = extend_ending5(&(*knownsplicep),&(*dynprogindex_minor),
- &finalscore,&(*ambig_end_length_5),&(*ambig_splicetype_5),
+ &finalscore,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&pairs,leftquerypos,/*leftgenomepos*/-1,rightpair,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
#ifdef GSNAP
@@ -8807,7 +8928,7 @@ build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *amb
abort();
debug(printf("Running distalmedial_ending5\n"));
gappairs = distalmedial_ending5(&(*knownsplicep),&(*chop_exon_p),&(*dynprogindex_minor),
- &finalscore,&(*ambig_end_length_5),&(*ambig_splicetype_5),
+ &finalscore,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&pairs,leftquerypos,/*leftgenomepos*/-1,rightpair,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
#ifdef GSNAP
@@ -9444,7 +9565,7 @@ score_introns (double *max_intron_score, double *avg_donor_score, double *avg_ac
}
debug11(printf("right neighborhood: intron_matches %d, intron_denominator %d, theta %f => pvalue %g\n",
intron_matches,intron_denominator,theta,Pbinom(intron_matches,intron_denominator,theta)));
- if (Pbinom(intron_matches,intron_denominator,theta) < 0.9) { /* was 1e-3 */
+ if (Pbinom(intron_matches,intron_denominator,theta) < 0.05) { /* was 1e-3 */
/* Not a good intron */
/* *nbadintrons += 1; */
@@ -9465,7 +9586,7 @@ score_introns (double *max_intron_score, double *avg_donor_score, double *avg_ac
}
debug11(printf("left neighborhood: intron_matches %d, intron_denominator %d, theta %f => pvalue %g\n",
intron_matches,intron_denominator,theta,Pbinom(intron_matches,intron_denominator,theta)));
- if (Pbinom(intron_matches,intron_denominator,theta) < 0.9) { /* was 1e-3 */
+ if (Pbinom(intron_matches,intron_denominator,theta) < 0.05) { /* was 1e-3 */
/* Not a good intron */
/* *nbadintrons += 1; */
@@ -9518,10 +9639,10 @@ score_introns (double *max_intron_score, double *avg_donor_score, double *avg_ac
acceptor_score = Maxent_hr_antiacceptor_prob(chroffset + splicesitepos,chroffset);
}
}
- debug11(printf("donor score at %u is %f, watson %d, cdna_direction %d\n",
- leftpair->genomepos,donor_score,watsonp,cdna_direction));
- debug11(printf("acceptor score at %u is %f, watson %d, cdna_direction %d\n",
- rightpair->genomepos,acceptor_score,watsonp,cdna_direction));
+ debug11(printf("donor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
+ leftpair->genomepos,donor_score,watsonp,cdna_direction,pair->comp));
+ debug11(printf("acceptor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
+ rightpair->genomepos,acceptor_score,watsonp,cdna_direction,pair->comp));
nintrons += 1;
if (pair->knowngapp == true) {
/* Skip */
@@ -9591,10 +9712,10 @@ score_introns (double *max_intron_score, double *avg_donor_score, double *avg_ac
donor_score = Maxent_hr_donor_prob(chroffset + splicesitepos,chroffset);
}
}
- debug11(printf("donor score at %u is %f, watson %d, cdna_direction %d\n",
- leftpair->genomepos,donor_score,watsonp,cdna_direction));
- debug11(printf("acceptor score at %u is %f, watson %d, cdna_direction %d\n",
- rightpair->genomepos,acceptor_score,watsonp,cdna_direction));
+ debug11(printf("donor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
+ leftpair->genomepos,donor_score,watsonp,cdna_direction,pair->comp));
+ debug11(printf("acceptor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
+ rightpair->genomepos,acceptor_score,watsonp,cdna_direction,pair->comp));
nintrons += 1;
if (pair->knowngapp == true) {
/* Skip */
@@ -10703,13 +10824,14 @@ path_compute_dir (double *defect_rate, List_T pairs,
/* Pass 3: introns */
/* >>pairs */
- debug(printf("\n*** Pass 3: Smooth and solve dual introns iteratively. Iteration0 %d\n",iter0));
+ debug(printf("\n*** Pass 3 (dir %d): Smooth and solve dual introns iteratively. Iteration0 %d\n",
+ cdna_direction,iter0));
iter1 = 0;
shortp = true;
while ((shortp == true || deletep == true || badp == true) && iter1 < MAXITER_SMOOTH_BY_SIZE) {
/* Pass 3c: single introns */
- debug(printf("*** Pass 3c: Solve introns. Iteration0 %d, iteration1 %d\n",
- iter0,iter1));
+ debug(printf("*** Pass 3c (dir %d): Solve introns. Iteration0 %d, iteration1 %d\n",
+ cdna_direction,iter0,iter1));
iter2 = 0;
shiftp = true;
@@ -10742,7 +10864,8 @@ path_compute_dir (double *defect_rate, List_T pairs,
#if 0
/* Pass 4: Remove bad sections */
/* >>pairs */
- debug(printf("\n*** Pass 4: Remove bad sections. Iteration0 %d\n",iter0));
+ debug(printf("\n*** Pass 4 (dir %d): Remove bad sections. Iteration0 %d\n",
+ cdna_direction,iter0));
/* pairs = filter_goodness_hmm(&filterp,pairs,*defect_rate); */
pairs = filter_indels_hmm(&filterp,pairs);
/* <<pairs */
@@ -10771,7 +10894,8 @@ path_compute_dir (double *defect_rate, List_T pairs,
#endif
/* Smoothing by size: This can undo the short exons found by traverse_dual_genome, so we use protectedp in traverse_dual_genome */
- debug(printf("*** Pass 3a: Smoothing by size. Iteration0 %d, iteration1 %d\n",iter0,iter1));
+ debug(printf("*** Pass 3a (dir %d): Smoothing by size. Iteration0 %d, iteration1 %d\n",
+ cdna_direction,iter0,iter1));
path = List_reverse(pairs);
pairs = remove_indel_gaps(path);
pairs = Smooth_pairs_by_size(&shortp,&deletep,pairs,pairpool,/*stage2_indexsize*/6);
@@ -10785,7 +10909,8 @@ path_compute_dir (double *defect_rate, List_T pairs,
}
#endif
- debug(printf("*** Pass 3b: Solve dual introns. Iteration0 %d, Iteration1 %d\n",iter0,iter1));
+ debug(printf("*** Pass 3b (dir %d): Solve dual introns. Iteration0 %d, Iteration1 %d\n",
+ cdna_direction,iter0,iter1));
if (badp == false && shortp == false && deletep == false) {
debug(printf(" no shortp or deletep, so do nothing\n"));
} else {
@@ -10829,7 +10954,7 @@ path_compute_dir (double *defect_rate, List_T pairs,
/* Don't trim ends */
} else {
/* Pass 3b: trim end exons: pairs -> pairs */
- debug(printf("\n*** Pass 3b: Trim end exons\n"));
+ debug(printf("\n*** Pass 3b (dir %d): Trim end exons\n",cdna_direction));
#ifdef WASTE
pairs = chop_ends_by_changepoint(pairs,pairpool);
#else
@@ -10843,7 +10968,7 @@ path_compute_dir (double *defect_rate, List_T pairs,
debug(Pair_dump_list(path,/*zerobasedp*/true));
/* Pass 5: Fix dual breaks */
- debug(printf("\n*** Pass 5: Fix dual breaks. Iteration0 %d\n",iter0));
+ debug(printf("\n*** Pass 5 (dir %d): Fix dual breaks. Iteration0 %d\n",cdna_direction,iter0));
pairs = remove_indel_gaps(path);
path = List_reverse(pairs);
@@ -10888,7 +11013,7 @@ path_compute_dir (double *defect_rate, List_T pairs,
static List_T
-path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
+path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, double *ambig_prob_5,
double defect_rate, List_T pairs, int cdna_direction,
bool watsonp, int genestrand, bool jump_late_p, int querylength,
#ifdef GSNAP
@@ -10921,7 +11046,7 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
bool trim5p, trim3p, adjacent_indels_p;
*ambig_end_length_5 = 0;
-
+ *ambig_prob_5 = 0.0;
#if 0
/* Pass 7: Remove dual breaks at 5' end */
@@ -10954,8 +11079,8 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
/* Extend to query end, so we get an accurate count of matches and mismatches */
/* This is the first extension */
/* >>pairs */
- debug(printf("\n*** Pass 8 (dir %d): Extend to ends and determine distalmedial\n",cdna_direction));
- pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),
+ debug(printf("\n*** Pass 8 (dir %d): Extend to 5' end and determine distalmedial\n",cdna_direction));
+ pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11000,11 +11125,10 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
/* Using iter1 to avoid the possibility of an infinite loop */
iter1 = 0;
while (iter1 < 5 && trim5p == true) {
- pairs = trim_end5_exon_indels(&trim5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),
- pairs,paired_favor_mode,zero_offset,querylength,
+ pairs = trim_end5_exon_indels(&trim5p,*ambig_end_length_5,pairs,paired_favor_mode,zero_offset,querylength,
watsonp,cdna_direction,maxintronlen_bound);
if (trim5p == true) {
- pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),
+ pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11031,7 +11155,7 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
}
#if 0
- pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),
+ pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11052,7 +11176,7 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
debug(printf("Pass 11 (dir %d): Final extension, end5\n",cdna_direction));
/* This is the second extension */
/* Perform final extension without gaps so we can compare fwd and rev properly */
- pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),
+ pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11076,7 +11200,7 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5,
static List_T
-path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
+path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, double *ambig_prob_3,
double defect_rate, List_T path, int cdna_direction,
bool watsonp, int genestrand, bool jump_late_p, int querylength,
#ifdef GSNAP
@@ -11109,6 +11233,7 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
bool trim5p, trim3p, adjacent_indels_p;
*ambig_end_length_3 = 0;
+ *ambig_prob_3 = 0.0;
#if 0
/* Pass 7: Remove dual breaks at 3' end */
@@ -11141,8 +11266,8 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
/* Extend to ends */
/* This is the first extension */
/* >>path */
- debug(printf("\n*** Pass 8 (dir %d): Extend to ends and determine distalmedial\n",cdna_direction));
- path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),
+ debug(printf("\n*** Pass 8 (dir %d): Extend to 3' end and determine distalmedial\n",cdna_direction));
+ path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11186,11 +11311,10 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
/* Using iter1 to avoid the possibility of an infinite loop */
iter1 = 0;
while (iter1 < 5 && trim3p == true) {
- path = trim_end3_exon_indels(&trim3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),
- path,paired_favor_mode,zero_offset,querylength,
+ path = trim_end3_exon_indels(&trim3p,*ambig_end_length_3,path,paired_favor_mode,zero_offset,querylength,
watsonp,cdna_direction,maxintronlen_bound);
if (trim3p == true) {
- path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),
+ path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11216,7 +11340,7 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
}
#if 0
- path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),
+ path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11237,7 +11361,7 @@ path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3,
debug(printf("Pass 11 (dir %d): Final extension, end3\n",cdna_direction));
/* This is the second extension */
/* Perform final extension without gaps so we can compare fwd and rev properly */
- path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),
+ path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11349,6 +11473,7 @@ static List_T
trim_novel_spliceends (List_T pairs,
int *ambig_end_length_5, int *ambig_end_length_3,
Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
+ double *ambig_prob_5, double *ambig_prob_3,
int *cdna_direction, int *sensedir, bool watsonp, int querylength,
Univcoord_T chroffset, Univcoord_T chrhigh,
bool knownsplice5p, bool knownsplice3p) {
@@ -11528,10 +11653,11 @@ trim_novel_spliceends (List_T pairs,
while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3) {
path = Pairpool_pop(path,&pair);
}
- path = clean_path_end3(path);
+ /* path = clean_path_end3(path); -- Gives wrong end */
if (path != NULL) {
*ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
*ambig_splicetype_3 = splicetype3;
+ *ambig_prob_3 = max_prob_3;
debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
}
}
@@ -11698,10 +11824,11 @@ trim_novel_spliceends (List_T pairs,
while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5) {
pairs = Pairpool_pop(pairs,&pair);
}
- pairs = clean_pairs_end5(pairs);
+ /* pairs = clean_pairs_end5(pairs); -- gives wrong end */
if (pairs != NULL) {
*ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
*ambig_splicetype_5 = splicetype5;
+ *ambig_prob_5 = max_prob_5;
debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
}
}
@@ -11717,10 +11844,11 @@ trim_novel_spliceends (List_T pairs,
while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3) {
path = Pairpool_pop(path,&pair);
}
- path = clean_path_end3(path);
+ /* path = clean_path_end3(path); -- gives wrong end */
if (path != NULL) {
*ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
*ambig_splicetype_3 = splicetype3;
+ *ambig_prob_3 = max_prob_3;
*cdna_direction = splice_cdna_direction_3;
*sensedir = splice_sensedir_3;
debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
@@ -11734,10 +11862,11 @@ trim_novel_spliceends (List_T pairs,
while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5) {
pairs = Pairpool_pop(pairs,&pair);
}
- pairs = clean_pairs_end5(pairs);
+ /* pairs = clean_pairs_end5(pairs); -- gives wrong end */
if (pairs != NULL) {
*ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
*ambig_splicetype_5 = splicetype5;
+ *ambig_prob_5 = max_prob_5;
*cdna_direction = splice_cdna_direction_5;
*sensedir = splice_sensedir_5;
debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
@@ -11754,6 +11883,7 @@ trim_novel_spliceends (List_T pairs,
static List_T
path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
+ double *ambig_prob_5, double *ambig_prob_3,
List_T pairs, int *cdna_direction, int *sensedir, bool watsonp, bool jump_late_p,
int querylength,
#ifdef GSNAP
@@ -11786,6 +11916,7 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
if (novelsplicingp == true) {
pairs = trim_novel_spliceends(pairs,&(*ambig_end_length_5),&(*ambig_end_length_3),
&(*ambig_splicetype_5),&(*ambig_splicetype_3),
+ &(*ambig_prob_5),&(*ambig_prob_3),
&(*cdna_direction),&(*sensedir),watsonp,querylength,
chroffset,chrhigh,knownsplice5p,knownsplice3p);
}
@@ -11819,7 +11950,7 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
if (trim5p == true) {
debug3(printf("Extending at 5'\n"));
/* This is the third and final extension */
- pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),
+ pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
&chop_exon_p,&dynprogindex_minor,pairs,
chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
#ifdef GSNAP
@@ -11833,8 +11964,7 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
nullgap,extramaterial_end,extraband_end,
defect_rate,pairpool,dynprogR,
/*extendp*/true,/*endalign*/QUERYEND_NOGAPS);
- pairs = trim_end5_exon_indels(&trim5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),
- pairs,paired_favor_mode,zero_offset,querylength,
+ pairs = trim_end5_exon_indels(&trim5p,*ambig_end_length_5,pairs,paired_favor_mode,zero_offset,querylength,
watsonp,*cdna_direction,maxintronlen_bound);
}
@@ -11842,7 +11972,7 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
debug3(printf("Extending at 3'\n"));
/* This is the third and final extension */
path = List_reverse(pairs);
- path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),
+ path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
&chop_exon_p,&dynprogindex_minor,path,
chroffset,chrhigh,querylength,
knownsplice_limit_low,knownsplice_limit_high,
@@ -11857,15 +11987,14 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
nullgap,extramaterial_end,extraband_end,
defect_rate,pairpool,dynprogL,
/*extendp*/true,/*endalign*/QUERYEND_NOGAPS);
- path = trim_end3_exon_indels(&trim3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),
- path,paired_favor_mode,zero_offset,querylength,
+ path = trim_end3_exon_indels(&trim3p,*ambig_end_length_3,path,paired_favor_mode,zero_offset,querylength,
watsonp,*cdna_direction,maxintronlen_bound);
pairs = List_reverse(path);
}
/* Important to end the alignment with Pair_trim_ends, or else trimming will be faulty */
/* Also, doing trimming within each loop yields better results in a small number of cases */
- pairs = Pair_trim_ends(&trim5p_ignore,&trim3p_ignore,pairs);
+ pairs = Pair_trim_ends(&trim5p_ignore,&trim3p_ignore,pairs,*ambig_end_length_5,*ambig_end_length_3);
}
debug3(printf("After trim ends:\n"));
@@ -11887,6 +12016,7 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
int *matches, int *nmatches_posttrim, int *max_match_length,
int *ambig_end_length_5, int *ambig_end_length_3,
Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
+ double *ambig_prob_5, double *ambig_prob_3,
int *unknowns, int *mismatches, int *qopens, int *qindels, int *topens, int *tindels,
int *ncanonical, int *nsemicanonical, int *nnoncanonical, double *min_splice_prob,
Stage2_T stage2,
@@ -11918,6 +12048,7 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
Chrpos_T *last_genomedp5_fwd = NULL, *last_genomedp3_fwd = NULL, *last_genomedp5_rev = NULL, *last_genomedp3_rev = NULL;
List_T pairs_pretrim, pairs_fwd, pairs_rev, best_pairs, temp_pairs, pairs, path_fwd, path_rev, best_path, temp_path, path;
List_T copy;
+ List_T joined_ends, joined_starts;
int alignment_score_fwd, alignment_score_rev;
int ncanonical_fwd, nsemicanonical_fwd, nnoncanonical_fwd,
ncanonical_rev, nsemicanonical_rev, nnoncanonical_rev;
@@ -11927,8 +12058,9 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
avg_donor_score_rev = 0.0, avg_acceptor_score_rev = 0.0;
double defect_rate_fwd, defect_rate_rev, defect_rate_temp, defect_rate;
int nmatches_fwd, nmismatches_fwd, nmatches_rev, nmismatches_rev, nindels_fwd, nindels_rev;
- int fwd_ambig_end_length_5, fwd_ambig_end_length_3, rev_ambig_end_length_5, rev_ambig_end_length_3;
- Splicetype_T fwd_ambig_splicetype_5, fwd_ambig_splicetype_3, rev_ambig_splicetype_5, rev_ambig_splicetype_3;
+ int fwd_ambig_end_length_5 = 0, fwd_ambig_end_length_3 = 0, rev_ambig_end_length_5 = 0, rev_ambig_end_length_3 = 0, temp_ambig_end_length;
+ Splicetype_T fwd_ambig_splicetype_5, fwd_ambig_splicetype_3, rev_ambig_splicetype_5, rev_ambig_splicetype_3, temp_ambig_splicetype;
+ double fwd_ambig_prob_5, fwd_ambig_prob_3, rev_ambig_prob_5, rev_ambig_prob_3, temp_ambig_prob;
#ifdef COMPLEX_DIRECTION
int indel_alignment_score_fwd, indel_alignment_score_rev;
@@ -12072,6 +12204,8 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
/* 2. 3' and 5' ends (possibly multiple) */
+ debug(printf("Stage2 has %d starts and %d ends\n",]
+ List_length(Stage2_all_starts(stage2)),List_length(Stage2_all_ends(stage2))));
if (path_fwd == NULL) {
pairs_fwd = (List_T) NULL;
#ifdef DEBUG8
@@ -12080,9 +12214,10 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
#endif
} else {
/* 3' end */
- if ((p = Stage2_all_ends(stage2)) == NULL) {
- best_path = path_compute_end3(&fwd_ambig_end_length_3,&fwd_ambig_splicetype_3,defect_rate_fwd,path_fwd,
- /*cdna_direction*/+1,watsonp,genestrand,jump_late_p,querylength,
+ if (Stage2_all_ends(stage2) == NULL) {
+ best_path = path_compute_end3(&fwd_ambig_end_length_3,&fwd_ambig_splicetype_3,&fwd_ambig_prob_3,
+ defect_rate_fwd,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
+ jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
cutoff_level,queryptr,query_compress,
@@ -12098,9 +12233,10 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
maxintronlen,close_indels_mode,paired_favor_mode,zero_offset);
} else {
best_path = path_fwd;
- for ( ; p != NULL; p = List_next(p)) {
+ joined_ends = (List_T) NULL;
+ for (p = Stage2_all_ends(stage2); p != NULL; p = List_next(p)) {
#ifdef PMAP
- copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
+ copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
#else
if (path_rev == NULL) {
/* Won't need ends anymore */
@@ -12109,6 +12245,11 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/true);
}
#endif
+ joined_ends = List_push(joined_ends,(void *) copy);
+ }
+
+ for (p = joined_ends; p != NULL; p = List_next(p)) {
+ copy = (List_T) List_head(p);
path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/+1,
watsonp,genestrand,jump_late_p,
#ifdef GSNAP
@@ -12127,8 +12268,9 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
sufflookback,nsufflookback,maxintronlen,close_indels_mode,
paired_favor_mode,zero_offset);
- temp_path = path_compute_end3(&fwd_ambig_end_length_3,&fwd_ambig_splicetype_3,defect_rate_temp,path_fwd,
- /*cdna_direction*/+1,watsonp,genestrand,jump_late_p,querylength,
+ temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
+ defect_rate_temp,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
+ jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
cutoff_level,queryptr,query_compress,
@@ -12146,19 +12288,25 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
if (end_compare(best_path,temp_path,/*cdna_direction*/+1,watsonp,
chrnum,chroffset,chrhigh,nullgap,/*pairsp*/false) > 0) {
best_path = temp_path;
+ fwd_ambig_end_length_3 = temp_ambig_end_length;
+ fwd_ambig_splicetype_3 = temp_ambig_splicetype;
+ fwd_ambig_prob_3 = temp_ambig_prob;
defect_rate_fwd = defect_rate_temp;
debug21(printf("New best path:\n"));
debug21(Pair_dump_list(best_path,true));
}
}
+
+ List_free(&joined_ends);
}
/* 5' end */
pairs_fwd = List_reverse(best_path);
- if ((p = Stage2_all_starts(stage2)) == NULL) {
- best_pairs = path_compute_end5(&fwd_ambig_end_length_5,&fwd_ambig_splicetype_5,defect_rate_fwd,pairs_fwd,
- /*cdna_direction*/+1,watsonp,genestrand,jump_late_p,querylength,
+ if (Stage2_all_starts(stage2) == NULL) {
+ best_pairs = path_compute_end5(&fwd_ambig_end_length_5,&fwd_ambig_splicetype_5,&fwd_ambig_prob_5,
+ defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,watsonp,genestrand,
+ jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
cutoff_level,queryptr,query_compress,
@@ -12174,9 +12322,10 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
maxintronlen,close_indels_mode,paired_favor_mode,zero_offset);
} else {
best_pairs = pairs_fwd;
- for ( ; p != NULL; p = List_next(p)) {
+ joined_starts = (List_T) NULL;
+ for (p = Stage2_all_starts(stage2); p != NULL; p = List_next(p)) {
#ifdef PMAP
- copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
+ copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
#else
if (path_rev == NULL) {
/* Won't need ends anymore */
@@ -12185,6 +12334,11 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/true);
}
#endif
+ joined_starts = List_push(joined_starts,(void *) copy);
+ }
+
+ for (p = joined_starts; p != NULL; p = List_next(p)) {
+ copy = (List_T) List_head(p);
path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/+1,
watsonp,genestrand,jump_late_p,
#ifdef GSNAP
@@ -12203,8 +12357,8 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
sufflookback,nsufflookback,maxintronlen,close_indels_mode,
paired_favor_mode,zero_offset);
- temp_pairs = path_compute_end5(&fwd_ambig_end_length_5,&fwd_ambig_splicetype_5,defect_rate_temp,
- /*pairs*/List_reverse(path_fwd),
+ temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
+ defect_rate_temp,/*pairs*/List_reverse(path_fwd),
/*cdna_direction*/+1,watsonp,genestrand,jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
@@ -12222,11 +12376,16 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
if (end_compare(best_pairs,temp_pairs,/*cdna_direction*/+1,watsonp,
chrnum,chroffset,chrhigh,nullgap,/*pairsp*/true) > 0) {
best_pairs = temp_pairs;
+ fwd_ambig_end_length_5 = temp_ambig_end_length;
+ fwd_ambig_splicetype_5 = temp_ambig_splicetype;
+ fwd_ambig_prob_5 = temp_ambig_prob;
defect_rate_fwd = defect_rate_temp;
debug21(printf("New best pairs:\n"));
debug21(Pair_dump_list(best_pairs,true));
}
}
+
+ List_free(&joined_starts);
}
pairs_fwd = best_pairs;
@@ -12242,9 +12401,10 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
#endif
} else {
/* 3' end */
- if ((p = Stage2_all_ends(stage2)) == NULL) {
- best_path = path_compute_end3(&rev_ambig_end_length_3,&rev_ambig_splicetype_3,defect_rate_rev,path_rev,
- /*cdna_direction*/-1,watsonp,genestrand,jump_late_p,querylength,
+ if (Stage2_all_ends(stage2) == NULL) {
+ best_path = path_compute_end3(&rev_ambig_end_length_3,&rev_ambig_splicetype_3,&rev_ambig_prob_3,
+ defect_rate_rev,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
+ jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
cutoff_level,queryptr,query_compress,
@@ -12261,8 +12421,14 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
} else {
best_path = path_rev;
- for ( ; p != NULL; p = List_next(p)) {
+ joined_ends = (List_T) NULL;
+ for (p = Stage2_all_ends(stage2); p != NULL; p = List_next(p)) {
copy = Pairpool_join_end3(/*path*/path_rev,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
+ joined_ends = List_push(joined_ends,(void *) copy);
+ }
+
+ for (p = joined_ends; p != NULL; p = List_next(p)) {
+ copy = (List_T) List_head(p);
path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/-1,
watsonp,genestrand,jump_late_p,
#ifdef GSNAP
@@ -12281,8 +12447,9 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
sufflookback,nsufflookback,maxintronlen,close_indels_mode,
paired_favor_mode,zero_offset);
- temp_path = path_compute_end3(&rev_ambig_end_length_3,&rev_ambig_splicetype_3,defect_rate_temp,path_rev,
- /*cdna_direction*/-1,watsonp,genestrand,jump_late_p,querylength,
+ temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
+ defect_rate_temp,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
+ jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
cutoff_level,queryptr,query_compress,
@@ -12300,18 +12467,24 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
if (end_compare(best_path,temp_path,/*cdna_direction*/-1,watsonp,
chrnum,chroffset,chrhigh,nullgap,/*pairsp*/false) > 0) {
best_path = temp_path;
+ rev_ambig_end_length_3 = temp_ambig_end_length;
+ rev_ambig_splicetype_3 = temp_ambig_splicetype;
+ rev_ambig_prob_3 = temp_ambig_prob;
defect_rate_rev = defect_rate_temp;
debug21(printf("New best path:\n"));
debug21(Pair_dump_list(best_path,true));
}
}
+
+ List_free(&joined_ends);
}
/* 5' end */
pairs_rev = List_reverse(best_path);
- if ((p = Stage2_all_starts(stage2)) == NULL) {
- best_pairs = path_compute_end5(&rev_ambig_end_length_5,&rev_ambig_splicetype_5,defect_rate_rev,pairs_rev,
- /*cdna_direction*/-1,watsonp,genestrand,jump_late_p,querylength,
+ if (Stage2_all_starts(stage2) == NULL) {
+ best_pairs = path_compute_end5(&rev_ambig_end_length_5,&rev_ambig_splicetype_5,&rev_ambig_prob_5,
+ defect_rate_rev,pairs_rev,/*cdna_direction*/-1,watsonp,genestrand,
+ jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
cutoff_level,queryptr,query_compress,
@@ -12328,8 +12501,14 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
} else {
best_pairs = pairs_rev;
- for ( ; p != NULL; p = List_next(p)) {
+ joined_starts = (List_T) NULL;
+ for (p = Stage2_all_starts(stage2); p != NULL; p = List_next(p)) {
copy = Pairpool_join_end5(/*pairs*/pairs_rev,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
+ joined_starts = List_push(joined_starts,(void *) copy);
+ }
+
+ for (p = joined_starts; p != NULL; p = List_next(p)) {
+ copy = (List_T) List_head(p);
path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/-1,
watsonp,genestrand,jump_late_p,
#ifdef GSNAP
@@ -12348,8 +12527,8 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
sufflookback,nsufflookback,maxintronlen,close_indels_mode,
paired_favor_mode,zero_offset);
- temp_pairs = path_compute_end5(&rev_ambig_end_length_5,&rev_ambig_splicetype_5,defect_rate_temp,
- /*pairs*/List_reverse(path_rev),
+ temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
+ defect_rate_temp,/*pairs*/List_reverse(path_rev),
/*cdna_direction*/-1,watsonp,genestrand,jump_late_p,querylength,
#ifdef GSNAP
#ifdef END_KNOWNSPLICING_SHORTCUT
@@ -12367,11 +12546,16 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
if (end_compare(best_pairs,temp_pairs,/*cdna_direction*/-1,watsonp,
chrnum,chroffset,chrhigh,nullgap,/*pairsp*/true) > 0) {
best_pairs = temp_pairs;
+ rev_ambig_end_length_5 = temp_ambig_end_length;
+ rev_ambig_splicetype_5 = temp_ambig_splicetype;
+ rev_ambig_prob_5 = temp_ambig_prob;
defect_rate_rev = defect_rate_temp;
debug21(printf("New best pairs:\n"));
debug21(Pair_dump_list(best_pairs,true));
}
}
+
+ List_free(&joined_starts);
}
pairs_rev = best_pairs;
@@ -12533,6 +12717,7 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
*npairs = 0;
*nmatches_posttrim = 0;
*ambig_end_length_5 = *ambig_end_length_3 = 0;
+ *ambig_prob_5 = *ambig_prob_3 = 0.0;
return (struct Pair_T *) NULL;
} else {
if (*cdna_direction >= 0) {
@@ -12540,12 +12725,16 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
*ambig_end_length_3 = fwd_ambig_end_length_3;
*ambig_splicetype_5 = fwd_ambig_splicetype_5;
*ambig_splicetype_3 = fwd_ambig_splicetype_3;
+ *ambig_prob_5 = fwd_ambig_prob_5;
+ *ambig_prob_3 = fwd_ambig_prob_3;
defect_rate = defect_rate_fwd;
} else {
*ambig_end_length_5 = rev_ambig_end_length_5;
*ambig_end_length_3 = rev_ambig_end_length_3;
*ambig_splicetype_5 = rev_ambig_splicetype_5;
*ambig_splicetype_3 = rev_ambig_splicetype_3;
+ *ambig_prob_5 = rev_ambig_prob_5;
+ *ambig_prob_3 = rev_ambig_prob_3;
defect_rate = defect_rate_rev;
}
@@ -12556,6 +12745,7 @@ Stage3_compute (List_T *finalpairs, int *npairs, int *cdna_direction, int *sense
#endif
*finalpairs = path_trim(defect_rate,&(*ambig_end_length_5),&(*ambig_end_length_3),
&(*ambig_splicetype_5),&(*ambig_splicetype_3),
+ &(*ambig_prob_5),&(*ambig_prob_3),
pairs_pretrim,&(*cdna_direction),&(*sensedir),watsonp,
jump_late_p,querylength,
#ifdef GSNAP
@@ -12724,6 +12914,7 @@ Stage3_merge_chimera (T this_left, T this_right,
List_T path;
bool knownsplicep, chop_exon_p;
int ambig_end_length_5 = 0, ambig_end_length_3 = 0; /* Need to be set for build_pairs_end5 and build_path_end3 */
+ double ambig_prob_5, ambig_prob_3;
int dynprogindex_minor = 0;
Splicetype_T ambig_splicetype;
@@ -12746,7 +12937,7 @@ Stage3_merge_chimera (T this_left, T this_right,
/* To avoid indels at chimeric join, need to clean ends, extend with nogaps, and then clip*/
path = clean_path_end3_gap_indels(path);
- path = build_path_end3(&knownsplicep,&ambig_end_length_3,&ambig_splicetype,
+ path = build_path_end3(&knownsplicep,&ambig_end_length_3,&ambig_splicetype,&ambig_prob_3,
&chop_exon_p,&dynprogindex_minor,path,
this_left->chroffset,this_left->chrhigh,/*querylength*/maxpos1+1,
/*knownsplice_limit_low*/-1U,/*knownsplice_limit_high*/0,
@@ -12764,7 +12955,7 @@ Stage3_merge_chimera (T this_left, T this_right,
/* To avoid indels at chimeric join, need to clean ends, extend with nogaps, and then clip*/
this_right->pairs = clean_pairs_end5_gap_indels(this_right->pairs);
- this_right->pairs = build_pairs_end5(&knownsplicep,&ambig_end_length_5,&ambig_splicetype,
+ this_right->pairs = build_pairs_end5(&knownsplicep,&ambig_end_length_5,&ambig_splicetype,&ambig_prob_5,
&chop_exon_p,&dynprogindex_minor,this_right->pairs,
this_right->chroffset,this_right->chrhigh,
/*knownsplice_limit_low*/-1U,/*knownsplice_limit_high*/0,
diff --git a/src/stage3.h b/src/stage3.h
index a2fe52f..cc89a8b 100644
--- a/src/stage3.h
+++ b/src/stage3.h
@@ -1,4 +1,4 @@
-/* $Id: stage3.h 135447 2014-05-07 22:25:45Z twu $ */
+/* $Id: stage3.h 141665 2014-07-16 01:36:38Z twu $ */
#ifndef STAGE3_INCLUDED
#define STAGE3_INCLUDED
@@ -279,6 +279,7 @@ Stage3_compute (List_T *pairs, int *npairs, int *cdna_direction, int *sensedir,
int *matches, int *nmatches_posttrim, int *max_match_length,
int *ambig_end_length_5, int *ambig_end_length_3,
Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
+ double *ambig_prob_5, double *ambig_prob_3,
int *unknowns, int *mismatches, int *qopens, int *qindels, int *topens, int *tindels,
int *ncanonical, int *nsemicanonical, int *nnoncanonical, double *min_splice_prob,
Stage2_T stage2,
diff --git a/src/stage3hr.c b/src/stage3hr.c
index 4b2ef6f..aa80fe1 100644
--- a/src/stage3hr.c
+++ b/src/stage3hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3hr.c 140368 2014-07-02 00:56:33Z twu $";
+static char rcsid[] = "$Id: stage3hr.c 142100 2014-07-22 03:12:43Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -39,6 +39,7 @@ static char rcsid[] = "$Id: stage3hr.c 140368 2014-07-02 00:56:33Z twu $";
/* #define TERMINAL_SECOND_CLASS 1 -- enabling this leads to poor results */
#define TERMINAL_COMPUTE_MINLENGTH 40
+#define SCORE_INDELS 1 /* Needed to compare genomic positions with and without indels */
#ifdef DEBUG
@@ -461,8 +462,7 @@ struct T {
bool trim_right_splicep;
int penalties; /* Indel penalties */
- int score_eventrim_start; /* Temporary storage used by Stage3end_optimal_score */
- int score_eventrim_end; /* Temporary storage used by Stage3end_optimal_score */
+ int score_eventrim; /* Temporary storage used by Stage3end_optimal_score */
Overlap_T gene_overlap;
long int tally;
@@ -495,6 +495,8 @@ struct T {
int end_amb_nmatches; /* For splice, shortexon, and GMAP */
int amb_nmatches_donor; /* For shortexon only */
int amb_nmatches_acceptor; /* For shortexon only */
+ double start_amb_prob; /* For GMAP currently */
+ double end_amb_prob; /* For GMAP currently */
Endtype_T gmap_start_endtype; /* For GMAP, which has no substrings */
Endtype_T gmap_end_endtype; /* For GMAP, which has no substrings */
@@ -2859,8 +2861,7 @@ Stage3end_copy (T old) {
new->trim_right_splicep = old->trim_right_splicep;
new->penalties = old->penalties;
- new->score_eventrim_start = old->score_eventrim_start;
- new->score_eventrim_end = old->score_eventrim_end;
+ new->score_eventrim = old->score_eventrim;
new->gene_overlap = old->gene_overlap;
new->tally = old->tally;
@@ -2898,6 +2899,8 @@ Stage3end_copy (T old) {
new->start_amb_nmatches = old->start_amb_nmatches;
new->end_amb_nmatches = old->end_amb_nmatches;
+ new->start_amb_prob = old->start_amb_prob;
+ new->end_amb_prob = old->end_amb_prob;
new->amb_nmatches_donor = old->amb_nmatches_donor;
new->amb_nmatches_acceptor = old->amb_nmatches_acceptor;
@@ -3080,8 +3083,7 @@ static int
compute_circularpos (int *alias, T hit) {
int circularpos;
- debug12(printf("Computing circularpos on hit at %u..%u (%u..%u) with trim left %d and trim right %d\n",
- hit->genomicstart,hit->genomicend,
+ debug12(printf("Computing circularpos on hit at %u..%u with trim left %d and trim right %d\n",
hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,hit->trim_left,hit->trim_right));
if (circularp[hit->chrnum] == false) {
/* This also handles hit->chrnum == 0, where translocation cannot be circular */
@@ -3254,6 +3256,7 @@ Stage3end_new_exact (int *found_score, Univcoord_T left, int genomiclength, Comp
*found_score = 0;
new->start_amb_nmatches = new->end_amb_nmatches = 0;
+ new->start_amb_prob = new->end_amb_prob = 0.0;
new->amb_nmatches_donor = new->amb_nmatches_acceptor = 0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
@@ -3403,6 +3406,7 @@ Stage3end_new_substitution (int *found_score, int nmismatches_whole, Univcoord_T
}
new->start_amb_nmatches = new->end_amb_nmatches = 0;
+ new->start_amb_prob = new->end_amb_prob = 0.0;
new->amb_nmatches_donor = new->amb_nmatches_acceptor = 0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
@@ -3594,6 +3598,7 @@ Stage3end_new_insertion (int *found_score, int nindels, int indel_pos, int nmism
}
new->start_amb_nmatches = new->end_amb_nmatches = 0;
+ new->start_amb_prob = new->end_amb_prob = 0.0;
new->amb_nmatches_donor = new->amb_nmatches_acceptor = 0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
@@ -3813,6 +3818,7 @@ Stage3end_new_deletion (int *found_score, int nindels, int indel_pos, int nmisma
}
new->start_amb_nmatches = new->end_amb_nmatches = 0;
+ new->start_amb_prob = new->end_amb_prob = 0.0;
new->amb_nmatches_donor = new->amb_nmatches_acceptor = 0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
@@ -4044,6 +4050,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = true;
new->start_amb_nmatches = amb_nmatches;
+ new->start_amb_prob = 5.0;
new->start_ambcoords = new->ambcoords_donor;
new->start_nambcoords = new->nambcoords_donor;
new->start_amb_knowni = new->amb_knowni_donor;
@@ -4051,6 +4058,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->end_ambiguous_p = false;
new->end_amb_nmatches = 0;
+ new->end_amb_prob = 5.0;
new->end_ambcoords = NULL;
new->end_nambcoords = 0;
new->end_amb_knowni = NULL;
@@ -4062,6 +4070,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->end_ambiguous_p = true;
new->end_amb_nmatches = amb_nmatches;
+ new->end_amb_prob = 5.0;
new->end_ambcoords = new->ambcoords_acceptor;
new->end_nambcoords = new->nambcoords_acceptor;
new->end_amb_knowni = new->amb_knowni_acceptor;
@@ -4069,6 +4078,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = false;
new->start_amb_nmatches = 0;
+ new->start_amb_prob = 5.0;
new->start_ambcoords = NULL;
new->start_nambcoords = 0;
new->start_amb_knowni = NULL;
@@ -4080,6 +4090,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = new->end_ambiguous_p = false;
new->start_amb_nmatches = new->end_amb_nmatches = 0;
+ new->start_amb_prob = new->end_amb_prob = 5.0;
new->start_ambcoords = new->end_ambcoords = NULL;
new->start_nambcoords = new->end_nambcoords = 0;
new->start_amb_knowni = new->end_amb_knowni = NULL;
@@ -4093,6 +4104,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->end_ambiguous_p = true;
new->end_amb_nmatches = amb_nmatches;
+ new->end_amb_prob = 5.0;
new->end_ambcoords = new->ambcoords_donor;
new->end_nambcoords = new->nambcoords_donor;
new->end_amb_knowni = new->amb_knowni_donor;
@@ -4100,6 +4112,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = false;
new->start_amb_nmatches = 0;
+ new->start_amb_prob = 5.0;
new->start_ambcoords = NULL;
new->start_nambcoords = 0;
new->start_amb_knowni = NULL;
@@ -4111,6 +4124,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->start_ambiguous_p = true;
new->start_amb_nmatches = amb_nmatches;
+ new->start_amb_prob = 5.0;
new->start_ambcoords = new->ambcoords_acceptor;
new->start_nambcoords = new->nambcoords_acceptor;
new->start_amb_knowni = new->amb_knowni_acceptor;
@@ -4118,6 +4132,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->end_ambiguous_p = false;
new->end_amb_nmatches = 0;
+ new->end_amb_prob = 5.0;
new->end_ambcoords = NULL;
new->end_nambcoords = 0;
new->end_amb_knowni = NULL;
@@ -4128,6 +4143,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
new->genomicend = Substring_genomicend(donor);
new->start_amb_nmatches = new->end_amb_nmatches = 0;
+ new->start_amb_prob = new->end_amb_prob = 5.0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
new->start_ambcoords = new->end_ambcoords = NULL;
new->start_nambcoords = new->end_nambcoords = 0;
@@ -4147,7 +4163,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
}
debug0(printf(" hittype is %s, genomicpos %u..%u\n",
- hittype_string(new->hittype),new->genomicstart,new->genomicend));
+ hittype_string(new->hittype),new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
debug0(printf("start_ambiguous_p %d (%d starts), end_ambiguous_p %d (%d ends)\n",
new->start_ambiguous_p,new->start_nambcoords,new->end_ambiguous_p,new->end_nambcoords));
debug0(printf("start_amb_nmatches %d, end_amb_nmatches %d\n",new->start_amb_nmatches,new->end_amb_nmatches));
@@ -4375,7 +4391,7 @@ Stage3end_new_splice (int *found_score, int nmismatches_donor, int nmismatches_a
assert(new->substring1 != NULL);
debug0(printf("Returning new splice %p at genomic %u..%u, donor %p (%u => %u), acceptor %p (%u => %u)\n",
- new,new->genomicstart,new->genomicend,new->substring_donor,
+ new,new->genomicstart - new->chroffset,new->genomicend - new->chroffset,new->substring_donor,
new->substring_donor == NULL ? 0 : Substring_left_genomicseg(new->substring_donor),
new->substring_donor == NULL ? 0 : Substring_splicecoord(new->substring_donor),
new->substring_acceptor,new->substring_acceptor == NULL ? 0 : Substring_left_genomicseg(new->substring_acceptor),
@@ -4514,12 +4530,14 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->genomicend = (acceptor != NULL ? Substring_genomicend(acceptor) : Substring_genomicend(shortexon));
new->start_amb_nmatches = new->amb_nmatches_donor;
+ new->start_amb_prob = 5.0;
new->start_ambcoords = new->ambcoords_donor;
new->start_nambcoords = new->nambcoords_donor;
new->start_amb_knowni = new->amb_knowni_donor;
new->start_amb_nmismatches = new->amb_nmismatches_donor;
new->end_amb_nmatches = new->amb_nmatches_acceptor;
+ new->end_amb_prob = 5.0;
new->end_ambcoords = new->ambcoords_acceptor;
new->end_nambcoords = new->nambcoords_acceptor;
new->end_amb_knowni = new->amb_knowni_acceptor;
@@ -4533,12 +4551,14 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
new->genomicend = (donor != NULL ? Substring_genomicend(donor) : Substring_genomicend(shortexon));
new->start_amb_nmatches = new->amb_nmatches_acceptor;
+ new->start_amb_prob = 5.0;
new->start_ambcoords = new->ambcoords_acceptor;
new->start_nambcoords = new->nambcoords_acceptor;
new->start_amb_knowni = new->amb_knowni_acceptor;
new->start_amb_nmismatches = new->amb_nmismatches_acceptor;
new->end_amb_nmatches = new->amb_nmatches_donor;
+ new->end_amb_prob = 5.0;
new->end_ambcoords = new->ambcoords_donor;
new->end_nambcoords = new->nambcoords_donor;
new->end_amb_knowni = new->amb_knowni_donor;
@@ -4560,7 +4580,7 @@ Stage3end_new_shortexon (int *found_score, Substring_T donor, Substring_T accept
}
debug0(printf(" hittype is %s, genomicpos %u..%u\n",
- hittype_string(new->hittype),new->genomicstart,new->genomicend));
+ hittype_string(new->hittype),new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
debug0(printf("start_ambiguous_p %d, end_ambiguous_p %d\n",new->start_ambiguous_p,new->end_ambiguous_p));
debug0(printf("start_amb_nmatches %d, end_amb_nmatches %d\n",new->start_amb_nmatches,new->end_amb_nmatches));
@@ -4897,6 +4917,7 @@ Stage3end_new_terminal (int querystart, int queryend, Univcoord_T left, Compress
new->tally = -1L;
new->start_amb_nmatches = new->end_amb_nmatches = 0;
+ new->start_amb_prob = new->end_amb_prob = 5.0;
new->amb_nmatches_donor = new->amb_nmatches_acceptor = 0;
new->start_ambiguous_p = new->end_ambiguous_p = false;
@@ -4929,8 +4950,8 @@ T
Stage3end_new_gmap (int nmismatches_whole, int nmatches_posttrim, int max_match_length,
int ambig_end_length_5, int ambig_end_length_3,
Splicetype_T ambig_splicetype_5, Splicetype_T ambig_splicetype_3,
- double min_splice_prob, struct Pair_T *pairarray, int npairs,
- int nsegments, int nintrons, int nindelbreaks,
+ double ambig_prob_5, double ambig_prob_3, double min_splice_prob,
+ struct Pair_T *pairarray, int npairs, int nsegments, int nintrons, int nindelbreaks,
Univcoord_T left, int genomiclength, bool plusp, int genestrand, bool first_read_p,
int querylength, Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
int cdna_direction, int sensedir) {
@@ -4975,9 +4996,9 @@ ATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAG
}
new = (T) MALLOC_OUT(sizeof(*new));
- debug0(printf("Stage3end_new_gmap %p: left %lu, genomicstart/end %lu..%lu, chrhigh %lu, chrnum %d, nmismatches %d, cdna_direction %d, sensedir %d, max_match_length %d\n",
- new,left,genomicstart,genomicend,chrhigh,chrnum,nmismatches_whole,cdna_direction,sensedir,max_match_length));
- debug0(printf(" ambig_end_length_5 %d, ambig_end_length_3 %d\n",ambig_end_length_5,ambig_end_length_3));
+ debug0(printf("Stage3end_new_gmap %p: left %lu, genomicstart/end %u..%u, chrhigh %lu, chrnum %d, nmismatches %d, cdna_direction %d, sensedir %d, max_match_length %d\n",
+ new,left,genomicstart - chroffset,genomicend - chroffset,chrhigh,chrnum,nmismatches_whole,cdna_direction,sensedir,max_match_length));
+ debug0(printf(" ambig_end_length_5 %d (prob %f), ambig_end_length_3 %d (prob %f)\n",ambig_end_length_5,ambig_prob_5,ambig_end_length_3,ambig_prob_3));
new->substring1 = (Substring_T) NULL;
new->substring2 = (Substring_T) NULL;
@@ -5060,6 +5081,7 @@ ATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAG
/* new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(new->substring1); */
/* new->nmismatches_refdiff = Substring_nmismatches_refdiff(new->substring1); */
+ /* Adding ambig_end_lengths to nmatches_posttrim would unnecessarily favor long ambig ends when comparing GMAP results */
new->nmatches = nmatches_posttrim; /* To make addition of ambiguous lengths work, we need to use posttrim, not pretrim */
new->nmatches_posttrim = nmatches_posttrim;
if (favor_ambiguous_p == true) {
@@ -5150,6 +5172,7 @@ ATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAG
ambig_splicetype_5,ambig_end_length_5);
abort();
}
+ new->start_amb_prob = ambig_prob_5;
if ((new->end_amb_nmatches = ambig_end_length_3) == 0) {
new->gmap_end_endtype = END;
@@ -5162,6 +5185,7 @@ ATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAG
ambig_splicetype_3,ambig_end_length_3);
abort();
}
+ new->end_amb_prob = ambig_prob_3;
new->amb_nmatches_donor = new->amb_nmatches_acceptor = 0;
@@ -5872,13 +5896,12 @@ Stage3end_optimal_score_aux (bool *eliminatedp, List_T hitlist, int cutoff_level
List_T optimal = NULL, p;
T hit;
int n;
- int minscore_start = MAX_READLENGTH, minscore_end = MAX_READLENGTH;
+ int minscore = MAX_READLENGTH;
int max_nmatches = 0, max_nmatches_posttrim = 0;
int trim_left, trim_right;
int min_trim_left = MAX_READLENGTH, min_trim_right = MAX_READLENGTH;
int max_trim_left_terminal = 0, max_trim_right_terminal = 0;
int nindelbreaks;
- int cutoff_level_start, cutoff_level_end;
#ifdef TRANSLOC_SPECIAL
bool non_translocation_p = false;
@@ -5976,9 +5999,9 @@ Stage3end_optimal_score_aux (bool *eliminatedp, List_T hitlist, int cutoff_level
if (hit->hittype == TERMINAL && finalp == false) {
/* Ignore */
- hit->score_eventrim_start = hit->score_eventrim_end = 0;
+ hit->score_eventrim = 0;
} else if (hit->hittype == GMAP) {
- hit->score_eventrim_start = hit->score_eventrim_end = 0; /* was hit->penalties */
+ hit->score_eventrim = 0; /* was hit->penalties */
debug4(printf("score GMAP:"));
#if 0
if (Stage3end_bad_stretch_p(hit,query_compress_fwd,query_compress_rev) == true) {
@@ -6002,67 +6025,53 @@ Stage3end_optimal_score_aux (bool *eliminatedp, List_T hitlist, int cutoff_level
}
#endif
- hit->score_eventrim_start += Pair_nmismatches_region(&nindelbreaks,hit->pairarray,hit->npairs,
- trim_left,/*trim_end*/hit->trim_right,hit->querylength_adj);
- hit->score_eventrim_end += Pair_nmismatches_region(&nindelbreaks,hit->pairarray,hit->npairs,
- /*trim_start*/hit->trim_left,trim_right,hit->querylength_adj);
- debug4(printf(" add nmismatches %d or %d.",
- Pair_nmismatches_region(&nindelbreaks,hit->pairarray,hit->npairs,
- trim_left,/*trim_end*/hit->trim_right,hit->querylength_adj),
- Pair_nmismatches_region(&nindelbreaks,hit->pairarray,hit->npairs,
- /*trim_start*/hit->trim_left,trim_right,hit->querylength_adj)));
- hit->score_eventrim_start += indel_penalty_middle * nindelbreaks;
- hit->score_eventrim_end += indel_penalty_middle * nindelbreaks;
-
- hit->score_eventrim_start += hit->start_amb_nmatches / ambig_end_interval;
- debug4(printf(" add amb start %d/%d.",hit->start_amb_nmatches,ambig_end_interval));
- hit->score_eventrim_end += hit->end_amb_nmatches / ambig_end_interval;
- debug4(printf(" add amb end %d/%d.",hit->end_amb_nmatches,ambig_end_interval));
- debug4(printf(" RESULT: %d or %d\n",hit->score_eventrim_start,hit->score_eventrim_end));
+ hit->score_eventrim += Pair_nmismatches_region(&nindelbreaks,hit->pairarray,hit->npairs,
+ trim_left,trim_right,hit->start_amb_nmatches,hit->end_amb_nmatches,
+ hit->querylength_adj);
+ debug4(printf(" add nmismatches %d.",Pair_nmismatches_region(&nindelbreaks,hit->pairarray,hit->npairs,
+ trim_left,trim_right,hit->start_amb_nmatches,hit->end_amb_nmatches,
+ hit->querylength_adj)));
+#ifdef SCORE_INDELS
+ hit->score_eventrim += indel_penalty_middle * nindelbreaks;
+#endif
+ if (hit->start_amb_prob < 0.9) {
+ hit->score_eventrim += hit->start_amb_nmatches / ambig_end_interval;
+ debug4(printf(" add amb start %d/%d.",hit->start_amb_nmatches,ambig_end_interval));
+ }
+ if (hit->end_amb_prob < 0.9) {
+ hit->score_eventrim += hit->end_amb_nmatches / ambig_end_interval;
+ debug4(printf(" add amb end %d/%d.",hit->end_amb_nmatches,ambig_end_interval));
+ }
+ debug4(printf(" RESULT: %d\n",hit->score_eventrim));
} else {
debug4(printf("score OTHER:"));
- hit->score_eventrim_start = hit->penalties;
- hit->score_eventrim_end = hit->penalties;
+ hit->score_eventrim = hit->penalties;
debug4(printf(" penalties %d.",hit->penalties));
- hit->score_eventrim_start += Substring_count_mismatches_region(hit->substring0,trim_left,/*trim_end*/hit->trim_right,
- query_compress_fwd,query_compress_rev);
- debug4(printf(" substring 0 %d.",Substring_count_mismatches_region(hit->substring0,trim_left,/*trim_end*/hit->trim_right,
+ hit->score_eventrim += Substring_count_mismatches_region(hit->substring0,trim_left,trim_right,
+ query_compress_fwd,query_compress_rev);
+ debug4(printf(" substring 0 %d.",Substring_count_mismatches_region(hit->substring0,trim_left,trim_right,
query_compress_fwd,query_compress_rev)));
- hit->score_eventrim_start += Substring_count_mismatches_region(hit->substring1,trim_left,/*trim_end*/hit->trim_right,
- query_compress_fwd,query_compress_rev);
- debug4(printf(" substring 1 %d.",Substring_count_mismatches_region(hit->substring1,trim_left,/*trim_end*/hit->trim_right,
+ hit->score_eventrim += Substring_count_mismatches_region(hit->substring1,trim_left,trim_right,
+ query_compress_fwd,query_compress_rev);
+ debug4(printf(" substring 1 %d.",Substring_count_mismatches_region(hit->substring1,trim_left,trim_right,
query_compress_fwd,query_compress_rev)));
- hit->score_eventrim_start += Substring_count_mismatches_region(hit->substring2,trim_left,/*trim_end*/hit->trim_right,
- query_compress_fwd,query_compress_rev);
- debug4(printf(" substring 2 %d.",Substring_count_mismatches_region(hit->substring2,trim_left,/*trim_end*/hit->trim_right,
- query_compress_fwd,query_compress_rev)));
-
-
- hit->score_eventrim_end += Substring_count_mismatches_region(hit->substring0,/*trim_start*/hit->trim_left,trim_right,
- query_compress_fwd,query_compress_rev);
- debug4(printf(" substring 0 %d.",Substring_count_mismatches_region(hit->substring0,/*trim_start*/hit->trim_left,trim_right,
- query_compress_fwd,query_compress_rev)));
-
- hit->score_eventrim_end += Substring_count_mismatches_region(hit->substring1,/*trim_start*/hit->trim_left,trim_right,
- query_compress_fwd,query_compress_rev);
- debug4(printf(" substring 1 %d.",Substring_count_mismatches_region(hit->substring1,/*trim_start*/hit->trim_left,trim_right,
- query_compress_fwd,query_compress_rev)));
-
- hit->score_eventrim_end += Substring_count_mismatches_region(hit->substring2,/*trim_start*/hit->trim_left,trim_right,
- query_compress_fwd,query_compress_rev);
- debug4(printf(" substring 2 %d.",Substring_count_mismatches_region(hit->substring2,/*trim_start*/hit->trim_left,trim_right,
+ hit->score_eventrim += Substring_count_mismatches_region(hit->substring2,trim_left,trim_right,
+ query_compress_fwd,query_compress_rev);
+ debug4(printf(" substring 2 %d.",Substring_count_mismatches_region(hit->substring2,trim_left,trim_right,
query_compress_fwd,query_compress_rev)));
+#ifdef SCORE_INDELS
+ /* Needs to match GMAP scoring */
if (hit->hittype == INSERTION || hit->hittype == DELETION) {
- hit->score_eventrim_start += indel_penalty_middle;
- hit->score_eventrim_end += indel_penalty_middle;
+ hit->score_eventrim += indel_penalty_middle;
debug4(printf(" add indel %d.",indel_penalty_middle));
}
- debug4(printf(" RESULT: %d or %d\n",hit->score_eventrim_start,hit->score_eventrim_end));
+#endif
+ debug4(printf(" RESULT: %d\n",hit->score_eventrim));
}
}
@@ -6085,19 +6094,15 @@ Stage3end_optimal_score_aux (bool *eliminatedp, List_T hitlist, int cutoff_level
/* Skip from setting minscore */
}
#endif
- if (hit->score_eventrim_start < minscore_start) {
- minscore_start = hit->score_eventrim_start;
- }
- if (hit->score_eventrim_end < minscore_end) {
- minscore_end = hit->score_eventrim_end;
+ if (hit->score_eventrim < minscore) {
+ minscore = hit->score_eventrim;
}
}
}
- debug4(printf("Stage3end_optimal_score over %d hits: minscore = (%d or %d) + subopt:%d\n",
- n,minscore_start,minscore_end,suboptimal_mismatches));
- minscore_start += suboptimal_mismatches;
- minscore_end += suboptimal_mismatches;
+ debug4(printf("Stage3end_optimal_score over %d hits: minscore = %d + subopt:%d\n",
+ n,minscore,suboptimal_mismatches));
+ minscore += suboptimal_mismatches;
max_nmatches -= suboptimal_mismatches;
max_nmatches_posttrim -= suboptimal_mismatches;
@@ -6107,8 +6112,7 @@ Stage3end_optimal_score_aux (bool *eliminatedp, List_T hitlist, int cutoff_level
cutoff_level = minscore;
}
#else
- cutoff_level_start = minscore_start;
- cutoff_level_end = minscore_end;
+ cutoff_level = minscore;
#endif
for (p = hitlist; p != NULL; p = p->rest) {
@@ -6143,23 +6147,22 @@ Stage3end_optimal_score_aux (bool *eliminatedp, List_T hitlist, int cutoff_level
}
#endif
- } else if (hit->score_eventrim_start > cutoff_level_start && hit->score_eventrim_end > cutoff_level_end) {
+ } else if (hit->score_eventrim > cutoff_level) {
/* For dibasep were previously using hit->ntscore, but gives false positives */
- debug4(printf("Eliminating a hit of type %s with score_eventrim %d or %d > cutoff_level %d\n",
- hittype_string(hit->hittype),hit->score_eventrim_start,hit->score_eventrim_end,cutoff_level));
+ debug4(printf("Eliminating a hit of type %s with score_eventrim %d > cutoff_level %d\n",
+ hittype_string(hit->hittype),hit->score_eventrim,cutoff_level));
*eliminatedp = true;
Stage3end_free(&hit);
- } else if (hit->score_eventrim_start > minscore_start && hit->score_eventrim_end > minscore_end
- /* && hit->nmatches_posttrim < max_nmatches_posttrim */) {
- debug4(printf("Eliminating a hit with score_eventrim %d or %d and type %s\n",
- hit->score_eventrim_start,hit->score_eventrim_end,hittype_string(hit->hittype)));
+ } else if (hit->score_eventrim > minscore /* && hit->nmatches_posttrim < max_nmatches_posttrim */) {
+ debug4(printf("Eliminating a hit with score_eventrim %d and type %s\n",
+ hit->score_eventrim,hittype_string(hit->hittype)));
*eliminatedp = true;
Stage3end_free(&hit);
} else {
- debug4(printf("Keeping a hit with score_eventrim %d or %d and type %s\n",
- hit->score_eventrim_start,hit->score_eventrim_end,hittype_string(hit->hittype)));
+ debug4(printf("Keeping a hit with score_eventrim %d and type %s\n",
+ hit->score_eventrim,hittype_string(hit->hittype)));
optimal = List_push(optimal,hit);
}
}
@@ -6571,8 +6574,8 @@ Stage3end_mark_ambiguous_splices (bool *ambiguousp, List_T hitlist) {
for (i = 0; i < n; i++) {
x = hits[i];
if (x->hittype == SPLICE) {
- printf(" %d: %lu..%lu (plusp = %d) ambiguousp:%d\n",
- i,x->genomicstart,x->genomicend,x->plusp,x->chimera_ambiguous_p);
+ printf(" %d: %u..%u (plusp = %d) ambiguousp:%d\n",
+ i,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,x->plusp,x->chimera_ambiguous_p);
}
}
);
@@ -6643,8 +6646,8 @@ Stage3end_remove_duplicates (List_T hitlist) {
debug7(
for (i = 0; i < n; i++) {
x = hits[i];
- printf(" Initial %d (%s): %p #%d:%lu..%lu, alias %d, nmatches %d, score %d ",
- i,hittype_string(x->hittype),x,x->chrnum,x->genomicstart-x->chroffset,x->genomicend-x->chroffset,
+ printf(" Initial %d (%s): %p #%d:%u..%u, alias %d, nmatches %d, score %d ",
+ i,hittype_string(x->hittype),x,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
x->alias,x->nmatches,x->score);
Stage3end_print_substrings(x);
printf("\n");
@@ -6734,13 +6737,13 @@ AGTGATGAATCCAAGAGGCGTTTCTATAAGAATTGGCATAAATCTAAGAAGAAGGCCCACCTGATGGAGATCCAG */
for (i = n-1; i >= 0; i--) {
x = hits[i];
if (eliminate[i] == false) {
- debug7(printf(" Keeping #%d:%lu..%lu, nmatches %d (nindels %d, indel_pos %d, distance %u, chrnum %d) (plusp = %d)\n",
- x->chrnum,x->genomicstart-x->chroffset,x->genomicend-x->chroffset,
+ debug7(printf(" Keeping #%d:%u..%u, nmatches %d (nindels %d, indel_pos %d, distance %u, chrnum %d) (plusp = %d)\n",
+ x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
x->nmatches,x->nindels,x->indel_pos,x->distance,x->chrnum,x->plusp));
hitlist = List_push(hitlist,x);
} else {
- debug7(printf(" Eliminating #%d:%lu..%lu, nmatches %d (nindels %d, indel_pos %d, distance %u, chrnum %d) (plusp = %d)\n",
- x->chrnum,x->genomicstart-x->chroffset,x->genomicend-x->chroffset,
+ debug7(printf(" Eliminating #%d:%u..%u, nmatches %d (nindels %d, indel_pos %d, distance %u, chrnum %d) (plusp = %d)\n",
+ x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
x->nmatches,x->nindels,x->indel_pos,x->distance,x->chrnum,x->plusp));
Stage3end_free(&x);
}
@@ -6753,8 +6756,8 @@ AGTGATGAATCCAAGAGGCGTTTCTATAAGAATTGGCATAAATCTAAGAAGAAGGCCCACCTGATGGAGATCCAG */
debug7(
for (p = hitlist, i = 0; p != NULL; p = p->rest, i++) {
x = (T) p->first;
- printf(" Final %d: #%d:%lu..%lu (plusp = %d)\n",
- i,x->chrnum,x->genomicstart-x->chroffset,x->genomicend-x->chroffset,x->plusp);
+ printf(" Final %d: #%d:%u..%u (plusp = %d)\n",
+ i,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,x->plusp);
}
);
@@ -6989,10 +6992,10 @@ hit_goodness_cmp (bool *equalp, Stage3end_T hit,
#endif
if (hit->nmatches_posttrim < best_hit->nmatches_posttrim) {
- debug7(printf(" => %d loses by nmatches\n",k));
+ debug7(printf(" => %d loses by nmatches (post-trim)\n",k));
return -1;
} else if (hit->nmatches_posttrim > best_hit->nmatches_posttrim) {
- debug7(printf(" => %d wins by nmatches\n",k));
+ debug7(printf(" => %d wins by nmatches (post-trim)\n",k));
return +1;
} else if (hit->nchimera_novel > best_hit->nchimera_novel) {
@@ -7046,10 +7049,10 @@ hit_goodness_cmp (bool *equalp, Stage3end_T hit,
prob1 = Substring_chimera_prob(hit->substring_donor) + Substring_chimera_prob(hit->substring_acceptor);
prob2 = Substring_chimera_prob(best_hit->substring_donor) + Substring_chimera_prob(best_hit->substring_acceptor);
if (prob1 < prob2) {
- debug7(printf(" => %d loses by splice prob %f vs %f\n",prob1,prob2));
+ debug7(printf(" => %d loses by splice prob %f vs %f\n",k,prob1,prob2));
return -1;
} else if (prob1 > prob2) {
- debug7(printf(" => %d wins by splice prob %f vs %f\n",prob1,prob2));
+ debug7(printf(" => %d wins by splice prob %f vs %f\n",k,prob1,prob2));
return +1;
}
}
@@ -7132,13 +7135,13 @@ Stage3end_remove_overlaps (List_T hitlist, bool finalp) {
/* Step 1. Check for exact duplicates */
/* Probably don't want to eliminate aliases at this point */
- debug7(printf("Checking for exact duplicates\n"));
+ debug7(printf("Step 1. Checking for exact duplicates\n"));
qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp);
debug7(
for (i = 0; i < n; i++) {
hit = hits[i];
- printf(" Initial %d (%s): %p #%d:%lu..%lu, alias %d, nmatches %d, score %d\n",
+ printf(" Initial %d (%s): %p #%d:%u..%u, alias %d, nmatches %d, score %d\n",
i,hittype_string(hit->hittype),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
hit->alias,hit->nmatches,hit->score);
}
@@ -7178,16 +7181,16 @@ Stage3end_remove_overlaps (List_T hitlist, bool finalp) {
for (i = 0, j = 0; i < n; i++) {
hit = prev[i];
if (eliminate[i] == false) {
- debug7(printf(" Keeping %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Keeping %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
hits[j++] = hit;
} else if (hit->paired_usedp == true) {
- debug7(printf(" Already paired %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Already paired %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
hits[j++] = hit;
} else {
- debug7(printf(" Eliminating %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Eliminating %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
Stage3end_free(&hit);
}
}
@@ -7197,7 +7200,7 @@ Stage3end_remove_overlaps (List_T hitlist, bool finalp) {
/* Step 2: Check for superstretches */
n = nkept;
- debug7(printf("Checking for superstretches among %d hits within subsumption clusters\n",n));
+ debug7(printf("Step 2. Checking for superstretches among %d hits within subsumption clusters\n",n));
for (i = 0; i < n; i++) {
eliminate[i] = false;
@@ -7206,9 +7209,9 @@ Stage3end_remove_overlaps (List_T hitlist, bool finalp) {
debug7(
for (i = 0; i < n; i++) {
hit = hits[i];
- printf(" Initial %d (%s): %p #%d:%u..%u, nmatches %d, score %d\n",
+ printf(" Initial %d (%s): %p #%d:%u..%u, nmatches (trimmed) %d, nmatches (post-trim) %d, score %d\n",
i,hittype_string(hit->hittype),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
- hit->nmatches,hit->score);
+ hit->nmatches,hit->nmatches_posttrim,hit->score);
}
);
@@ -7258,16 +7261,16 @@ Stage3end_remove_overlaps (List_T hitlist, bool finalp) {
for (i = 0, j = 0; i < n; i++) {
hit = prev[i];
if (eliminate[i] == false) {
- debug7(printf(" Keeping %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Keeping %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
hits[j++] = hit;
} else if (hit->paired_usedp == true) {
- debug7(printf(" Already paired %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Already paired %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
hits[j++] = hit;
} else {
- debug7(printf(" Eliminating %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Eliminating %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
Stage3end_free(&hit);
}
}
@@ -7421,16 +7424,16 @@ Stage3end_remove_overlaps (List_T hitlist, bool finalp) {
for (i = 0, j = 0; i < n; i++) {
hit = prev[i];
if (eliminate[i] == false) {
- debug7(printf(" Keeping %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Keeping %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
hits[j++] = hit;
} else if (hit->paired_usedp == true) {
- debug7(printf(" Already paired %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Already paired %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
hits[j++] = hit;
} else {
- debug7(printf(" Eliminating %lu..%lu, nmatches (trimmed) %d (plusp = %d)\n",
- hit->low,hit->high,hit->nmatches,hit->plusp));
+ debug7(printf(" Eliminating %u..%u, nmatches (trimmed) %d (plusp = %d)\n",
+ hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nmatches,hit->plusp));
Stage3end_free(&hit);
}
}
@@ -10602,7 +10605,9 @@ Stage3pair_new (T hit5, T hit3, Univcoord_T *splicesites,
debug0(printf("Created new pair %p from %p and %p with private %d, %d\n",new,hit5,hit3,private5p,private3p));
debug0(printf(" hittypes %s and %s\n",hittype_string(hit5->hittype),hittype_string(hit3->hittype)));
debug0(printf(" sensedirs %d and %d\n",hit5->sensedir,hit3->sensedir));
- debug0(printf(" genomicpos %lu..%lu and %lu..%lu\n",hit5->genomicstart,hit5->genomicend,hit3->genomicstart,hit3->genomicend));
+ debug0(printf(" chrpos %u..%u and %u..%u\n",
+ hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,
+ hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset));
if (hit5->circularpos < 0 && hit3->circularpos < 0) {
new->circularp = false;
@@ -11203,9 +11208,9 @@ Stage3pair_remove_duplicates_exact (List_T hitpairlist) {
debug8(
for (i = 0; i < n; i++) {
hitpair = hitpairs[i];
- printf(" Initial %d (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), alias %d|%d, nmatches: %d\n",
+ printf(" Initial %d (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), alias %d|%d, nmatches: %d\n",
i,Pairtype_string(hitpair->pairtype),hittype_string(hitpair->hit5->hittype),
- hittype_string(hitpair->hit3->hittype),hitpair,hitpair->low,hitpair->high,
+ hittype_string(hitpair->hit3->hittype),hitpair,
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
hitpair->dir,hitpair->hit5->alias,hitpair->hit3->alias,hitpair->nmatches);
@@ -11608,7 +11613,7 @@ static List_T
pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T list, bool finalp) {
List_T result = NULL, subsumed, p, q, r;
Stage3pair_T stage3pair, hitpair;
- bool equalp, this_kept_p, child_better_p;
+ bool equalp, this_kept_p, child_better_exists_p;
*keep_p = true;
@@ -11616,20 +11621,20 @@ pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T
while (p != NULL) {
stage3pair = (Stage3pair_T) List_head(p);
- q = p;
- while (List_next(q) != NULL && hitpair_subsumption(stage3pair,(Stage3pair_T) List_head(List_next(q))) == true) {
-#if 0
- printf(" This (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), nmatches: %d (%d posttrim), indel_low %d and %d\n",
+ q = List_next(p);
+ while (q != NULL && hitpair_subsumption(stage3pair,(Stage3pair_T) List_head(q)) == true) {
+#ifdef DEBUG8
+ printf(" This (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), nmatches: %d (%d posttrim), indel_low %d and %d\n",
Pairtype_string(stage3pair->pairtype),hittype_string(stage3pair->hit5->hittype),
- hittype_string(stage3pair->hit3->hittype),stage3pair,stage3pair->low,stage3pair->high,
+ hittype_string(stage3pair->hit3->hittype),stage3pair,
stage3pair->hit5->low - stage3pair->hit5->chroffset,stage3pair->hit5->high - stage3pair->hit5->chroffset,
stage3pair->hit3->low - stage3pair->hit3->chroffset,stage3pair->hit3->high - stage3pair->hit3->chroffset,
stage3pair->dir,stage3pair->nmatches,stage3pair->nmatches_posttrim,
stage3pair->hit5->indel_low,stage3pair->hit3->indel_low);
- hitpair = (Stage3pair_T) List_head(List_next(q));
- printf("subsumes that (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), nmatches: %d (%d posttrim), indel_low %d and %d\n",
+ hitpair = (Stage3pair_T) List_head(q);
+ printf("subsumes that (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), nmatches: %d (%d posttrim), indel_low %d and %d\n",
Pairtype_string(hitpair->pairtype),hittype_string(hitpair->hit5->hittype),
- hittype_string(hitpair->hit3->hittype),hitpair,hitpair->low,hitpair->high,
+ hittype_string(hitpair->hit3->hittype),hitpair,
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
hitpair->dir,hitpair->nmatches,hitpair->nmatches_posttrim,
@@ -11648,16 +11653,20 @@ pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T
} else {
/* Cluster */
+ debug8(printf("Processing cluster of size %d - %d\n",List_length(p),List_length(q)));
subsumed = NULL;
- child_better_p = false;
+ child_better_exists_p = false;
for (r = List_next(p); r != q; r = List_next(r)) {
- if (hitpair_goodness_cmp(&equalp,(Stage3pair_T) List_head(r),stage3pair,finalp) > 0 || equalp == true) {
- child_better_p = true;
+ debug8(printf("Calling hitpair_goodness_cmp\n"));
+ if (hitpair_goodness_cmp(&equalp,(Stage3pair_T) List_head(r),stage3pair,finalp) >= 0 || equalp == true) {
+ child_better_exists_p = true;
}
subsumed = List_push(subsumed,(void *) List_head(r));
}
- if (child_better_p == false) {
+ if (child_better_exists_p == false) {
+ /* All children are worse */
+ debug8(printf("All children are worse, so deleting all of them\n"));
result = List_push(result,(void *) stage3pair);
for (r = subsumed; r != NULL; r = List_next(r)) {
hitpair = (Stage3pair_T) List_head(r);
@@ -11666,7 +11675,7 @@ pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T
List_free(&subsumed);
} else {
- /* printf("Found cluster of length %d. Calling remove_bad_superstretches recursively\n",List_length(subsumed)); */
+ debug8(printf("Found cluster of length %d. Calling remove_bad_superstretches recursively\n",List_length(subsumed)));
result = List_append(result,pair_remove_bad_superstretches(&this_kept_p,/*superstretch*/stage3pair,subsumed,finalp));
if (this_kept_p == false) {
Stage3pair_free(&stage3pair);
@@ -11674,7 +11683,7 @@ pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T
/* Compare stage3pair against the current parent */
result = List_push(result,(void *) stage3pair);
if (superstretch != NULL &&
- (hitpair_goodness_cmp(&equalp,stage3pair,superstretch,finalp) > 0 || equalp == true)) {
+ (hitpair_goodness_cmp(&equalp,stage3pair,superstretch,finalp) >= 0 || equalp == true)) {
*keep_p = false;
}
}
@@ -11686,6 +11695,7 @@ pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T
List_free(&list);
+ debug8(printf("Returning result of length %d\n",List_length(result)));
return result;
}
@@ -11712,15 +11722,15 @@ pair_remove_overlaps (List_T hitpairlist, bool translocp, bool finalp) {
}
/* Step 1. Check for exact duplicates */
- debug8(printf(" Step 1. Checking for exact duplicates\n"));
+ debug8(printf(" Step 1. Checking for exact duplicates\n"));
qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp);
debug8(
for (i = 0; i < n; i++) {
hitpair = hitpairs[i];
- printf(" Initial %d (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), alias %d|%d, nmatches: %d (%d posttrim), indel_low %d and %d\n",
+ printf(" Initial %d (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), alias %d|%d, nmatches: %d (%d posttrim), indel_low %d and %d\n",
i,Pairtype_string(hitpair->pairtype),hittype_string(hitpair->hit5->hittype),
- hittype_string(hitpair->hit3->hittype),hitpair,hitpair->low,hitpair->high,
+ hittype_string(hitpair->hit3->hittype),hitpair,
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
hitpair->dir,hitpair->hit5->alias,hitpair->hit3->alias,hitpair->nmatches,hitpair->nmatches_posttrim,
@@ -11756,15 +11766,13 @@ pair_remove_overlaps (List_T hitpairlist, bool translocp, bool finalp) {
for (i = n - 1; i >= 0; --i) {
hitpair = hitpairs[i];
if (eliminate[i] == false) {
- debug8(printf(" Keeping %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d, score %d, (dir = %d)\n",
- hitpair->low,hitpair->high,
+ debug8(printf(" Keeping %u..%u|%u..%u, nmatches (trimmed) %d, score %d, (dir = %d)\n",
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
hitpair->nmatches,hitpair->score,hitpair->dir));
unique = List_push(unique,(void *) hitpair);
} else {
- debug8(printf(" Eliminating %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d, score %d, (dir = %d)\n",
- hitpair->low,hitpair->high,
+ debug8(printf(" Eliminating %u..%u|%u..%u, nmatches (trimmed) %d, score %d, (dir = %d)\n",
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
hitpair->nmatches,hitpair->score,hitpair->dir));
@@ -11775,6 +11783,7 @@ pair_remove_overlaps (List_T hitpairlist, bool translocp, bool finalp) {
FREE(hitpairs);
FREE(eliminate);
+ debug8(printf(" Step 2. Checking for bad superstretches\n"));
if (translocp == true) {
return unique;
} else {
@@ -11783,454 +11792,6 @@ pair_remove_overlaps (List_T hitpairlist, bool translocp, bool finalp) {
}
-#if 0
-static List_T
-pair_remove_overlaps_old (List_T hitpairlist, bool translocp, bool finalp) {
- List_T unique = NULL;
- Stage3pair_T best_hitpair, hitpair, *hitpairs, *prev;
- int cmp;
- int nkept, n, i, j, k, besti;
- bool *eliminate, equalp;
-#ifdef PRE_RESOLVE_MULTIMAPPING
- long int best_tally;
-#endif
-
- n = List_length(hitpairlist);
- debug8(printf(" Entering pair_remove_overlaps with %d pairs: %s\n",
- n,finalp == true ? "FINAL" : "not final"));
-
- if (n < 2) {
- debug8(printf(" Exiting pair_remove_overlaps with %d < 2 pairs\n",n));
- return hitpairlist;
- } else {
- eliminate = (bool *) CALLOC(n,sizeof(bool));
- hitpairs = (Stage3pair_T *) List_to_array(hitpairlist,NULL);
- List_free(&hitpairlist);
- }
-
- /* Step 1. Check for exact duplicates */
- debug8(printf(" Step 1. Checking for exact duplicates\n"));
- qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp);
-
- debug8(
- for (i = 0; i < n; i++) {
- hitpair = hitpairs[i];
- printf(" Initial %d (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), alias %d|%d, nmatches: %d (%d posttrim), indel_low %d and %d\n",
- i,Pairtype_string(hitpair->pairtype),hittype_string(hitpair->hit5->hittype),
- hittype_string(hitpair->hit3->hittype),hitpair,hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->dir,hitpair->hit5->alias,hitpair->hit3->alias,hitpair->nmatches,hitpair->nmatches_posttrim,
- hitpair->hit5->indel_low,hitpair->hit3->indel_low);
- }
- );
-
- i = 0;
- while (i < n) {
- j = i+1;
- debug8(printf(" %d,%d",i,j));
- while (j < n && hitpair_equiv_cmp(hitpairs[j],hitpairs[i]) == 0) {
- debug8(printf(" %d is identical to %d => eliminating\n",j,i));
- eliminate[j] = true;
- j++;
- }
- i = j;
- }
- debug8(printf("\n"));
-
- nkept = 0;
- for (i = 0; i < n; i++) {
- if (eliminate[i] == false) {
- nkept++;
- }
- }
- if (nkept == 0) {
- /* All entries eliminated one another, so keep the first one */
- eliminate[0] = false;
- nkept = 1;
- }
-
- prev = hitpairs;
- hitpairs = (Stage3pair_T *) CALLOC(nkept,sizeof(Stage3pair_T));
-
- for (i = 0, j = 0; i < n; i++) {
- hitpair = prev[i];
- if (eliminate[i] == false) {
- debug8(printf(" Keeping %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d, score %d, (dir = %d)\n",
- hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->nmatches,hitpair->score,hitpair->dir));
- hitpairs[j++] = hitpair;
- } else {
- debug8(printf(" Eliminating %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d, score %d, (dir = %d)\n",
- hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->nmatches,hitpair->score,hitpair->dir));
- Stage3pair_free(&hitpair);
- }
- }
-
- FREE(prev);
-
- if (translocp == false) {
- /* Step 2: Check for superstretches */
- n = nkept;
- debug8(printf(" Step 2. Checking for superstretches among %d hitpairs within subsumption clusters\n",n));
-
-#if 0
- for (i = 0; i < n; i++) {
- eliminate[i] = false;
- }
-#else
- memset(eliminate,false,n*sizeof(bool));
-#endif
-
- debug8(
- for (i = 0; i < n; i++) {
- hitpair = hitpairs[i];
- printf(" Initial %d (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), score: %d, alias %d|%d, nmatches: %d (%d posttrim), nnovel: %d, nknown: %d, insertlength: %u, outerlength: %u\n",
- i,Pairtype_string(hitpair->pairtype),hittype_string(hitpair->hit5->hittype),
- hittype_string(hitpair->hit3->hittype),hitpair,hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->dir,hitpair->score,hitpair->hit5->alias,hitpair->hit3->alias,hitpair->nmatches,hitpair->nmatches_posttrim,
- hitpair->nchimera_novel,hitpair->nchimera_known,
- hitpair->insertlength,hitpair->outerlength);
- }
- );
-
- /* Find clusters */
- i = 0;
- while (i < n) {
- j = i;
- while (j+1 < n && hitpair_subsumption(hitpairs[i],hitpairs[j+1]) == true) {
- j = j+1;
- }
-
- if (j > i) {
- debug8(printf("Cluster from %d up through %d\n",i,j));
-
- /* Find bad superstretches */
- for (k = i; k <= j; k++) {
- if (hitpair_bad_superstretch_p(hitpairs[k],hitpairs,k,j,finalp) == true) {
- eliminate[k] = true;
- }
- }
- }
-
- i = j+1;
- }
-
- nkept = 0;
- for (i = 0; i < n; i++) {
- if (eliminate[i] == false) {
- nkept++;
- }
- }
- if (nkept == 0) {
- /* All entries eliminated one another, so keep the first one */
- eliminate[0] = false;
- nkept = 1;
- }
-
- prev = hitpairs;
- hitpairs = (Stage3pair_T *) CALLOC(nkept,sizeof(Stage3pair_T));
-
- for (i = 0, j = 0; i < n; i++) {
- hitpair = prev[i];
- if (eliminate[i] == false) {
- debug8(printf(" Keeping %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d (dir = %d)\n",
- hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->nmatches,hitpair->dir));
- hitpairs[j++] = hitpair;
- } else {
- debug8(printf(" Eliminating %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d (dir = %d)\n",
- hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->nmatches,hitpair->dir));
- Stage3pair_free(&hitpair);
- }
- }
-
- FREE(prev);
- }
-
-
- if (translocp == false) {
- /* Step 3: Check for best within subsumption clusters */
- n = nkept;
- debug8(printf(" Step 3. Checking for best among %d hitpairs within subsumption clusters\n",n));
-
- for (i = 0; i < n; i++) {
- eliminate[i] = false;
- }
- /* qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp); -- No need since original order was kept */
-
- debug8(
- for (i = 0; i < n; i++) {
- hitpair = hitpairs[i];
- printf(" Initial %d (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), score: %d, alias %d|%d, nmatches: %d (%d posttrim), nnovel: %d, nknown: %d, insertlength: %u, outerlength: %u\n",
- i,Pairtype_string(hitpair->pairtype),hittype_string(hitpair->hit5->hittype),
- hittype_string(hitpair->hit3->hittype),hitpair,hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->dir,hitpair->score,hitpair->hit5->alias,hitpair->hit3->alias,hitpair->nmatches,hitpair->nmatches_posttrim,
- hitpair->nchimera_novel,hitpair->nchimera_known,
- hitpair->insertlength,hitpair->outerlength);
- }
- );
-
-
- /* Find clusters from left */
- i = 0;
- while (i < n) {
- j = i;
- while (j+1 < n && hitpair_subsumption(hitpairs[i],hitpairs[j+1]) == true) {
- j = j+1;
- }
-
- if (j > i) {
- debug8(printf("Cluster from %d up through %d\n",i,j));
-
- best_hitpair = hitpairs[i];
- besti = i;
- debug8(printf("Assume best is %d\n",besti));
-
- for (k = i+1; k <= j; k++) {
- cmp = hitpair_goodness_cmp(&equalp,hitpairs[k],
- best_hitpair,finalp);
- debug8(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
- if (cmp > 0) {
- best_hitpair = hitpairs[k];
- besti = k;
- debug8(printf("Best is now %d\n",besti));
- }
- }
-
- for (k = i; k <= j; k++) {
- if (k == besti) {
- /* Skip */
- } else if (hitpair_goodness_cmp(&equalp,hitpairs[k],
- best_hitpair,finalp) < 0 || equalp == true) {
- debug8(printf(" Eliminating hitpair %d from left, because beaten by %d\n",k,besti));
- eliminate[k] = true;
- }
- }
- }
-
- i = j+1;
- }
-
-
- /* Find clusters starting from right */
- j = n - 1;
- while (j >= 0) {
- i = j;
- while (i-1 >= 0 && hitpair_subsumption(hitpairs[j],hitpairs[i-1]) == true) {
- i = i-1;
- }
-
- if (i < j) {
- debug8(printf("Cluster from %d down through %d\n",j,i));
- best_hitpair = hitpairs[i];
- besti = i;
- debug8(printf("Assume best is %d\n",besti));
-
- for (k = i+1; k <= j; k++) {
- cmp = hitpair_goodness_cmp(&equalp,hitpairs[k],
- best_hitpair,finalp);
- debug8(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
- if (cmp > 0) {
- best_hitpair = hitpairs[k];
- besti = k;
- debug8(printf("Best is now %d\n",besti));
- }
- }
-
- for (k = i; k <= j; k++) {
- if (k == besti) {
- /* Skip */
- } else if (hitpair_goodness_cmp(&equalp,hitpairs[k],
- best_hitpair,finalp) < 0 || equalp == true) {
- debug8(printf(" Eliminating hitpair %d from right, because beaten by %d\n",k,besti));
- eliminate[k] = true;
- }
- }
- }
-
- j = i-1;
- }
-
-
- nkept = 0;
- for (i = 0; i < n; i++) {
- if (eliminate[i] == false) {
- nkept++;
- }
- }
- if (nkept == 0) {
- eliminate[0] = false;
- nkept = 1;
- }
-
- prev = hitpairs;
- hitpairs = (Stage3pair_T *) CALLOC(nkept,sizeof(Stage3pair_T));
-
- for (i = 0, j = 0; i < n; i++) {
- hitpair = prev[i];
- if (eliminate[i] == false) {
- debug8(printf(" Keeping %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d (dir = %d)\n",
- hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->nmatches,hitpair->dir));
- hitpairs[j++] = hitpair;
- } else {
- debug8(printf(" Eliminating %lu..%lu %u..%u|%u..%u, nmatches (trimmed) %d (dir = %d)\n",
- hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->nmatches,hitpair->dir));
- Stage3pair_free(&hitpair);
- }
- }
-
- FREE(prev);
- }
-
- /* Step 4: Check for identity */
- n = nkept;
- debug8(printf(" Step 4. Checking for duplicates among %d hitpairs by identity\n",n));
-
- for (i = 0; i < n; i++) {
- eliminate[i] = false;
- }
- /* qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp); -- No need since original order was kept */
-
- debug8(
- for (i = 0; i < n; i++) {
- hitpair = hitpairs[i];
- printf(" Initial %d (%s, %s-%s): %p, %lu..%lu %u..%u|%u..%u (dir = %d), score: %d, alias %d|%d, nmatches: %d (%d posttrim), insertlength: %u, outerlength: %u\n",
- i,Pairtype_string(hitpair->pairtype),hittype_string(hitpair->hit5->hittype),
- hittype_string(hitpair->hit3->hittype),hitpair,hitpair->low,hitpair->high,
- hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
- hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->dir,hitpair->score,hitpair->hit5->alias,hitpair->hit3->alias,hitpair->nmatches,hitpair->nmatches_posttrim,
- hitpair->insertlength,hitpair->outerlength);
- }
- )
-
-#if 0
- i = 0;
- while (i < n) {
- debug8(printf("Looking at %d with score %d, absdifflength %d, and outerlength %u\n",
- i,hitpairs[i]->score,hitpairs[i]->absdifflength,hitpairs[i]->outerlength));
- besti = i;
- best_score = hitpairs[i]->score;
- best_absdifflength = hitpairs[i]->absdifflength;
- best_outerlength = hitpairs[i]->outerlength;
-
- j = i+1;
- while (j < n && hitpair_overlap(hitpairs[j],hitpairs[j-1]) == true) {
- debug8(printf(" %d overlaps with %d with score %d, absdifflength %d, and outerlength %u",
- i,j,hitpairs[j]->score,hitpairs[j]->outerlength));
- if (hitpairs[j]->score < best_score) {
- debug8(printf(" => best by score\n"));
- best_score = hitpairs[j]->score;
- best_absdifflength = hitpairs[j]->absdifflength;
- best_outerlength = hitpairs[j]->outerlength;
- besti = j;
- } else if (hitpairs[j]->score > best_score) {
- /* Skip */
- } else if (hitpairs[j]->absdifflength < best_absdifflength) {
- debug8(printf(" => best by absdifflength\n"));
- best_absdifflength = hitpairs[j]->absdifflength;
- best_outerlength = hitpairs[j]->outerlength;
- besti = j;
- } else if (hitpairs[j]->absdifflength > best_absdifflength) {
- /* Skip */
- } else if (hitpairs[j]->outerlength < best_outerlength) {
- debug8(printf(" => best by outerlength\n"));
- best_outerlength = hitpairs[j]->outerlength;
- besti = j;
- }
-
- debug8(printf("\n"));
- j++;
- }
-
- i = j;
- }
-#else
- i = 0;
- while (i < n) {
- debug8(printf("Looking at %d with score %d, insert length %d, and outerlength %u\n",
- i,hitpairs[i]->score,hitpairs[i]->insertlength,hitpairs[i]->outerlength));
- j = i+1;
- while (j < n && hitpair_equiv_cmp(hitpairs[j],hitpairs[i]) == 0) {
- debug8(printf(" %d equal to %d\n",j,i));
- eliminate[j] = true;
- j++;
- }
-
- i = j;
- }
-#endif
-
- for (i = n-1; i >= 0; i--) {
- hitpair = hitpairs[i];
- if (eliminate[i] == false) {
- unique = List_push(unique,hitpair);
- } else {
- Stage3pair_free(&hitpair);
- }
- }
-
- FREE(hitpairs);
- FREE(eliminate);
-
-
-#ifdef PRE_RESOLVE_MULTIMAPPING
- if (use_tally_p == true && tally_iit != NULL) {
- if ((n = List_length(unique)) > 1) {
- hitpairs = (Stage3pair_T *) List_to_array(unique,NULL);
- List_free(&unique);
-
- best_tally = 0;
- for (i = 0; i < n; i++) {
- if (hitpairs[i]->tally < 0) {
- hitpairs[i]->tally = Stage3end_compute_tally(hitpairs[i]->hit5) + Stage3end_compute_tally(hitpairs[i]->hit3);
- }
- if (hitpairs[i]->tally > best_tally) {
- best_tally = hitpairs[i]->tally;
- }
- }
-
- unique = (List_T) NULL;
- for (i = 0; i < n; i++) {
- if (hitpairs[i]->tally < best_tally) {
- Stage3pair_free(&(hitpairs[i]));
- } else {
- unique = List_push(unique,(void *) hitpairs[i]);
- }
- }
-
- FREE(hitpairs);
- }
- }
-#endif
-
- debug8(printf(" Exited pair_remove_overlaps with %d pairs\n",List_length(unique)));
- return unique;
-}
-#endif
-
-
List_T
Stage3pair_remove_overlaps (List_T hitpairlist, bool translocp, bool finalp) {
List_T unique_separate, unique_overlapping,
@@ -12657,10 +12218,9 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
List_T optimal = NULL, p;
Stage3pair_T hitpair;
T hit5, hit3;
- int cutoff_level_5_start, cutoff_level_5_end, cutoff_level_3_start, cutoff_level_3_end, score;
+ int cutoff_level_5, cutoff_level_3, score;
int n;
- int minscore5_start = MAX_READLENGTH, minscore5_end = MAX_READLENGTH,
- minscore3_start = MAX_READLENGTH, minscore3_end = MAX_READLENGTH, minscore = MAX_READLENGTH + MAX_READLENGTH;
+ int minscore5 = MAX_READLENGTH, minscore3 = MAX_READLENGTH, minscore = MAX_READLENGTH + MAX_READLENGTH;
/* int max_nmatches = 0, max_nmatches_posttrim, minscore = MAX_READLENGTH + MAX_READLENGTH; */
#ifdef USE_OPTIMAL_SCORE_BINGO
int minscore_bingo = MAX_READLENGTH + MAX_READLENGTH;
@@ -12728,15 +12288,17 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
hit5 = hitpair->hit5;
hit3 = hitpair->hit3;
- debug6(printf("hit5 %lu..%lu type %s, trim_left: %d%s, trim_right %d%s. hit3 %lu..%lu type %s, trim_left %d%s, trim_right %d%s.\n",
- hit5->genomicstart,hit5->genomicend,hittype_string(hit5->hittype),
+ debug6(printf("hit5 %u..%u type %s, trim_left: %d%s, trim_right %d%s, start_ambig %d, end_ambig %d. hit3 %u..%u type %s, trim_left %d%s, trim_right %d%s, start_ambig %d, end_ambig %d.\n",
+ hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,hittype_string(hit5->hittype),
hit5->trim_left,hit5->trim_left_splicep ? " (splice)" : "",
hit5->trim_right,hit5->trim_right_splicep ? " (splice)" : "",
- hit3->genomicstart,hit3->genomicend,hittype_string(hit3->hittype),
+ hit5->start_amb_nmatches,hit5->end_amb_nmatches,
+ hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset,hittype_string(hit3->hittype),
hit3->trim_left,hit3->trim_left_splicep ? " (splice)" : "",
- hit3->trim_right,hit3->trim_right_splicep ? " (splice)" : ""));
+ hit3->trim_right,hit3->trim_right_splicep ? " (splice)" : "",
+ hit3->start_amb_nmatches,hit3->end_amb_nmatches));
if (hit5->hittype == TERMINAL) {
- /* Don't allow terminals to set trims */
+ /* Don't allow terminals to set trims, because they don't attempt to extend to ends */
#if 0
if (hit5->trim_left > max_trim_left_terminal_5) {
max_trim_left_terminal_5 = hit5->trim_left;
@@ -12768,7 +12330,7 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
}
if (hit3->hittype == TERMINAL) {
- /* Don't allow terminals to set trims */
+ /* Don't allow terminals to set trims, because they don't attempt to extend to ends */
#if 0
if (hit3->trim_left > max_trim_left_terminal_3) {
max_trim_left_terminal_3 = hit3->trim_left;
@@ -12842,9 +12404,9 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
if (hit5->hittype == TERMINAL && non_double_terminal_p == true && finalp == false) {
/* Ignore */
- hit5->score_eventrim_start = hit5->score_eventrim_end = 0;
+ hit5->score_eventrim = 0;
} else if (hit5->hittype == GMAP) {
- hit5->score_eventrim_start = hit5->score_eventrim_end = 0; /* was hit5->penalties */
+ hit5->score_eventrim = 0; /* was hit5->penalties */
debug6(printf("score 5' GMAP:"));
#if 0
if (Stage3end_bad_stretch_p(hit5,query5_compress_fwd,query5_compress_rev) == true) {
@@ -12868,73 +12430,61 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
}
#endif
- hit5->score_eventrim_start += Pair_nmismatches_region(&nindelbreaks,hit5->pairarray,hit5->npairs,
- trim_left_5,/*trim_end*/hit5->trim_right,hit5->querylength_adj);
- hit5->score_eventrim_end += Pair_nmismatches_region(&nindelbreaks,hit5->pairarray,hit5->npairs,
- /*trim_start*/hit5->trim_left,trim_right_5,hit5->querylength_adj);
- debug6(printf(" add nmismatches %d or %d.",
- Pair_nmismatches_region(&nindelbreaks,hit5->pairarray,hit5->npairs,
- trim_left_5,/*trim_end*/hit5->trim_right,hit5->querylength_adj),
- Pair_nmismatches_region(&nindelbreaks,hit5->pairarray,hit5->npairs,
- /*trim_start*/hit5->trim_left,trim_right_5,hit5->querylength_adj)));
- hit5->score_eventrim_start += indel_penalty_middle * nindelbreaks;
- hit5->score_eventrim_end += indel_penalty_middle * nindelbreaks;
- hit5->score_eventrim_start += hit5->start_amb_nmatches / ambig_end_interval;
- debug6(printf(" add amb start %d/%d.",hit5->start_amb_nmatches,ambig_end_interval));
- hit5->score_eventrim_end += hit5->end_amb_nmatches / ambig_end_interval;
- debug6(printf(" add amb end %d/%d.",hit5->end_amb_nmatches,ambig_end_interval));
- debug6(printf(" RESULT: %d or %d\n",hit5->score_eventrim_start,hit5->score_eventrim_end));
+ hit5->score_eventrim += Pair_nmismatches_region(&nindelbreaks,hit5->pairarray,hit5->npairs,
+ trim_left_5,trim_right_5,hit5->start_amb_nmatches,hit5->end_amb_nmatches,
+ hit5->querylength_adj);
+ debug6(printf(" add nmismatches %d.",Pair_nmismatches_region(&nindelbreaks,hit5->pairarray,hit5->npairs,
+ trim_left_5,trim_right_5,hit5->start_amb_nmatches,hit5->end_amb_nmatches,
+ hit5->querylength_adj)));
+#ifdef SCORE_INDELS
+ hit5->score_eventrim += indel_penalty_middle * nindelbreaks;
+ debug6(printf(" add indelbreaks %d.",indel_penalty_middle * nindelbreaks));
+#endif
+ if (hit5->start_amb_prob < 0.9) {
+ hit5->score_eventrim += hit5->start_amb_nmatches / ambig_end_interval;
+ debug6(printf(" add amb start %d/%d (prob %f).",hit5->start_amb_nmatches,ambig_end_interval,hit5->start_amb_prob));
+ }
+ if (hit5->end_amb_prob < 0.9) {
+ hit5->score_eventrim += hit5->end_amb_nmatches / ambig_end_interval;
+ debug6(printf(" add amb end %d/%d (prob %f).",hit5->end_amb_nmatches,ambig_end_interval,hit5->end_amb_prob));
+ }
+ debug6(printf(" RESULT: %d\n",hit5->score_eventrim));
} else {
debug6(printf("score 5' OTHER:"));
- hit5->score_eventrim_start = hit5->penalties;
- hit5->score_eventrim_end = hit5->penalties;
+ hit5->score_eventrim = hit5->penalties;
debug6(printf(" penalties %d.",hit5->penalties));
- hit5->score_eventrim_start += Substring_count_mismatches_region(hit5->substring0,trim_left_5,/*trim_end*/hit5->trim_right,
- query5_compress_fwd,query5_compress_rev);
- debug6(printf(" substring 0 %d.",Substring_count_mismatches_region(hit5->substring0,trim_left_5,/*trim_end*/hit5->trim_right,
- query5_compress_fwd,query5_compress_rev)));
-
- hit5->score_eventrim_start += Substring_count_mismatches_region(hit5->substring1,trim_left_5,/*trim_end*/hit5->trim_right,
- query5_compress_fwd,query5_compress_rev);
- debug6(printf(" substring 1 %d.",Substring_count_mismatches_region(hit5->substring1,trim_left_5,/*trim_end*/hit5->trim_right,
- query5_compress_fwd,query5_compress_rev)));
-
- hit5->score_eventrim_start += Substring_count_mismatches_region(hit5->substring2,trim_left_5,/*trim_end*/hit5->trim_right,
- query5_compress_fwd,query5_compress_rev);
- debug6(printf(" substring 2 %d.",Substring_count_mismatches_region(hit5->substring2,trim_left_5,/*trim_end*/hit5->trim_right,
+ hit5->score_eventrim += Substring_count_mismatches_region(hit5->substring0,trim_left_5,trim_right_5,
+ query5_compress_fwd,query5_compress_rev);
+ debug6(printf(" substring 0 %d.",Substring_count_mismatches_region(hit5->substring0,trim_left_5,trim_right_5,
query5_compress_fwd,query5_compress_rev)));
-
- hit5->score_eventrim_end += Substring_count_mismatches_region(hit5->substring0,/*trim_start*/hit5->trim_left,trim_right_5,
- query5_compress_fwd,query5_compress_rev);
- debug6(printf(" substring 0 %d.",Substring_count_mismatches_region(hit5->substring0,/*trim_start*/hit5->trim_left,trim_right_5,
- query5_compress_fwd,query5_compress_rev)));
-
- hit5->score_eventrim_end += Substring_count_mismatches_region(hit5->substring1,/*trim_start*/hit5->trim_left,trim_right_5,
- query5_compress_fwd,query5_compress_rev);
- debug6(printf(" substring 1 %d.",Substring_count_mismatches_region(hit5->substring1,/*trim_start*/hit5->trim_left,trim_right_5,
+ hit5->score_eventrim += Substring_count_mismatches_region(hit5->substring1,trim_left_5,trim_right_5,
+ query5_compress_fwd,query5_compress_rev);
+ debug6(printf(" substring 1 %d.",Substring_count_mismatches_region(hit5->substring1,trim_left_5,trim_right_5,
query5_compress_fwd,query5_compress_rev)));
- hit5->score_eventrim_end += Substring_count_mismatches_region(hit5->substring2,/*trim_start*/hit5->trim_left,trim_right_5,
- query5_compress_fwd,query5_compress_rev);
- debug6(printf(" substring 2 %d.",Substring_count_mismatches_region(hit5->substring2,/*trim_start*/hit5->trim_left,trim_right_5,
+ hit5->score_eventrim += Substring_count_mismatches_region(hit5->substring2,trim_left_5,trim_right_5,
+ query5_compress_fwd,query5_compress_rev);
+ debug6(printf(" substring 2 %d.",Substring_count_mismatches_region(hit5->substring2,trim_left_5,trim_right_5,
query5_compress_fwd,query5_compress_rev)));
+#ifdef SCORE_INDELS
+ /* Needs to match GMAP scoring */
if (hit5->hittype == INSERTION || hit5->hittype == DELETION) {
- hit5->score_eventrim_start += indel_penalty_middle;
- hit5->score_eventrim_end += indel_penalty_middle;
+ hit5->score_eventrim += indel_penalty_middle;
debug6(printf(" add indel %d.",indel_penalty_middle));
}
- debug6(printf(" RESULT: %d or %d\n",hit5->score_eventrim_start,hit5->score_eventrim_end));
+#endif
+ debug6(printf(" RESULT: %d\n",hit5->score_eventrim));
}
if (hit3->hittype == TERMINAL && non_double_terminal_p == true && finalp == false) {
/* Ignore */
- hit3->score_eventrim_start = hit3->score_eventrim_end = 0;
+ hit3->score_eventrim = 0;
} else if (hit3->hittype == GMAP) {
- hit3->score_eventrim_start = hit3->score_eventrim_end = 0; /* was hit3->penalties */
+ hit3->score_eventrim = 0; /* was hit3->penalties */
debug6(printf("score 3' GMAP:"));
#if 0
if (Stage3end_bad_stretch_p(hit3,query3_compress_fwd,query3_compress_rev) == true) {
@@ -12958,76 +12508,56 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
}
#endif
- hit3->score_eventrim_start += Pair_nmismatches_region(&nindelbreaks,hit3->pairarray,hit3->npairs,
- trim_left_3,/*trim_end*/hit3->trim_right,hit3->querylength_adj);
- hit3->score_eventrim_end += Pair_nmismatches_region(&nindelbreaks,hit3->pairarray,hit3->npairs,
- /*trim_start*/hit3->trim_left,trim_right_3,hit3->querylength_adj);
- debug6(printf(" add nmismatches %d or %d.",
- Pair_nmismatches_region(&nindelbreaks,hit3->pairarray,hit3->npairs,
- trim_left_3,/*trim_end*/hit3->trim_right,hit3->querylength_adj),
- Pair_nmismatches_region(&nindelbreaks,hit3->pairarray,hit3->npairs,
- /*trim_start*/hit3->trim_left,trim_right_3,hit3->querylength_adj)));
- hit3->score_eventrim_start += indel_penalty_middle * nindelbreaks;
- hit3->score_eventrim_end += indel_penalty_middle * nindelbreaks;
- hit3->score_eventrim_start += hit3->start_amb_nmatches / ambig_end_interval;
- debug6(printf(" add amb start %d/%d.",hit3->start_amb_nmatches,ambig_end_interval));
- hit3->score_eventrim_end += hit3->end_amb_nmatches / ambig_end_interval;
- debug6(printf(" add amb end %d/%d.",hit3->end_amb_nmatches,ambig_end_interval));
- debug6(printf(" RESULT: %d or %d\n",hit3->score_eventrim_start,hit3->score_eventrim_end));
+ hit3->score_eventrim += Pair_nmismatches_region(&nindelbreaks,hit3->pairarray,hit3->npairs,
+ trim_left_3,trim_right_3,hit3->start_amb_nmatches,hit3->end_amb_nmatches,
+ hit3->querylength_adj);
+ debug6(printf(" add nmismatches %d.",Pair_nmismatches_region(&nindelbreaks,hit3->pairarray,hit3->npairs,
+ trim_left_3,trim_right_3,hit3->start_amb_nmatches,hit3->end_amb_nmatches,
+ hit3->querylength_adj)));
+#ifdef SCORE_INDELS
+ hit3->score_eventrim += indel_penalty_middle * nindelbreaks;
+ debug6(printf(" add indelbreaks %d.",indel_penalty_middle * nindelbreaks));
+#endif
+ if (hit3->start_amb_prob < 0.9) {
+ hit3->score_eventrim += hit3->start_amb_nmatches / ambig_end_interval;
+ debug6(printf(" add amb start %d/%d (prob %f).",hit3->start_amb_nmatches,ambig_end_interval,hit3->start_amb_prob));
+ }
+ if (hit3->end_amb_prob < 0.9) {
+ hit3->score_eventrim += hit3->end_amb_nmatches / ambig_end_interval;
+ debug6(printf(" add amb end %d/%d (prob %f).",hit3->end_amb_nmatches,ambig_end_interval,hit3->end_amb_prob));
+ }
+ debug6(printf(" RESULT: %d\n",hit3->score_eventrim));
} else {
debug6(printf("score 3' OTHER:"));
- hit3->score_eventrim_start = hit3->score_eventrim_end = hit3->penalties;
+ hit3->score_eventrim = hit3->penalties;
debug6(printf(" penalties %d.",hit3->penalties));
- hit3->score_eventrim_start += Substring_count_mismatches_region(hit3->substring0,trim_left_3,/*trim_end*/hit3->trim_right,
- query3_compress_fwd,query3_compress_rev);
- debug6(printf(" substring 0 %d.",Substring_count_mismatches_region(hit3->substring0,trim_left_3,/*trim_end*/hit3->trim_right
- query3_compress_fwd,query3_compress_rev)));
-
- hit3->score_eventrim_start += Substring_count_mismatches_region(hit3->substring1,trim_left_3,/*trim_end*/hit3->trim_right,
- query3_compress_fwd,query3_compress_rev);
- debug6(printf(" substring 1 %d.",Substring_count_mismatches_region(hit3->substring1,trim_left_3,/*trim_end*/hit3->trim_right,
- query3_compress_fwd,query3_compress_rev)));
-
- hit3->score_eventrim_start += Substring_count_mismatches_region(hit3->substring2,trim_left_3,/*trim_end*/hit3->trim_right,
- query3_compress_fwd,query3_compress_rev);
- debug6(printf(" substring 2 %d.",Substring_count_mismatches_region(hit3->substring2,trim_left_3,/*trim_end*/hit3->trim_right,
+ hit3->score_eventrim += Substring_count_mismatches_region(hit3->substring0,trim_left_3,trim_right_3,
+ query3_compress_fwd,query3_compress_rev);
+ debug6(printf(" substring 0 %d.",Substring_count_mismatches_region(hit3->substring0,trim_left_3,trim_right_3,
query3_compress_fwd,query3_compress_rev)));
-
- hit3->score_eventrim_end += Substring_count_mismatches_region(hit3->substring0,/*trim_start*/hit3->trim_left,trim_right_3,
- query3_compress_fwd,query3_compress_rev);
- debug6(printf(" substring 0 %d.",Substring_count_mismatches_region(hit3->substring0,/*trim_start*/hit3->trim_left,trim_right_3,
+ hit3->score_eventrim += Substring_count_mismatches_region(hit3->substring1,trim_left_3,trim_right_3,
+ query3_compress_fwd,query3_compress_rev);
+ debug6(printf(" substring 1 %d.",Substring_count_mismatches_region(hit3->substring1,trim_left_3,trim_right_3,
query3_compress_fwd,query3_compress_rev)));
- hit3->score_eventrim_end += Substring_count_mismatches_region(hit3->substring1,/*trim_start*/hit3->trim_left,trim_right_3,
- query3_compress_fwd,query3_compress_rev);
- debug6(printf(" substring 1 %d.",Substring_count_mismatches_region(hit3->substring1,/*trim_start*/hit3->trim_left,trim_right_3,
- query3_compress_fwd,query3_compress_rev)));
-
- hit3->score_eventrim_end += Substring_count_mismatches_region(hit3->substring2,/*trim_start*/hit3->trim_left,trim_right_3,
- query3_compress_fwd,query3_compress_rev);
- debug6(printf(" substring 2 %d.",Substring_count_mismatches_region(hit3->substring2,/*trim_start*/hit3->trim_left,trim_right_3,
+ hit3->score_eventrim += Substring_count_mismatches_region(hit3->substring2,trim_left_3,trim_right_3,
+ query3_compress_fwd,query3_compress_rev);
+ debug6(printf(" substring 2 %d.",Substring_count_mismatches_region(hit3->substring2,trim_left_3,trim_right_3,
query3_compress_fwd,query3_compress_rev)));
+#ifdef SCORE_INDELS
+ /* Needs to match GMAP scoring */
if (hit3->hittype == INSERTION || hit3->hittype == DELETION) {
- hit3->score_eventrim_start += indel_penalty_middle;
- hit3->score_eventrim_end += indel_penalty_middle;
+ hit3->score_eventrim += indel_penalty_middle;
debug6(printf(" add indel %d.",indel_penalty_middle));
}
- debug6(printf(" RESULT: %d or %d\n",hit3->score_eventrim_start,hit3->score_eventrim_end));
+#endif
+ debug6(printf(" RESULT: %d\n",hit3->score_eventrim));
}
- if (hit5->score_eventrim_start < hit5->score_eventrim_end) {
- hitpair->score_eventrim = hit5->score_eventrim_start;
- } else {
- hitpair->score_eventrim = hit5->score_eventrim_end;
- }
- if (hit3->score_eventrim_start < hit3->score_eventrim_end) {
- hitpair->score_eventrim += hit3->score_eventrim_start;
- } else {
- hitpair->score_eventrim += hit3->score_eventrim_end;
- }
+ hitpair->score_eventrim = hit5->score_eventrim + hit3->score_eventrim;
if (hitpair->score_eventrim < minscore) {
minscore = hitpair->score_eventrim;
}
@@ -13036,85 +12566,61 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
for (p = hitpairlist; p != NULL; p = p->rest) {
hitpair = (Stage3pair_T) p->first;
- debug6(printf("%lu..%lu %u..%u|%u..%u types %s and %s, score_eventrim %d|%d+%d|%d, pairlength %d, outerlength %u\n",
- hitpair->low,hitpair->high,
+ debug6(printf("%u..%u|%u..%u types %s and %s, score_eventrim %d+%d, pairlength %d, outerlength %u\n",
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
hittype_string(hitpair->hit5->hittype),hittype_string(hitpair->hit3->hittype),
- hitpair->hit5->score_eventrim_start,hitpair->hit5->score_eventrim_end,
- hitpair->hit3->score_eventrim_start,hitpair->hit3->score_eventrim_end,
+ hitpair->hit5->score_eventrim,hitpair->hit3->score_eventrim,
hitpair->insertlength,hitpair->outerlength));
if (hitpair->hit5->hittype == TERMINAL && non_terminal_5p == true) {
/* Don't use to determine minscore5 */
- } else {
- if (hitpair->hit5->score_eventrim_start < minscore5_start) {
- minscore5_start = hitpair->hit5->score_eventrim_start;
- }
- if (hitpair->hit5->score_eventrim_end < minscore5_end) {
- minscore5_end = hitpair->hit5->score_eventrim_end;
- }
+ } else if (hitpair->hit5->score_eventrim < minscore5) {
+ minscore5 = hitpair->hit5->score_eventrim;
}
if (hitpair->hit3->hittype == TERMINAL && non_terminal_3p == true) {
/* Don't use to determine minscore3 */
- } else {
- if (hitpair->hit3->score_eventrim_start < minscore3_start) {
- minscore3_start = hitpair->hit3->score_eventrim_start;
- }
- if (hitpair->hit3->score_eventrim_end < minscore3_end) {
- minscore3_end = hitpair->hit3->score_eventrim_end;
- }
+ } else if (hitpair->hit3->score_eventrim < minscore3) {
+ minscore3 = hitpair->hit3->score_eventrim;
}
}
- debug6(printf("Stage3pair_optimal_score over %d pairs: minscore = %d|%d and %d|%d + subopt:%d\n",
- n,minscore5_start,minscore5_end,minscore3_start,minscore3_end,suboptimal_mismatches));
+ debug6(printf("Stage3pair_optimal_score over %d pairs: minscore = %d and %d + subopt:%d\n",
+ n,minscore5,minscore3,suboptimal_mismatches));
if (non_double_terminal_p == true && finalp == false) {
/* finalp == false. Add suboptimal_mismatches to each end. */
- minscore5_start += suboptimal_mismatches;
- minscore5_end += suboptimal_mismatches;
- minscore3_start += suboptimal_mismatches;
- minscore3_end += suboptimal_mismatches;
- cutoff_level_5_start = minscore5_start;
- cutoff_level_5_end = minscore5_end;
- cutoff_level_3_start = minscore3_start;
- cutoff_level_3_end = minscore3_end;
+ minscore5 += suboptimal_mismatches;
+ minscore3 += suboptimal_mismatches;
+ cutoff_level_5 = minscore5;
+ cutoff_level_3 = minscore3;
for (p = hitpairlist; p != NULL; p = p->rest) {
hitpair = (Stage3pair_T) p->first;
if (hitpair->hit5->hittype == TERMINAL || hitpair->hit3->hittype == TERMINAL) {
- debug6(printf("Prefinal: Keeping a hit pair of type %s-%s with score_eventrim %d|%d and %d|%d, because finalp is false\n",
+ debug6(printf("Prefinal: Keeping a hit pair of type %s-%s with score_eventrim %d and %d, because finalp is false\n",
hittype_string(hitpair->hit5->hittype),hittype_string(hitpair->hit3->hittype),
- hitpair->hit5->score_eventrim_start,hitpair->hit5->score_eventrim_end,
- hitpair->hit3->score_eventrim_start,hitpair->hit3->score_eventrim_end));
+ hitpair->hit5->score_eventrim,hitpair->hit3->score_eventrim));
optimal = List_push(optimal,hitpair);
} else if (keep_gmap_p == true && (hitpair->hit5->hittype == GMAP || hitpair->hit3->hittype == GMAP)) {
/* GMAP hits already found to be better than their corresponding terminals */
- debug6(printf("Prefinal: Keeping a hit pair of type %s-%s with score_eventrim %d|%d and %d|%d, because keep_gmap_p is true\n",
+ debug6(printf("Prefinal: Keeping a hit pair of type %s-%s with score_eventrim %d and %d, because keep_gmap_p is true\n",
hittype_string(hitpair->hit5->hittype),hittype_string(hitpair->hit3->hittype),
- hitpair->hit5->score_eventrim_start,hitpair->hit5->score_eventrim_end,
- hitpair->hit3->score_eventrim_start,hitpair->hit3->score_eventrim_end));
+ hitpair->hit5->score_eventrim,hitpair->hit3->score_eventrim));
optimal = List_push(optimal,hitpair);
- } else if (hitpair->hit5->score_eventrim_start > cutoff_level_5_start && hitpair->hit5->score_eventrim_end > cutoff_level_5_end &&
- hitpair->hit3->score_eventrim_start > cutoff_level_3_start && hitpair->hit3->score_eventrim_end > cutoff_level_3_end) {
- debug6(printf("Prefinal: Eliminating a hit pair at %lu..%lu %u..%u|%u..%u with score_eventrim_5 %d|%d > cutoff_level_5 %d|%d and score_eventrim_3 %d|%d > cutoff_level_3 %d|%d (finalp %d)\n",
- hitpair->low,hitpair->high,
+ } else if (hitpair->hit5->score_eventrim > cutoff_level_5 && hitpair->hit3->score_eventrim > cutoff_level_3) {
+ debug6(printf("Prefinal: Eliminating a hit pair at %u..%u|%u..%u with score_eventrim_5 %d > cutoff_level_5 %d and score_eventrim_3 %d > cutoff_level_3 %d (finalp %d)\n",
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hitpair->hit5->score_eventrim_start,hitpair->hit5->score_eventrim_end,cutoff_level_5_start,cutoff_level_5_end,
- hitpair->hit3->score_eventrim_start,hitpair->hit3->score_eventrim_end,cutoff_level_3_start,cutoff_level_3_end,
- finalp));
+ hitpair->hit5->score_eventrim,cutoff_level_5,hitpair->hit3->score_eventrim,cutoff_level_3,finalp));
*eliminatedp = true;
Stage3pair_free(&hitpair);
} else {
- debug6(printf("Prefinal: Keeping a hit pair with score_eventrim %d|%d and %d|%d (cutoff_level %d|%d and %d|%d)\n",
- hitpair->hit5->score_eventrim_start,hitpair->hit5->score_eventrim_end,
- hitpair->hit3->score_eventrim_start,hitpair->hit3->score_eventrim_end,
- cutoff_level_5_start,cutoff_level_5_end,cutoff_level_3_start,cutoff_level_3_end));
+ debug6(printf("Prefinal: Keeping a hit pair with score_eventrim %d and %d (cutoff_level %d and %d)\n",
+ hitpair->hit5->score_eventrim,hitpair->hit3->score_eventrim,cutoff_level_5,cutoff_level_3));
optimal = List_push(optimal,hitpair);
}
}
@@ -13138,13 +12644,11 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
for (p = hitpairlist; p != NULL; p = p->rest) {
hitpair = (Stage3pair_T) p->first;
- debug6(printf("Final: %lu..%lu %u..%u|%u..%u types %s and %s, score_eventrim %d (%d|%d+%d|%d), pairlength %d, outerlength %u\n",
- hitpair->low,hitpair->high,
+ debug6(printf("Final: %u..%u|%u..%u types %s and %s, score_eventrim %d (%d+%d), pairlength %d, outerlength %u\n",
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
- hittype_string(hitpair->hit5->hittype),hittype_string(hitpair->hit3->hittype),hitpair->score_eventrim,
- hitpair->hit5->score_eventrim_start,hitpair->hit5->score_eventrim_end,
- hitpair->hit3->score_eventrim_start,hitpair->hit3->score_eventrim_end,
+ hittype_string(hitpair->hit5->hittype),hittype_string(hitpair->hit3->hittype),
+ hitpair->score_eventrim,hitpair->hit5->score_eventrim,hitpair->hit3->score_eventrim,
hitpair->insertlength,hitpair->outerlength));
#if 0
@@ -13178,8 +12682,7 @@ Stage3pair_optimal_score_aux (bool *eliminatedp, List_T hitpairlist, int cutoff_
optimal = List_push(optimal,hitpair);
} else if (score > cutoff_level) {
- debug6(printf("Final: Eliminating a hit pair at %lu..%lu %u..%u|%u..%u with score %d > cutoff_level %d (finalp %d)\n",
- hitpair->low,hitpair->high,
+ debug6(printf("Final: Eliminating a hit pair at %u..%u|%u..%u with score %d > cutoff_level %d (finalp %d)\n",
hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
score,cutoff_level,finalp));
@@ -13303,8 +12806,8 @@ Stage3end_linearize_5 (List_T hitlist) {
for (p = hitlist; p != NULL; p = List_next(p)) {
hit = (T) List_head(p);
debug12(chrlength = hit->chrlength);
- debug12(printf("Looking at 5' end %lu..%lu against chrlength %u\n",
- hit->genomicstart,hit->genomicend,chrlength));
+ debug12(printf("Looking at 5' end %u..%u against chrlength %u\n",
+ hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,chrlength));
if (hit->alias == 0) {
/* Skip */
@@ -13336,8 +12839,8 @@ Stage3end_linearize_3 (List_T hitlist) {
for (p = hitlist; p != NULL; p = List_next(p)) {
hit = (T) List_head(p);
debug12(chrlength = hit->chrlength);
- debug12(printf("Looking at 3' end %lu..%lu against chrlength %u\n",
- hit->genomicstart,hit->genomicend,chrlength));
+ debug12(printf("Looking at 3' end %u..%u against chrlength %u\n",
+ hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,chrlength));
if (hit->alias == 0) {
/* Skip */
@@ -13466,14 +12969,14 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (*abort_pairing_p == false && i < nhits5) {
hit5 = hits5[i];
insert_start = hit5->genomicend - querylength5;
- debug5(printf("plus/plus: i=%d/%d %lu..%lu %s %s %p\n",
- i,nhits5,hit5->genomicstart,hit5->genomicend,
+ debug5(printf("plus/plus: i=%d/%d %u..%u %s %s %p\n",
+ i,nhits5,hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,
print_sense(hit5->sensedir),hittype_string(hit5->hittype),hit5));
while (j >= 0 &&
hits3[j]->genomicstart + querylength3 /* for scramble: */ + pairmax > insert_start) {
- debug5(printf(" backup: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits3,hits3[j]->genomicstart,hits3[j]->genomicend,
+ debug5(printf(" backup: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits3,hits3[j]->genomicstart - hits3[j]->chroffset,hits3[j]->genomicend - hits3[j]->chroffset,
print_sense(hits3[j]->sensedir),hittype_string(hits3[j]->hittype),hits3[j]));
j--;
}
@@ -13481,15 +12984,15 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (j < nhits3 &&
hits3[j]->genomicstart + querylength3 /* for scramble: */ + pairmax <= insert_start) {
- debug5(printf(" advance: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits3,hits3[j]->genomicstart,hits3[j]->genomicend,
+ debug5(printf(" advance: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits3,hits3[j]->genomicstart - hits3[j]->chroffset,hits3[j]->genomicend - hits3[j]->chroffset,
print_sense(hits3[j]->sensedir),hittype_string(hits3[j]->hittype),hits3[j]));
j++;
}
while (j < nhits3 && hits3[j]->genomicstart + querylength3 <= pairmax + insert_start) {
- debug5(printf(" overlap: j=%d/%d %lu..%lu %s %s %p",
- j,nhits3,hits3[j]->genomicstart,hits3[j]->genomicend,
+ debug5(printf(" overlap: j=%d/%d %u..%u %s %s %p",
+ j,nhits3,hits3[j]->genomicstart - hits3[j]->chroffset,hits3[j]->genomicend - hits3[j]->chroffset,
print_sense(hits3[j]->sensedir),hittype_string(hits3[j]->hittype),hits3[j]));
hit3 = hits3[j];
@@ -13574,14 +13077,14 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (*abort_pairing_p == false && i < nhits3) {
hit3 = hits3[i];
insert_start = hit3->genomicstart - querylength3;
- debug5(printf("minus/minus: i=%d/%d %lu..%lu %s %s %p\n",
- i,nhits3,hit3->genomicstart,hit3->genomicend,
+ debug5(printf("minus/minus: i=%d/%d %u..%u %s %s %p\n",
+ i,nhits3,hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset,
print_sense(hit3->sensedir),hittype_string(hit3->hittype),hit3));
while (j >= 0 &&
hits5[j]->genomicend + querylength5 /* for scramble: */ + pairmax > insert_start) {
- debug5(printf(" backup: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits5,hits5[j]->genomicstart,hits5[j]->genomicend,
+ debug5(printf(" backup: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits5,hits5[j]->genomicstart - hits5[j]->chroffset,hits5[j]->genomicend - hits5[j]->chroffset,
print_sense(hits5[j]->sensedir),hittype_string(hits5[j]->hittype),hits5[j]));
j--;
}
@@ -13589,15 +13092,15 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (j < nhits5 &&
hits5[j]->genomicend + querylength5 /* for scramble: */ + pairmax <= insert_start) {
- debug5(printf(" advance: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits5,hits5[j]->genomicstart,hits5[j]->genomicend,
+ debug5(printf(" advance: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits5,hits5[j]->genomicstart - hits5[j]->chroffset,hits5[j]->genomicend - hits5[j]->chroffset,
print_sense(hits5[j]->sensedir),hittype_string(hits5[j]->hittype),hits5[j]));
j++;
}
while (j < nhits5 && hits5[j]->genomicend + querylength5 <= pairmax + insert_start) {
- debug5(printf(" overlap: j=%d/%d %lu..%lu %s %s %p",
- j,nhits5,hits5[j]->genomicstart,hits5[j]->genomicend,
+ debug5(printf(" overlap: j=%d/%d %u..%u %s %s %p",
+ j,nhits5,hits5[j]->genomicstart - hits5[j]->chroffset,hits5[j]->genomicend - hits5[j]->chroffset,
print_sense(hits5[j]->sensedir),hittype_string(hits5[j]->hittype),hits5[j]));
hit5 = hits5[j];
@@ -13683,14 +13186,14 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (*abort_pairing_p == false && i < nhits5) {
hit5 = hits5[i];
insert_start = hit5->genomicend - querylength5;
- debug5(printf("plus/minus: i=%d/%d %lu..%lu %s %s %p\n",
- i,nhits5,hit5->genomicstart,hit5->genomicend,
+ debug5(printf("plus/minus: i=%d/%d %u..%u %s %s %p\n",
+ i,nhits5,hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,
print_sense(hit5->sensedir),hittype_string(hit5->hittype),hit5));
while (j >= 0 &&
hits3[j]->genomicstart + querylength3 /* for scramble: */ + pairmax > insert_start) {
- debug5(printf(" backup: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits3,hits3[j]->genomicstart,hits3[j]->genomicend,
+ debug5(printf(" backup: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits3,hits3[j]->genomicstart - hits3[j]->chroffset,hits3[j]->genomicend - hits3[j]->chroffset,
print_sense(hits3[j]->sensedir),hittype_string(hits3[j]->hittype),hits3[j]));
j--;
}
@@ -13698,15 +13201,15 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (j < nhits3 &&
hits3[j]->genomicstart + querylength3 /* for scramble: */ + pairmax <= insert_start) {
- debug5(printf(" advance: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits3,hits3[j]->genomicstart,hits3[j]->genomicend,
+ debug5(printf(" advance: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits3,hits3[j]->genomicstart - hits3[j]->chroffset,hits3[j]->genomicend - hits3[j]->chroffset,
print_sense(hits3[j]->sensedir),hittype_string(hits3[j]->hittype),hits3[j]));
j++;
}
while (j < nhits3 && hits3[j]->genomicstart + querylength3 <= pairmax + insert_start) {
- debug5(printf(" overlap: j=%d/%d %lu..%lu %s %s %p",
- j,nhits3,hits3[j]->genomicstart,hits3[j]->genomicend,
+ debug5(printf(" overlap: j=%d/%d %u..%u %s %s %p",
+ j,nhits3,hits3[j]->genomicstart - hits3[j]->chroffset,hits3[j]->genomicend - hits3[j]->chroffset,
print_sense(hits3[j]->sensedir),hittype_string(hits3[j]->hittype),hits3[j]));
hit3 = hits3[j];
@@ -13768,14 +13271,14 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (*abort_pairing_p == false && i < nhits3) {
hit3 = hits3[i];
insert_start = hit3->genomicstart - querylength3;
- debug5(printf("minus/plus: i=%d/%d %lu..%lu %s %s %p\n",
- i,nhits3,hit3->genomicstart,hit3->genomicend,
+ debug5(printf("minus/plus: i=%d/%d %u..%u %s %s %p\n",
+ i,nhits3,hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset,
print_sense(hit3->sensedir),hittype_string(hit3->hittype),hit3));
while (j >= 0 &&
hits5[j]->genomicend + querylength5 /* for scramble: */ + pairmax > insert_start) {
- debug5(printf(" backup: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits5,hits5[j]->genomicstart,hits5[j]->genomicend,
+ debug5(printf(" backup: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits5,hits5[j]->genomicstart - hits5[j]->chroffset,hits5[j]->genomicend - hits5[j]->chroffset,
print_sense(hits5[j]->sensedir),hittype_string(hits5[j]->hittype),hits5[j]));
j--;
}
@@ -13783,15 +13286,15 @@ pair_up_concordant_aux (bool *abort_pairing_p, int *found_score, int *nconcordan
while (j < nhits5 &&
hits5[j]->genomicend + querylength5 /* for scramble: */ + pairmax <= insert_start) {
- debug5(printf(" advance: j=%d/%d %lu..%lu %s %s %p\n",
- j,nhits5,hits5[j]->genomicstart,hits5[j]->genomicend,
+ debug5(printf(" advance: j=%d/%d %u..%u %s %s %p\n",
+ j,nhits5,hits5[j]->genomicstart - hits5[j]->chroffset,hits5[j]->genomicend - hits5[j]->chroffset,
print_sense(hits5[j]->sensedir),hittype_string(hits5[j]->hittype),hits5[j]));
j++;
}
while (j < nhits5 && hits5[j]->genomicend + querylength5 <= pairmax + insert_start) {
- debug5(printf(" overlap: j=%d/%d %lu..%lu %s %s %p",
- j,nhits5,hits5[j]->genomicstart,hits5[j]->genomicend,
+ debug5(printf(" overlap: j=%d/%d %u..%u %s %s %p",
+ j,nhits5,hits5[j]->genomicstart - hits5[j]->chroffset,hits5[j]->genomicend - hits5[j]->chroffset,
print_sense(hits5[j]->sensedir),hittype_string(hits5[j]->hittype),hits5[j]));
hit5 = hits5[j];
diff --git a/src/stage3hr.h b/src/stage3hr.h
index d492305..0b93be3 100644
--- a/src/stage3hr.h
+++ b/src/stage3hr.h
@@ -1,4 +1,4 @@
-/* $Id: stage3hr.h 140368 2014-07-02 00:56:33Z twu $ */
+/* $Id: stage3hr.h 141810 2014-07-17 02:53:02Z twu $ */
#ifndef STAGE3HR_INCLUDED
#define STAGE3HR_INCLUDED
@@ -182,9 +182,9 @@ Stage3end_npairs (T this);
extern Chrpos_T
Stage3end_distance (T this);
extern Chrpos_T
-Stage3end_shortexon_acceptor_distance (T this);
+Stage3end_shortexonA_distance (T this);
extern Chrpos_T
-Stage3end_shortexon_donor_distance (T this);
+Stage3end_shortexonD_distance (T this);
extern double
Stage3end_chimera_prob (T this);
extern double
@@ -348,8 +348,8 @@ extern T
Stage3end_new_gmap (int nmismatches_whole, int nmatches_posttrim, int max_match_length,
int ambig_end_length_5, int ambig_end_length_3,
Splicetype_T ambig_splicetype_5, Splicetype_T ambig_splicetype_3,
- double min_splice_prob, struct Pair_T *pairarray, int npairs,
- int nsegments, int nintrons, int nindelbreaks,
+ double ambig_prob_5, double ambig_prob_3, double min_splice_prob,
+ struct Pair_T *pairarray, int npairs, int nsegments, int nintrons, int nindelbreaks,
Univcoord_T left, int genomiclength, bool plusp, int genestrand, bool first_read_p, int querylength,
Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
int cdna_direction, int sensedir);
diff --git a/util/gff3_genes.pl.in b/util/gff3_genes.pl.in
index 3ec3a01..fa7bef8 100644
--- a/util/gff3_genes.pl.in
+++ b/util/gff3_genes.pl.in
@@ -6,6 +6,7 @@ use warnings;
$gene_name = "";
$last_transcript_id = "";
@exons = ();
+ at CDS_regions = ();
while (defined($line = <>)) {
if ($line =~ /^\#/) {
# Skip
@@ -30,11 +31,26 @@ while (defined($line = <>)) {
}
print "$gene_name\n";
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
+
+ } elsif ($#CDS_regions >= 0) {
+ # Handle last mRNA of previous gene
+ if ($strand eq "+") {
+ ($start,$end) = get_gene_bounds_plus(\@CDS_regions);
+ printf ">$last_transcript_id $chr:%u..%u\n",$start,$end;
+ } elsif ($strand eq "-") {
+ ($start,$end) = get_gene_bounds_minus(\@CDS_regions);
+ printf ">$last_transcript_id $chr:%u..%u\n",$end,$start;
+ } else {
+ die "strand $strand";
+ }
+ print "$gene_name\n";
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand);
}
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
@exons = ();
+ @CDS_regions = ();
} elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
if ($#exons >= 0) {
@@ -49,6 +65,19 @@ while (defined($line = <>)) {
}
print "$gene_name\n";
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
+
+ } elsif ($#CDS_regions >= 0) {
+ if ($strand eq "+") {
+ ($start,$end) = get_gene_bounds_plus(\@CDS_regions);
+ printf ">$last_transcript_id $chr:%u..%u\n",$start,$end;
+ } elsif ($strand eq "-") {
+ ($start,$end) = get_gene_bounds_minus(\@CDS_regions);
+ printf ">$last_transcript_id $chr:%u..%u\n",$end,$start;
+ } else {
+ die "strand $strand";
+ }
+ print "$gene_name\n";
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand);
}
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
@@ -63,9 +92,12 @@ while (defined($line = <>)) {
}
@exons = ();
+ @CDS_regions = ();
- } elsif ($fields[2] eq "exon" || $fields[2] eq "CDS") {
+ } elsif ($fields[2] eq "exon") {
push @exons,"$fields[3] $fields[4]";
+ } elsif ($fields[2] eq "CDS") {
+ push @CDS_regions,"$fields[3] $fields[4]";
}
}
}
@@ -82,6 +114,19 @@ if ($#exons >= 0) {
}
print "$gene_name\n";
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
+
+} elsif ($#CDS_regions >= 0) {
+ if ($strand eq "+") {
+ ($start,$end) = get_gene_bounds_plus(\@CDS_regions);
+ printf ">$last_transcript_id $chr:%u..%u\n",$start,$end;
+ } elsif ($strand eq "-") {
+ ($start,$end) = get_gene_bounds_minus(\@CDS_regions);
+ printf ">$last_transcript_id $chr:%u..%u\n",$end,$start;
+ } else {
+ die "strand $strand";
+ }
+ print "$gene_name\n";
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand);
}
diff --git a/util/gff3_introns.pl.in b/util/gff3_introns.pl.in
index 2222333..1e83593 100755
--- a/util/gff3_introns.pl.in
+++ b/util/gff3_introns.pl.in
@@ -29,6 +29,7 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
@exons = ();
+ @CDS_regions = ();
while (defined($line = <>)) {
if ($line =~ /^\#/) {
# Skip
@@ -45,16 +46,24 @@ if (defined($opt_d)) {
# Handle last mRNA of previous gene
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ # Handle last mRNA of previous gene
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
+ #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
@exons = ();
+ @CDS_regions = ();
} elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
+ #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
@@ -69,9 +78,12 @@ if (defined($opt_d)) {
}
@exons = ();
+ @CDS_regions = ();
- } elsif ($fields[2] eq "exon" || $fields[2] eq "CDS") {
+ } elsif ($fields[2] eq "exon") {
push @exons,"$fields[3] $fields[4]";
+ } elsif ($fields[2] eq "CDS") {
+ push @CDS_regions,"$fields[3] $fields[4]";
}
}
}
@@ -79,6 +91,8 @@ if (defined($opt_d)) {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
}
close($FP);
@@ -105,6 +119,7 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
@exons = ();
+ at CDS_regions = ();
while (defined($line = get_line())) {
if ($line =~ /^\#/) {
# Skip
@@ -119,15 +134,21 @@ while (defined($line = get_line())) {
if ($#exons > 0) {
# Handle last mRNA of previous gene
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ # Handle last mRNA of previous gene
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
@exons = ();
+ @CDS_regions = ();
} elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
if ($#exons > 0) {
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
@@ -142,15 +163,20 @@ while (defined($line = get_line())) {
}
@exons = ();
+ @CDS_regions = ();
- } elsif ($fields[2] eq "exon" || $fields[2] eq "CDS") {
+ } elsif ($fields[2] eq "exon") {
push @exons,"$fields[3] $fields[4]";
+ } elsif ($fields[2] eq "CDS") {
+ push @CDS_regions,"$fields[3] $fields[4]";
}
}
}
if ($#exons > 0) {
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+} elsif ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
if (defined($opt_d)) {
diff --git a/util/gff3_splicesites.pl.in b/util/gff3_splicesites.pl.in
index 49ab3e1..e626b84 100755
--- a/util/gff3_splicesites.pl.in
+++ b/util/gff3_splicesites.pl.in
@@ -29,6 +29,7 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
@exons = ();
+ @CDS_regions = ();
while (defined($line = <>)) {
if ($line =~ /^\#/) {
# Skip
@@ -45,16 +46,24 @@ if (defined($opt_d)) {
# Handle last mRNA of previous gene
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ # Handle last mRNA of previous gene
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
+ #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
@exons = ();
+ @CDS_regions = ();
} elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
+ #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
@@ -69,9 +78,14 @@ if (defined($opt_d)) {
}
@exons = ();
+ @CDS_regions = ();
- } elsif ($fields[2] eq "exon" || $fields[2] eq "CDS") {
+ } elsif ($fields[2] eq "exon") {
push @exons,"$fields[3] $fields[4]";
+
+ } elsif ($fields[2] eq "CDS") {
+ push @CDS_regions,"$fields[3] $fields[4]";
+
}
}
}
@@ -79,6 +93,8 @@ if (defined($opt_d)) {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
}
close($FP);
@@ -105,6 +121,7 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
@exons = ();
+ at CDS_regions = ();
while (defined($line = get_line())) {
if ($line =~ /^\#/) {
# Skip
@@ -119,15 +136,21 @@ while (defined($line = get_line())) {
if ($#exons > 0) {
# Handle last mRNA of previous gene
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ # Handle last mRNA of previous gene
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
@exons = ();
+ @CDS_regions = ();
} elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
if ($#exons > 0) {
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ } elsif ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
@@ -142,16 +165,21 @@ while (defined($line = get_line())) {
}
@exons = ();
+ @CDS_regions = ();
- } elsif ($fields[2] eq "exon" || $fields[2] eq "CDS") {
+ } elsif ($fields[2] eq "exon") {
push @exons,"$fields[3] $fields[4]";
+ } elsif ($fields[2] eq "CDS") {
+ push @CDS_regions,"$fields[3] $fields[4]";
}
}
}
if ($#exons > 0) {
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
-}
+} elsif ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+}
if (defined($opt_d)) {
close($FP);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git
More information about the debian-med-commit
mailing list