[med-svn] [gmap] 01/05: New upstream version 2017-10-30
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Thu Nov 2 15:39:04 UTC 2017
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository gmap.
commit deec570b7f54b4a7ce5de5482a7d06276ea3026b
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Thu Nov 2 09:54:36 2017 +0100
New upstream version 2017-10-30
---
ChangeLog | 39 ++
VERSION | 2 +-
configure | 24 +-
src/cigar.c | 11 +-
src/cigar.h | 4 +-
src/get-genome.c | 95 ++--
src/gmap.c | 2 +-
src/gsnap.c | 4 +-
src/pairpool.c | 22 +-
src/sarray-search.c | 51 ++-
src/stage3.c | 1234 ++++++++++++++++++++++++++++++++++++++++-----------
src/stage3hr.c | 7 +-
src/substring.c | 62 +--
13 files changed, 1221 insertions(+), 336 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 58234e3..43227ec 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,42 @@
+2017-10-30 twu
+
+ * stage3.c: Merged revision 210882 from trunk to fix trim_end_indel
+ procedures to stop at a gap
+
+2017-10-29 twu
+
+ * VERSION, public-2017-09-05, src: Updated version number
+
+ * substring.c: Merged revision 210861 from trunk to fix a memory leak when
+ embellish_genomic is called a second time on a Stage3end_T object
+
+ * stage3hr.c: Merged revision 210860 from trunk to set hardclips to 0 for
+ creating CIGAR for a GMAP alignment
+
+ * stage3.c: Merged revision 210859 from trunk to disable forcep in
+ traverse_single_gap
+
+ * pairpool.c: Merged revision 210857 from trunk to fix Pairpool_join_end5
+ and Pairpool_join_end3. Not attaching the end if revision process still
+ results in a negative queryjump or genomejump.
+
+ * get-genome.c: Merged revision 210856 from trunk to look for map file in
+ genomesubdir as well as mapdir
+
+ * cigar.c, cigar.h, gsnap.c: Merged revision 210855 from trunk to call
+ Cigar_setup
+
+ * gmap.c: Merged revision 210647 from trunk to change option name from
+ --alt-initiation-codons to --alt-start-codons
+
+ * sarray-search.c: Merged part of revision 210755 from trunk. Ensuring that
+ we are using the smallest indel.
+
+2017-10-18 twu
+
+ * sarray-search.c: Merged revision 210665 from trunk. Fixed an issue where
+ querystart was -1, which can result when nmatches is 0
+
2017-10-14 twu
* VERSION, src: Updated version number
diff --git a/VERSION b/VERSION
index d549256..75cebdc 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2017-10-12
\ No newline at end of file
+2017-10-30
\ No newline at end of file
diff --git a/configure b/configure
index 7d3415b..ad2a907 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for gmap 2017-10-12.
+# Generated by GNU Autoconf 2.69 for gmap 2017-10-30.
#
# Report bugs to <Thomas Wu <twu at gene.com>>.
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='gmap'
PACKAGE_TARNAME='gmap'
-PACKAGE_VERSION='2017-10-12'
-PACKAGE_STRING='gmap 2017-10-12'
+PACKAGE_VERSION='2017-10-30'
+PACKAGE_STRING='gmap 2017-10-30'
PACKAGE_BUGREPORT='Thomas Wu <twu at gene.com>'
PACKAGE_URL=''
@@ -1369,7 +1369,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures gmap 2017-10-12 to adapt to many kinds of systems.
+\`configure' configures gmap 2017-10-30 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1440,7 +1440,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of gmap 2017-10-12:";;
+ short | recursive ) echo "Configuration of gmap 2017-10-30:";;
esac
cat <<\_ACEOF
@@ -1577,7 +1577,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-gmap configure 2017-10-12
+gmap configure 2017-10-30
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2183,7 +2183,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by gmap $as_me 2017-10-12, which was
+It was created by gmap $as_me 2017-10-30, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2533,8 +2533,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking package version" >&5
$as_echo_n "checking package version... " >&6; }
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 2017-10-12" >&5
-$as_echo "2017-10-12" >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 2017-10-30" >&5
+$as_echo "2017-10-30" >&6; }
### Read defaults
@@ -4401,7 +4401,7 @@ fi
# Define the identity of the package.
PACKAGE='gmap'
- VERSION='2017-10-12'
+ VERSION='2017-10-30'
cat >>confdefs.h <<_ACEOF
@@ -19978,7 +19978,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by gmap $as_me 2017-10-12, which was
+This file was extended by gmap $as_me 2017-10-30, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -20044,7 +20044,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-gmap config.status 2017-10-12
+gmap config.status 2017-10-30
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/src/cigar.c b/src/cigar.c
index 7911135..0321319 100644
--- a/src/cigar.c
+++ b/src/cigar.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: cigar.c 207314 2017-06-14 19:28:08Z twu $";
+static char rcsid[] = "$Id: cigar.c 210864 2017-10-29 20:07:41Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -37,6 +37,15 @@ static bool md_lowercase_variant_p;
+void
+Cigar_setup (bool hide_soft_clips_p_in, bool merge_samechr_p_in, bool md_lowercase_variant_p_in) {
+ hide_soft_clips_p = hide_soft_clips_p_in;
+ merge_samechr_p = merge_samechr_p_in;
+ md_lowercase_variant_p = md_lowercase_variant_p_in;
+ return;
+}
+
+
#if 0
static void
print_tokens_stdout (List_T tokens) {
diff --git a/src/cigar.h b/src/cigar.h
index 2015330..45175f3 100644
--- a/src/cigar.h
+++ b/src/cigar.h
@@ -1,4 +1,4 @@
-/* $Id: cigar.h 206761 2017-05-30 17:39:28Z twu $ */
+/* $Id: cigar.h 210864 2017-10-29 20:07:41Z twu $ */
#ifndef CIGAR_INCLUDED
#define CIGAR_INCLUDED
@@ -9,6 +9,8 @@
#include "stage3hr.h"
+extern void
+Cigar_setup (bool hide_soft_clips_p_in, bool merge_samechr_p_in, bool md_lowercase_variant_p_in);
extern int
Cigar_length (List_T tokens);
extern void
diff --git a/src/get-genome.c b/src/get-genome.c
index 1a751b2..28bf4c7 100644
--- a/src/get-genome.c
+++ b/src/get-genome.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: get-genome.c 209604 2017-09-01 21:51:43Z twu $";
+static char rcsid[] = "$Id: get-genome.c 210865 2017-10-29 20:08:42Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1382,16 +1382,32 @@ main (int argc, char *argv[]) {
} else if (dumpallp == true) {
if (map_iitfile != NULL) {
- mapdir = Datadir_find_mapdir(user_mapdir,genomesubdir,fileroot);
- iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
- strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
- sprintf(iitfile,"%s/%s.iit",mapdir,map_iitfile);
- if (Access_file_exists_p(iitfile) == false) {
- fprintf(stderr,"Map file %s.iit not found in %s. Available files:\n",map_iitfile,mapdir);
- Datadir_list_directory(stderr,mapdir);
- fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
- fprintf(stderr,"using the -M flag to gmap.\n");
- exit(9);
+ if (Access_file_exists_p(map_iitfile) == true) {
+ iitfile = (char *) CALLOC(strlen(map_iitfile)+1,sizeof(char));
+ strcpy(iitfile,map_iitfile);
+
+ } else {
+ mapdir = Datadir_find_mapdir(user_mapdir,genomesubdir,fileroot);
+ iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
+ strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
+ sprintf(iitfile,"%s/%s.iit",mapdir,map_iitfile);
+
+ if (Access_file_exists_p(iitfile) == false) {
+ /* Try main level of genome */
+ FREE(iitfile);
+ iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
+ strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
+ sprintf(iitfile,"%s/%s.iit",genomesubdir,map_iitfile);
+
+ if (Access_file_exists_p(iitfile) == false) {
+ fprintf(stderr,"Map file %s.iit not found in %s. Available files:\n",map_iitfile,mapdir);
+ Datadir_list_directory(stderr,mapdir);
+ fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
+ fprintf(stderr,"using the -M flag to gmap.\n");
+ exit(9);
+ }
+ }
+ FREE(mapdir);
}
FREE(mapdir);
@@ -1570,12 +1586,21 @@ main (int argc, char *argv[]) {
iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
sprintf(iitfile,"%s/%s.iit",mapdir,map_iitfile);
+
if (Access_file_exists_p(iitfile) == false) {
- fprintf(stderr,"Map file %s.iit not found in %s. Available files:\n",map_iitfile,mapdir);
- Datadir_list_directory(stderr,mapdir);
- fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
- fprintf(stderr,"using the -M flag to gmap.\n");
- exit(9);
+ /* Try main level of genome */
+ FREE(iitfile);
+ iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
+ strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
+ sprintf(iitfile,"%s/%s.iit",genomesubdir,map_iitfile);
+
+ if (Access_file_exists_p(iitfile) == false) {
+ fprintf(stderr,"Map file %s.iit not found in %s or %s. Available files:\n",map_iitfile,mapdir,genomesubdir);
+ Datadir_list_directory(stderr,mapdir);
+ fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
+ fprintf(stderr,"using the -M flag to gmap.\n");
+ exit(9);
+ }
}
/* Don't need to find competing labels, since they are given, so divread can be READ_NONE */
@@ -1676,12 +1701,21 @@ main (int argc, char *argv[]) {
iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
sprintf(iitfile,"%s/%s.iit",mapdir,map_iitfile);
+
if (Access_file_exists_p(iitfile) == false) {
- fprintf(stderr,"Map file %s.iit not found in %s. Available files:\n",map_iitfile,mapdir);
- Datadir_list_directory(stderr,mapdir);
- fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
- fprintf(stderr,"using the -M flag to gmap.\n");
- exit(9);
+ /* Try main level of genome */
+ FREE(iitfile);
+ iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
+ strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
+ sprintf(iitfile,"%s/%s.iit",genomesubdir,map_iitfile);
+
+ if (Access_file_exists_p(iitfile) == false) {
+ fprintf(stderr,"Map file %s.iit not found in %s or %s. Available files:\n",map_iitfile,mapdir,genomesubdir);
+ Datadir_list_directory(stderr,mapdir);
+ fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
+ fprintf(stderr,"using the -M flag to gmap.\n");
+ exit(9);
+ }
}
if (exonsp == true || sequencep == true || genome_sequence_p == true) {
@@ -1831,11 +1865,20 @@ main (int argc, char *argv[]) {
if ((map_iit = IIT_read(iitfile,/*name*/map_iitfile,/*readonlyp*/true,
/*divread*/READ_ALL,/*divstring*/NULL,/*add_iit_p*/false)) == NULL) {
- fprintf(stderr,"Map file %s.iit not found in %s. Available files:\n",map_iitfile,mapdir);
- Datadir_list_directory(stderr,mapdir);
- fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
- fprintf(stderr,"using the -M flag to gmap.\n");
- exit(9);
+ FREE(iitfile);
+ iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
+ strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
+ sprintf(iitfile,"%s/%s.iit",genomesubdir,map_iitfile);
+ if ((map_iit = IIT_read(iitfile,/*name*/map_iitfile,/*readonlyp*/true,
+ /*divread*/READ_ALL,/*divstring*/NULL,/*add_iit_p*/false)) == NULL) {
+
+ fprintf(stderr,"Map file %s.iit not found in %s or %s. Available files:\n",map_iitfile,mapdir,genomesubdir);
+ Datadir_list_directory(stderr,mapdir);
+ fprintf(stderr,"Either install file %s.iit or specify a full directory path\n",map_iitfile);
+ fprintf(stderr,"using the -M flag to gmap.\n");
+ exit(9);
+ }
+
#if 0
} else if (map_typestring != NULL) {
map_type = IIT_typeint(map_iit,map_typestring);
diff --git a/src/gmap.c b/src/gmap.c
index c9099ff..ddce77d 100644
--- a/src/gmap.c
+++ b/src/gmap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gmap.c 210510 2017-10-12 21:09:38Z twu $";
+static char rcsid[] = "$Id: gmap.c 210863 2017-10-29 20:04:26Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
diff --git a/src/gsnap.c b/src/gsnap.c
index 7ea468c..1e5cdf7 100644
--- a/src/gsnap.c
+++ b/src/gsnap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gsnap.c 210600 2017-10-14 00:32:02Z twu $";
+static char rcsid[] = "$Id: gsnap.c 210864 2017-10-29 20:07:41Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -107,6 +107,7 @@ static char rcsid[] = "$Id: gsnap.c 210600 2017-10-14 00:32:02Z twu $";
#include "iit-read-univ.h"
#include "datadir.h"
#include "samprint.h" /* For SAM_setup */
+#include "cigar.h" /* For Cigar_setup */
#include "filestring.h"
#include "output.h"
@@ -3514,6 +3515,7 @@ worker_setup (char *genomesubdir, char *fileroot) {
force_xs_direction_p,md_lowercase_variant_p,snps_iit,find_dna_chimeras_p,splicing_iit,
donor_typeint,acceptor_typeint,transcript_splicing_p,genestruct_iit,
chromosome_iit,genomecomp);
+ Cigar_setup(hide_soft_clips_p,merge_samechr_p,md_lowercase_variant_p);
Output_setup(chromosome_iit,nofailsp,failsonlyp,quiet_if_excessive_p,maxpaths_report,
failedinput_root,quality_shift,
output_sam_p,print_m8_p,invert_first_p,invert_second_p,
diff --git a/src/pairpool.c b/src/pairpool.c
index 8078f6a..1a1cac4 100644
--- a/src/pairpool.c
+++ b/src/pairpool.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: pairpool.c 203955 2017-03-03 00:28:52Z twu $";
+static char rcsid[] = "$Id: pairpool.c 210866 2017-10-29 20:10:02Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -853,13 +853,19 @@ Pairpool_join_end3 (List_T path_orig, List_T end3_pairs_orig, Pairpool_T pairpoo
}
path = Pairpool_push_existing(path,pairpool,leftpair);
- if (queryjump == 0 && genomejump == 0) {
+ if (queryjump < 0 || genomejump < 0) {
+ /* Discard pair and all remaining pairs from end */
+ debug15(printf("Not possible to make end work, since queryjump = %d and genomejump = %d\n",
+ queryjump,genomejump));
+ end3_pairs = (List_T) NULL;
+ } else if (queryjump == 0 && genomejump == 0) {
/* No gapholder needed */
+ path = Pairpool_push_existing(path,pairpool,pair);
} else {
path = Pairpool_push_gapholder(path,pairpool,queryjump,genomejump,
/*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
+ path = Pairpool_push_existing(path,pairpool,pair);
}
- path = Pairpool_push_existing(path,pairpool,pair);
}
while (end3_pairs != NULL) {
@@ -937,13 +943,19 @@ Pairpool_join_end5 (List_T pairs_orig, List_T end5_path_orig, Pairpool_T pairpoo
}
pairs = Pairpool_push_existing(pairs,pairpool,rightpair);
- if (queryjump == 0 && genomejump == 0) {
+ if (queryjump < 0 || genomejump < 0) {
+ /* Discard pair and all remaining pairs from end */
+ debug15(printf("Not possible to make end work, since queryjump = %d and genomejump = %d\n",
+ queryjump,genomejump));
+ end5_path = (List_T) NULL;
+ } else if (queryjump == 0 && genomejump == 0) {
/* No gapholder needed */
+ pairs = Pairpool_push_existing(pairs,pairpool,pair);
} else {
pairs = Pairpool_push_gapholder(pairs,pairpool,queryjump,genomejump,
/*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
+ pairs = Pairpool_push_existing(pairs,pairpool,pair);
}
- pairs = Pairpool_push_existing(pairs,pairpool,pair);
}
while (end5_path != NULL) {
diff --git a/src/sarray-search.c b/src/sarray-search.c
index 37b7bca..12aa860 100644
--- a/src/sarray-search.c
+++ b/src/sarray-search.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: sarray-search.c 209644 2017-09-05 17:41:06Z twu $";
+static char rcsid[] = "$Id: sarray-search.c 210862 2017-10-29 20:02:29Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -4009,8 +4009,11 @@ get_diagonals (Univdiag_T *middle_diagonal, List_T *best_right_diagonals, List_T
if (j >= best_i) {
/* Create a new elt with new positions */
- querystart = ((Elt_T) elt_tree[j]->first)->querystart_leftward;
/* queryend was computed above */
+ if ((querystart = ((Elt_T) elt_tree[j]->first)->querystart_leftward) < 0) {
+ /* Can happen for an Elt that has 0 nmatches */
+ querystart = 0;
+ }
Sarray_read(&initptr,&finalptr,&successp,&nmatches,&(queryptr[querystart]),
/*querylength*/(queryend + 1) - querystart,/*queryoffset*/querystart,
query_compress,sarray,plusp,genestrand,conversion);
@@ -4077,8 +4080,11 @@ get_diagonals (Univdiag_T *middle_diagonal, List_T *best_right_diagonals, List_T
if (j >= 0) {
/* Create a new elt with new positions */
- querystart = ((Elt_T) elt_tree[j]->first)->querystart_leftward;
/* queryend was computed above */
+ if ((querystart = ((Elt_T) elt_tree[j]->first)->querystart_leftward) < 0) {
+ /* Can happen for an Elt that has 0 nmatches */
+ querystart = 0;
+ }
Sarray_read(&initptr,&finalptr,&successp,&nmatches,&(queryptr[querystart]),
/*querylength*/(queryend + 1) - querystart,/*queryoffset*/querystart,
query_compress,sarray,plusp,genestrand,conversion);
@@ -4717,7 +4723,7 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
Diag_T sub_diagonal;
Oligoindex_T oligoindex;
#endif
- Univcoord_T left, prev_left;
+ Univcoord_T left, prev_left, mindiff, diff;
/* Chrpos_T splice_distance; */
int splice_pos;
@@ -4837,7 +4843,7 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
prev_left = common_diagonal->univdiagonal;
}
- /* Distinguish right paths by looking for indel (which wins) or splicing */
+ /* Distinguish right paths by looking for smallest indel (which wins) or splicing */
debug13(printf("Have %d right_paths. Distinguish by looking for indels\n",List_length(*right_paths)));
for (p = *right_paths; p != NULL; p = List_next(p)) {
ambig_path = (List_T) List_head(p);
@@ -4847,11 +4853,23 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
if (left < prev_left) {
/* Insertion */
debug13(printf("Found insertion\n"));
- right_indel_diagonal = diagonal;
+ if (right_indel_diagonal == NULL) {
+ right_indel_diagonal = diagonal;
+ mindiff = prev_left - left;
+ } else if ((diff = prev_left - left) < mindiff) {
+ right_indel_diagonal = diagonal;
+ mindiff = diff;
+ }
+
} else if (left - prev_left < MIN_INTRONLEN) {
/* Deletion */
debug13(printf("Found deletion\n"));
- right_indel_diagonal = diagonal;
+ if (right_indel_diagonal == NULL) {
+ right_indel_diagonal = diagonal;
+ mindiff = left - prev_left;
+ } else if ((diff = left - prev_left) < mindiff) {
+ mindiff = diff;
+ }
}
}
@@ -5211,7 +5229,7 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
left = common_diagonal->univdiagonal;
}
- /* Distinguish left paths by looking for indel (which wins) or splicing */
+ /* Distinguish left paths by looking for smallest indel (which wins) or splicing */
debug13(printf("Have %d left_paths. Distinguish by looking for indel\n",List_length(*left_paths)));
for (p = *left_paths; p != NULL; p = List_next(p)) {
ambig_path = (List_T) List_head(p);
@@ -5221,11 +5239,24 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
if (left < prev_left) {
/* Insertion */
debug13(printf("Found insertion\n"));
- left_indel_diagonal = prev_diagonal;
+ if (left_indel_diagonal == NULL) {
+ left_indel_diagonal = prev_diagonal;
+ mindiff = prev_left - left;
+ } else if ((diff = prev_left - left) < mindiff) {
+ left_indel_diagonal = prev_diagonal;
+ mindiff = diff;
+ }
+
} else if (left - prev_left < MIN_INTRONLEN) {
/* Deletion */
debug13(printf("Found deletion\n"));
- left_indel_diagonal = prev_diagonal;
+ if (left_indel_diagonal == NULL) {
+ left_indel_diagonal = prev_diagonal;
+ mindiff = left - prev_left;
+ } else if ((diff = left - prev_left) < mindiff) {
+ left_indel_diagonal = prev_diagonal;
+ mindiff = diff;
+ }
}
}
diff --git a/src/stage3.c b/src/stage3.c
index 247dc04..3aac474 100644
--- a/src/stage3.c
+++ b/src/stage3.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3.c 210509 2017-10-12 21:08:43Z twu $";
+static char rcsid[] = "$Id: stage3.c 210883 2017-10-30 21:42:56Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -3743,7 +3743,7 @@ trim_end5_indels (List_T pairs, int ambig_end_length,
}
exon = (List_T) NULL;
- while (pairs != NULL && pair->comp != INDEL_COMP) {
+ while (pairs != NULL && pair->gapp == false && pair->comp != INDEL_COMP) {
pairptr = pairs;
pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
@@ -3753,7 +3753,7 @@ trim_end5_indels (List_T pairs, int ambig_end_length,
#endif
}
- while (pairs != NULL && ((Pair_T) pairs->first)->comp == INDEL_COMP) {
+ while (pairs != NULL && pair->gapp == false && ((Pair_T) pairs->first)->comp == INDEL_COMP) {
pairptr = pairs;
pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
@@ -3833,6 +3833,10 @@ trim_end5_indels (List_T pairs, int ambig_end_length,
/* *trim5p = true; */
#endif
+ } else if (((Pair_T) pairs->first)->gapp == true) {
+ debug3(printf("Peeled all the way to a gap, so not handling with this procedure\n"));
+ path = exon;
+
} else {
querydp3_medialgap = ((Pair_T) pairs->first)->querypos - 1;
genomedp3_medialgap = ((Pair_T) pairs->first)->genomepos - 1;
@@ -4173,7 +4177,7 @@ trim_end3_indels (List_T path, int ambig_end_length,
}
exon = (List_T) NULL;
- while (path != NULL && pair->comp != INDEL_COMP) {
+ while (path != NULL && pair->gapp == false && pair->comp != INDEL_COMP) {
pairptr = path;
path = Pairpool_pop(path,&pair);
#ifdef WASTE
@@ -4183,7 +4187,7 @@ trim_end3_indels (List_T path, int ambig_end_length,
#endif
}
- while (path != NULL && ((Pair_T) path->first)->comp == INDEL_COMP) {
+ while (path != NULL && pair->gapp == false && ((Pair_T) path->first)->comp == INDEL_COMP) {
pairptr = path;
path = Pairpool_pop(path,&pair);
#ifdef WASTE
@@ -4263,6 +4267,10 @@ trim_end3_indels (List_T path, int ambig_end_length,
/* *trim3p = true; */
#endif
+ } else if (((Pair_T) path->first)->gapp == true) {
+ debug3(printf("Peeled all the way to a gap, so not handling with this procedure\n"));
+ pairs = exon;
+
} else {
querydp5_medialgap = ((Pair_T) path->first)->querypos + 1;
genomedp5_medialgap = ((Pair_T) path->first)->genomepos + 1;
@@ -4272,7 +4280,8 @@ trim_end3_indels (List_T path, int ambig_end_length,
continuous_gappairs_medialgap = Dynprog_end3_gap(&dynprogindex_minor,&finalscore,
&continuous_nmatches,&continuous_nmismatches,&continuous_nopens,&continuous_nindels,
dynprog,&(queryseq_ptr[querydp5_medialgap]),&(queryuc_ptr[querydp5_medialgap]),
- queryjump,genomejump,querydp5_medialgap,genomedp5_medialgap,
+ /*rlength*/queryjump,/*glength*/genomejump,
+ /*roffset*/querydp5_medialgap,/*goffset*/genomedp5_medialgap,
chroffset,chrhigh,watsonp,jump_late_p,pairpool,
extraband_end,defect_rate,/*endalign*/QUERYEND_NOGAPS,/*require_pos_score_p*/true);
debug(printf("CONTINUOUS AT 3 (trim_end3_indels)?\n"));
@@ -5492,6 +5501,9 @@ Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int goodness, in
fprintf(stderr,"Could not compute a valid cigar from the following alignment: %d + %d + %d != %d\n",
Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
Pair_dump_array_stderr(pairarray,npairs,/*zerobasedp*/true);
+#ifdef CHECK_ASSERTIONS
+ abort();
+#endif
Pair_tokens_free(&cigar_tokens);
return (T) NULL;
@@ -7105,142 +7117,499 @@ peel_leftward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pat
#if 0
+/* Not sure if we need this, or if it causes GMAP to fail on some alignments */
static List_T
-peel_rightward_old (bool *mismatchp, List_T *peeled_pairs, List_T pairs, int *querydp3, int *genomedp3,
-#ifdef WASTE
- Pairpool_T pairpool,
-#endif
- int maxpeelback, bool throughmismatchp, bool quit_on_gap_p,
- List_T *endgappairs, Pair_T *gappair, int *querydp3_medialgap, int *genomedp3_medialgap) {
- List_T peeled = NULL, rest = NULL, pairptr;
- Pair_T pair, nextpair, leftpair;
- int npeelback = 0, nconsecutive = 0, init_dynprogindex = DYNPROGINDEX_MINOR;
- bool stopp;
- int nmatches;
+peel_leftward_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_path, List_T path, int *querydp5, Chrpos_T *genomedp5,
+ int maxpeelback, bool stop_at_indels_p) {
+ List_T peeled = NULL;
+ Pair_T pair, rightpair;
+ int npeelback = 0, niter;
#if 0
- int incursion = 0;
+ int nincursion = 0;
#endif
+ int last_querypos;
+ Chrpos_T last_genomepos;
- *mismatchp = false;
- debug(printf("Peeling rightward:"));
- if (pairs == NULL) {
- debug(printf(" pairs is empty\n"));
- } else {
- pair = pairs->first;
- if (pair->gapp == true) {
- /* Throw away known gap */
- debug(printf(" Known_gap"));
- pairptr = pairs;
- pairs = Pairpool_pop(pairs,&pair);
-#ifdef WASTE
- peeled = Pairpool_push_existing(peeled,pairpool,pair);
-#else
- peeled = List_push_existing(peeled,pairptr);
-#endif
- }
- rest = pairs->rest;
- stopp = false;
- while (rest != NULL && stopp == false) {
- nextpair = rest->first;
- if (nextpair->gapp == true || nextpair->cdna == ' ' || nextpair->genome == ' ' || nextpair->protectedp == true) {
- stopp = true;
- } else {
- pairptr = pairs;
- pairs = Pairpool_pop(pairs,&pair);
-#ifdef WASTE
- peeled = Pairpool_push_existing(peeled,pairpool,pair);
-#else
- peeled = List_push_existing(peeled,pairptr);
-#endif
- debug(printf(" Peel [");
- Pair_dump_one(pair,/*zerobasedp*/true);
- printf("]"));
-
- if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
- *mismatchp = true;
- }
+ *n_peeled_indels = 0;
+ /* *protectedp = false; -- set by calling procedure */
- if (++npeelback >= maxpeelback) {
- stopp = true;
- }
+ debug(printf("Peeling leftward with maxpeelback %d and stop_at_indels_p %d:",maxpeelback,stop_at_indels_p));
- if (init_dynprogindex > 0 && pair->dynprogindex <= 0) {
- init_dynprogindex = pair->dynprogindex;
- }
+ /* Remove initial gaps */
+ while (path != NULL &&
+ ( ((Pair_T) path->first)->gapp == true ||
+ ((Pair_T) path->first)->comp == INDEL_COMP ||
+ ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
+ path = Pairpool_pop(path,&pair);
+ }
- rest = pairs->rest;
- }
- }
+ if (path == NULL) {
+ debug(printf(" path is empty\n"));
- /* Continue to peelback through little skips and mismatches */
- debug(printf("\n||"));
+ } else if (stop_at_indels_p == true) {
+ pair = path->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&path);
+ }
- stopp = false;
- while (rest != NULL && stopp == false) {
- nextpair = rest->first;
- if (nextpair->gapp == true) {
- /* Peel this one, but then stop at end of loop */
- } else if (nextpair->protectedp == true) {
- /* Stop because it's protected */
- stopp = true;
- } else if (nextpair->cdna != ' ' && nextpair->genome != ' ') {
- /* Stop because it looks okay */
- stopp = true;
- }
+ /* Peel initial indels anyway */
+ while (path != NULL && ( ((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP )) {
+ debug(printf(" Peel [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
+ printf("]"));
+ peeled = List_transfer_one(peeled,&path);
+ }
- pairptr = pairs;
- pairs = Pairpool_pop(pairs,&pair);
-#ifdef WASTE
- peeled = Pairpool_push_existing(peeled,pairpool,pair);
-#else
- peeled = List_push_existing(peeled,pairptr);
-#endif
- debug(printf(" Extrapeel [");
- Pair_dump_one(pair,/*zerobasedp*/true);
+ if (path != NULL) {
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ }
+ while (npeelback < maxpeelback && path != NULL &&
+ ((Pair_T) path->first)->gapp == false &&
+ ((Pair_T) path->first)->comp != INDEL_COMP &&
+ ((Pair_T) path->first)->comp != SHORTGAP_COMP &&
+ ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
+ ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
+ debug(printf(" Peel [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
printf("]"));
-
- if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
- *mismatchp = true;
+ if (((Pair_T) path->first)->protectedp == true) {
+ *protectedp = true;
}
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ peeled = List_transfer_one(peeled,&path);
+ npeelback++;
+ }
+
+ } else {
+ /* Don't stop at indels, but do stop at gaps */
+ pair = path->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&path);
+ }
-#if 0
- if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP || pair->comp == MISMATCH_COMP) {
- nconsecutive = 0;
- } else if (++nconsecutive >= SUFFCONSECUTIVE) {
- stopp = true;
- }
-#endif
-
-#if 0
- if (pair->dynprogindex != init_dynprogindex) {
- if (++nincursion >= MAXINCURSION) {
- stopp = true;
- }
+ niter = 0;
+ if (path != NULL) {
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ }
+ while (npeelback < maxpeelback && niter < MAXITER && path != NULL &&
+ ((Pair_T) path->first)->gapp == false &&
+ ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
+ ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
+ debug(printf(" Peel [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
+ printf("]"));
+ if (((Pair_T) path->first)->comp == MATCH_COMP || ((Pair_T) path->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) path->first)->comp == AMBIGUOUS_COMP) {
+ npeelback++;
+ } else if (((Pair_T) path->first)->comp == INDEL_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
+ } else if (((Pair_T) path->first)->comp == SHORTGAP_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
+ } else {
+ npeelback--;
}
-#endif
-
- if (pair->gapp == true) {
- stopp = true;
+ if (((Pair_T) path->first)->protectedp == true) {
+ *protectedp = true;
}
+ niter++;
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ peeled = List_transfer_one(peeled,&path);
+ }
+
+ if (path != NULL && ((Pair_T) path->first)->gapp == true) {
+ debug(printf(" Hit gap [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
+ printf("]"));
+ }
+ }
- rest = pairs->rest;
+ if (path != NULL &&
+ ( ((Pair_T) path->first)->gapp == true ||
+ ((Pair_T) path->first)->comp == INDEL_COMP ||
+ ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
+ /* Don't leave a gap or indel on the top of the path */
+ while (peeled != NULL &&
+ ( ((Pair_T) peeled->first)->gapp == true ||
+ ((Pair_T) peeled->first)->comp == INDEL_COMP ||
+ ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ path = List_transfer_one(path,&peeled);
+ }
+ if (peeled != NULL) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ path = List_transfer_one(path,&peeled); /* This should be match or mismatch */
}
}
-#ifdef PMAP
- /* Reverse process to codon boundary. Cases:
+ if (path != NULL) {
+ rightpair = path->first;
+ *querydp5 = rightpair->querypos + 1;
+ *genomedp5 = rightpair->genomepos + 1;
+ } else if (peeled != NULL) {
+ rightpair = peeled->first;
+ *querydp5 = rightpair->querypos;
+ *genomedp5 = rightpair->genomepos;
+ } else {
+ /* fprintf(stderr,"In peel_leftward, path and peeled are both NULL\n"); */
+ /* abort(); */
+ }
- - X | X X X
- 5 5 6 7 8
+ debug(
+ if (path == NULL) {
+ printf(" => Top of path is NULL.");
+ } else {
+ pair = path->first;
+ printf(" => Top of path is ");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ }
+ printf("\n => querydp5 = %d, genomedp5 = %d\n",*querydp5,*genomedp5);
+ );
- X - | X X X
- 5 6 6 7 8
+ *peeled_path = peeled;
+ return path;
+}
+#endif
+
- X | X - X X
- 5 6 7 7 8
+#if 0
+/* Not sure if we need this, or if it causes GMAP to fail on some alignments */
+static List_T
+peel_leftward_intron_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_path, List_T path, int *querydp5, Chrpos_T *genomedp5,
+ Chrpos_T genomedp3, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
+ int minpeelback, int min_mismatches) {
+ List_T peeled = NULL;
+ Pair_T pair, rightpair;
+ int npeelback = 0, nmismatches = 0, niter;
+ char cdna, intron_nt, intron_nt_alt;
+#if 0
+ int nincursion = 0;
+#endif
+ int maxpeelback = 12;
+ int last_querypos;
+ Chrpos_T last_genomepos;
- Rule: pair->querypos % 3 == 0 */
+
+ *n_peeled_indels = 0;
+ /* *protectedp = false; -- set by calling procedure */
+
+ debug(printf("Peeling leftward with genomedp3 %d and stop_at_indels_p %d:",genomedp3,stop_at_indels_p));
+
+ /* Remove initial gaps */
+ while (path != NULL &&
+ ( ((Pair_T) path->first)->gapp == true ||
+ ((Pair_T) path->first)->comp == INDEL_COMP ||
+ ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
+ path = Pairpool_pop(path,&pair);
+ }
+
+ if (path == NULL) {
+ debug(printf(" path is empty\n"));
+
+ } else if (stop_at_indels_p == true) {
+ pair = path->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&path);
+ }
+
+ /* Peel initial indels anyway */
+ while (path != NULL && ( ((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP )) {
+ debug(printf(" Peel [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
+ printf("]"));
+ peeled = List_transfer_one(peeled,&path);
+ }
+
+ if (path != NULL) {
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ }
+ while (/*npeelback < maxpeelback &&*/
+ (npeelback < minpeelback || nmismatches < min_mismatches) && path != NULL &&
+ ((Pair_T) path->first)->gapp == false &&
+ ((Pair_T) path->first)->comp != INDEL_COMP &&
+ ((Pair_T) path->first)->comp != SHORTGAP_COMP &&
+ ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
+ ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
+ debug(printf(" Peel [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
+ printf("]"));
+
+ intron_nt = get_genomic_nt(&intron_nt_alt,genomedp3--,chroffset,chrhigh,watsonp);
+ if ((cdna = ((Pair_T) path->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
+ nmismatches++;
+ debug(printf(" (1) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp3+1));
+ }
+
+ if (((Pair_T) path->first)->protectedp == true) {
+ *protectedp = true;
+ }
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ peeled = List_transfer_one(peeled,&path);
+ npeelback++;
+ }
+
+ } else {
+ /* Don't stop at indels, but do stop at gaps */
+ pair = path->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&path);
+ }
+
+ niter = 0;
+ if (path != NULL) {
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ }
+ while (/*npeelback < maxpeelback &&*/
+ (npeelback < minpeelback || nmismatches < min_mismatches) && niter < MAXITER && path != NULL &&
+ ((Pair_T) path->first)->gapp == false &&
+ ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
+ ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
+ debug(printf(" Peel [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
+ printf("]"));
+
+ intron_nt = get_genomic_nt(&intron_nt_alt,genomedp3--,chroffset,chrhigh,watsonp);
+ if ((cdna = ((Pair_T) path->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
+ nmismatches++;
+ debug(printf(" (2) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp3+1));
+ }
+
+ if (((Pair_T) path->first)->comp == MATCH_COMP || ((Pair_T) path->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) path->first)->comp == AMBIGUOUS_COMP) {
+ npeelback++;
+ } else if (((Pair_T) path->first)->comp == INDEL_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
+ } else if (((Pair_T) path->first)->comp == SHORTGAP_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
+ } else {
+ npeelback--;
+ }
+ if (((Pair_T) path->first)->protectedp == true) {
+ *protectedp = true;
+ }
+ niter++;
+ last_querypos = ((Pair_T) path->first)->querypos;
+ last_genomepos = ((Pair_T) path->first)->genomepos;
+ peeled = List_transfer_one(peeled,&path);
+ }
+
+ if (path != NULL && ((Pair_T) path->first)->gapp == true) {
+ debug(printf(" Hit gap [");
+ Pair_dump_one(path->first,/*zerobasedp*/true);
+ printf("]"));
+ }
+ }
+
+ if (path != NULL &&
+ ( ((Pair_T) path->first)->gapp == true ||
+ ((Pair_T) path->first)->comp == INDEL_COMP ||
+ ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
+ /* Don't leave a gap or indel on the top of the path */
+ while (peeled != NULL &&
+ ( ((Pair_T) peeled->first)->gapp == true ||
+ ((Pair_T) peeled->first)->comp == INDEL_COMP ||
+ ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ path = List_transfer_one(path,&peeled);
+ }
+ if (peeled != NULL) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ path = List_transfer_one(path,&peeled); /* This should be match or mismatch */
+ }
+ }
+
+ if (path != NULL) {
+ rightpair = path->first;
+ *querydp5 = rightpair->querypos + 1;
+ *genomedp5 = rightpair->genomepos + 1;
+ } else if (peeled != NULL) {
+ rightpair = peeled->first;
+ *querydp5 = rightpair->querypos;
+ *genomedp5 = rightpair->genomepos;
+ } else {
+ /* fprintf(stderr,"In peel_leftward, path and peeled are both NULL\n"); */
+ /* abort(); */
+ }
+
+ debug(
+ if (path == NULL) {
+ printf(" => Top of path is NULL.");
+ } else {
+ pair = path->first;
+ printf(" => Top of path is ");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ }
+ printf("\n => querydp5 = %d, genomedp5 = %d\n",*querydp5,*genomedp5);
+ );
+
+ *peeled_path = peeled;
+ return path;
+}
+#endif
+
+
+#if 0
+static List_T
+peel_rightward_old (bool *mismatchp, List_T *peeled_pairs, List_T pairs, int *querydp3, int *genomedp3,
+#ifdef WASTE
+ Pairpool_T pairpool,
+#endif
+ int maxpeelback, bool throughmismatchp, bool quit_on_gap_p,
+ List_T *endgappairs, Pair_T *gappair, int *querydp3_medialgap, int *genomedp3_medialgap) {
+ List_T peeled = NULL, rest = NULL, pairptr;
+ Pair_T pair, nextpair, leftpair;
+ int npeelback = 0, nconsecutive = 0, init_dynprogindex = DYNPROGINDEX_MINOR;
+ bool stopp;
+ int nmatches;
+#if 0
+ int incursion = 0;
+#endif
+
+ *mismatchp = false;
+ debug(printf("Peeling rightward:"));
+ if (pairs == NULL) {
+ debug(printf(" pairs is empty\n"));
+ } else {
+ pair = pairs->first;
+ if (pair->gapp == true) {
+ /* Throw away known gap */
+ debug(printf(" Known_gap"));
+ pairptr = pairs;
+ pairs = Pairpool_pop(pairs,&pair);
+#ifdef WASTE
+ peeled = Pairpool_push_existing(peeled,pairpool,pair);
+#else
+ peeled = List_push_existing(peeled,pairptr);
+#endif
+ }
+ rest = pairs->rest;
+
+ stopp = false;
+ while (rest != NULL && stopp == false) {
+ nextpair = rest->first;
+ if (nextpair->gapp == true || nextpair->cdna == ' ' || nextpair->genome == ' ' || nextpair->protectedp == true) {
+ stopp = true;
+ } else {
+ pairptr = pairs;
+ pairs = Pairpool_pop(pairs,&pair);
+#ifdef WASTE
+ peeled = Pairpool_push_existing(peeled,pairpool,pair);
+#else
+ peeled = List_push_existing(peeled,pairptr);
+#endif
+ debug(printf(" Peel [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+
+ if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
+ *mismatchp = true;
+ }
+
+ if (++npeelback >= maxpeelback) {
+ stopp = true;
+ }
+
+ if (init_dynprogindex > 0 && pair->dynprogindex <= 0) {
+ init_dynprogindex = pair->dynprogindex;
+ }
+
+ rest = pairs->rest;
+ }
+ }
+
+ /* Continue to peelback through little skips and mismatches */
+ debug(printf("\n||"));
+
+ stopp = false;
+ while (rest != NULL && stopp == false) {
+ nextpair = rest->first;
+ if (nextpair->gapp == true) {
+ /* Peel this one, but then stop at end of loop */
+ } else if (nextpair->protectedp == true) {
+ /* Stop because it's protected */
+ stopp = true;
+ } else if (nextpair->cdna != ' ' && nextpair->genome != ' ') {
+ /* Stop because it looks okay */
+ stopp = true;
+ }
+
+ pairptr = pairs;
+ pairs = Pairpool_pop(pairs,&pair);
+#ifdef WASTE
+ peeled = Pairpool_push_existing(peeled,pairpool,pair);
+#else
+ peeled = List_push_existing(peeled,pairptr);
+#endif
+ debug(printf(" Extrapeel [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+
+ if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
+ *mismatchp = true;
+ }
+
+#if 0
+ if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP || pair->comp == MISMATCH_COMP) {
+ nconsecutive = 0;
+ } else if (++nconsecutive >= SUFFCONSECUTIVE) {
+ stopp = true;
+ }
+#endif
+
+#if 0
+ if (pair->dynprogindex != init_dynprogindex) {
+ if (++nincursion >= MAXINCURSION) {
+ stopp = true;
+ }
+ }
+#endif
+
+ if (pair->gapp == true) {
+ stopp = true;
+ }
+
+ rest = pairs->rest;
+ }
+ }
+
+#ifdef PMAP
+ /* Reverse process to codon boundary. Cases:
+
+ - X | X X X
+ 5 5 6 7 8
+
+ X - | X X X
+ 5 6 6 7 8
+
+ X | X - X X
+ 5 6 7 7 8
+
+ Rule: pair->querypos % 3 == 0 */
debug(printf("\n<<"));
stopp = false;
@@ -7248,160 +7617,475 @@ peel_rightward_old (bool *mismatchp, List_T *peeled_pairs, List_T pairs, int *qu
pairptr = peeled;
peeled = Pairpool_pop(peeled,&pair);
#ifdef WASTE
- pairs = Pairpool_push_existing(pairs,pairpool,pair);
+ pairs = Pairpool_push_existing(pairs,pairpool,pair);
+#else
+ pairs = List_push_existing(pairs,pairptr);
+#endif
+ debug(printf(" Mod3putback [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+ if (pair->querypos % 3 == 0) {
+ stopp = true;
+ }
+ }
+#endif
+
+ if (peeled == NULL) {
+ /* Do not alter querydp3 or genomedp3 */
+ } else {
+ leftpair = peeled->first;
+ while (peeled != NULL && (leftpair->gapp == true || leftpair->comp == INDEL_COMP || leftpair->comp == SHORTGAP_COMP)) {
+ debug(printf("Ran into gap; undoing peel, case 3, leftpair gapp %d, comp %c\n",
+ leftpair->gapp,leftpair->comp));
+ if (endgappairs != NULL) {
+ pairs = Pairpool_transfer(pairs,*endgappairs);
+ *endgappairs = (List_T) NULL;
+ }
+
+ if (quit_on_gap_p == true) {
+ pairs = Pairpool_transfer(pairs,peeled);
+ *peeled_pairs = (List_T) NULL;
+ return pairs;
+
+ } else {
+ /* Put back 1 */
+ /* if ((pairptr = peeled) != NULL) { */
+ pairptr = peeled;
+ peeled = Pairpool_pop(peeled,&pair);
+ pairs = List_push_existing(pairs,pairptr);
+ debug(printf(" Putback [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+ /* } */
+
+#if 0
+ /* Put back 2 */
+ if ((pairptr = peeled) != NULL) {
+ peeled = Pairpool_pop(peeled,&pair);
+ pairs = List_push_existing(pairs,pairptr);
+ debug(printf(" Putback [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+ }
+#endif
+ }
+
+ leftpair = pairs->first;
+ }
+
+ if (pairs != NULL) {
+ leftpair = pairs->first;
+ *querydp3 = leftpair->querypos - 1;
+ *genomedp3 = leftpair->genomepos - 1;
+ } else if (peeled != NULL) {
+ leftpair = peeled->first;
+ *querydp3 = leftpair->querypos;
+ *genomedp3 = leftpair->genomepos;
+ } else {
+ fprintf(stderr,"In peel_leftward, pairs and peeled are both NULL\n");
+ abort();
+ }
+ }
+
+ if (endgappairs != NULL) {
+ if (pairs == NULL || (pair = pairs->first) == NULL || (pair->gapp == false && pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP)) {
+ *endgappairs = NULL;
+ *querydp3_medialgap = *querydp3;
+ *genomedp3_medialgap = *genomedp3;
+ } else {
+ pairptr = pairs;
+ pairs = Pairpool_pop(pairs,&pair);
+#ifdef WASTE
+ *endgappairs = Pairpool_push_existing(NULL,pairpool,pair);
#else
- pairs = List_push_existing(pairs,pairptr);
+ *endgappairs = List_push_existing(NULL,pairptr);
#endif
- debug(printf(" Mod3putback [");
- Pair_dump_one(pair,/*zerobasedp*/true);
- printf("]"));
- if (pair->querypos % 3 == 0) {
- stopp = true;
- }
- }
+ debug(printf(" Peeling gap [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+ *gappair = pair;
+ debug(printf(" gapcomp: '%c'",pair->comp));
+
+ nmatches = 0;
+ while (pairs != NULL && nmatches < 3) {
+ pairptr = pairs;
+ pairs = Pairpool_pop(pairs,&pair);
+#ifdef WASTE
+ *endgappairs = Pairpool_push_existing(*endgappairs,pairpool,pair);
+#else
+ *endgappairs = List_push_existing(*endgappairs,pairptr);
#endif
+ debug(printf(" Peeling after gap [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+ if (uppercaseCode[(int) pair->cdna] == uppercaseCode[(int) pair->genome]) {
+ nmatches++;
+ }
+ }
- if (peeled == NULL) {
- /* Do not alter querydp3 or genomedp3 */
- } else {
- leftpair = peeled->first;
- while (peeled != NULL && (leftpair->gapp == true || leftpair->comp == INDEL_COMP || leftpair->comp == SHORTGAP_COMP)) {
- debug(printf("Ran into gap; undoing peel, case 3, leftpair gapp %d, comp %c\n",
- leftpair->gapp,leftpair->comp));
- if (endgappairs != NULL) {
+ leftpair = (*endgappairs)->first;
+ if (leftpair->gapp == true || leftpair->comp == INDEL_COMP || leftpair->comp == SHORTGAP_COMP) {
+ debug(printf("Ran into gap; undoing peel, case 4\n"));
pairs = Pairpool_transfer(pairs,*endgappairs);
*endgappairs = (List_T) NULL;
+
+ if (quit_on_gap_p == true) {
+ pairs = Pairpool_transfer(pairs,peeled);
+ *peeled_pairs = (List_T) NULL;
+ return pairs;
+
+ } else {
+ /* Put back 1 */
+ if ((pairptr = peeled) != NULL) {
+ peeled = Pairpool_pop(peeled,&pair);
+ pairs = List_push_existing(pairs,pairptr);
+ debug(printf(" Putback [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+ }
+
+ /* Put back 2 */
+ if ((pairptr = peeled) != NULL) {
+ peeled = Pairpool_pop(peeled,&pair);
+ pairs = List_push_existing(pairs,pairptr);
+ debug(printf(" Putback [");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ printf("]"));
+ }
+ }
}
- if (quit_on_gap_p == true) {
- pairs = Pairpool_transfer(pairs,peeled);
- *peeled_pairs = (List_T) NULL;
- return pairs;
+ if (pairs != NULL) {
+ leftpair = pairs->first;
+ *querydp3_medialgap = leftpair->querypos - 1;
+ *genomedp3_medialgap = leftpair->genomepos - 1;
+ } else if (peeled != NULL) {
+ leftpair = peeled->first;
+ *querydp3_medialgap = leftpair->querypos;
+ *genomedp3_medialgap = leftpair->genomepos;
+ } else {
+ fprintf(stderr,"In peel_leftward for medialgap, pairs and peeled are both NULL\n");
+ abort();
+ }
+ }
+ }
+
+ /* assert(peeled == NULL || pairs == NULL || ((Pair_T) pairs->first)->comp != INDEL_COMP); */
+ debug(
+ if (pairs == NULL) {
+ printf(" => Top of pairs is NULL.");
+ } else {
+ pair = pairs->first;
+ printf(" => Top of pairs is ");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ }
+ printf("\n => querydp3 = %d, genomedp3 = %d\n",*querydp3,*genomedp3);
+ );
+
+ *peeled_pairs = peeled;
+ return pairs;
+}
+#endif
+
+
+static List_T
+peel_rightward (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
+ int maxpeelback, bool stop_at_indels_p) {
+ List_T peeled = NULL;
+ Pair_T pair, leftpair;
+ int npeelback = 0, niter;
+#if 0
+ int incursion = 0;
+#endif
+
+ *n_peeled_indels = 0;
+ /* *protectedp = false; -- set by calling procedure */
+
+ debug(printf("Peeling rightward with maxpeelback %d and stop_at_indels_p %d:",maxpeelback,stop_at_indels_p));
+
+ /* Remove initial gaps */
+ while (pairs != NULL &&
+ ( ((Pair_T) pairs->first)->gapp == true ||
+ ((Pair_T) pairs->first)->comp == INDEL_COMP ||
+ ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
+ pairs = Pairpool_pop(pairs,&pair);
+ }
+
+ if (pairs == NULL) {
+ debug(printf(" pairs is empty\n"));
+
+ } else if (stop_at_indels_p == true) {
+ pair = pairs->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&pairs);
+ }
+
+ /* Peel initial indels anyway */
+ while (pairs != NULL && ( ((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == INDEL_COMP )) {
+ debug(printf(" Peel [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
+ printf("]"));
+ peeled = List_transfer_one(peeled,&pairs);
+ }
+
+ while (npeelback < maxpeelback && pairs != NULL &&
+ ((Pair_T) pairs->first)->gapp == false &&
+ ((Pair_T) pairs->first)->comp != INDEL_COMP &&
+ ((Pair_T) pairs->first)->comp != SHORTGAP_COMP) {
+ debug(printf(" Peel [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
+ printf("]"));
+ if (((Pair_T) pairs->first)->protectedp == true) {
+ *protectedp = true;
+ }
+ peeled = List_transfer_one(peeled,&pairs);
+ npeelback++;
+ }
+
+ } else {
+ /* Don't stop at indels, but do stop at gaps */
+ pair = pairs->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&pairs);
+ }
+ niter = 0;
+ while (npeelback < maxpeelback && niter < MAXITER && pairs != NULL &&
+ ((Pair_T) pairs->first)->gapp == false) {
+ debug(printf(" Peel [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
+ printf("]"));
+ if (((Pair_T) pairs->first)->comp == MATCH_COMP || ((Pair_T) pairs->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) pairs->first)->comp == AMBIGUOUS_COMP) {
+ npeelback++;
+ } else if (((Pair_T) pairs->first)->comp == INDEL_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
+ } else if (((Pair_T) pairs->first)->comp == SHORTGAP_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
} else {
- /* Put back 1 */
- /* if ((pairptr = peeled) != NULL) { */
- pairptr = peeled;
- peeled = Pairpool_pop(peeled,&pair);
- pairs = List_push_existing(pairs,pairptr);
- debug(printf(" Putback [");
- Pair_dump_one(pair,/*zerobasedp*/true);
- printf("]"));
- /* } */
+ npeelback--;
+ }
+ if (((Pair_T) pairs->first)->protectedp == true) {
+ *protectedp = true;
+ }
+ niter++;
+ peeled = List_transfer_one(peeled,&pairs);
+ }
+
+ if (pairs != NULL && ((Pair_T) pairs->first)->gapp == true) {
+ debug(printf(" Hit gap [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
+ printf("]"));
+ }
+ }
+
+ if (pairs != NULL &&
+ ( ((Pair_T) pairs->first)->gapp == true ||
+ ((Pair_T) pairs->first)->comp == INDEL_COMP ||
+ ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
+ /* Don't leave a gap or indel on the top of the pairs */
+ while (peeled != NULL &&
+ ( ((Pair_T) peeled->first)->gapp == true ||
+ ((Pair_T) peeled->first)->comp == INDEL_COMP ||
+ ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ pairs = List_transfer_one(pairs,&peeled);
+ }
+ if (peeled != NULL) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ pairs = List_transfer_one(pairs,&peeled); /* This should be match or mismatch */
+ }
+ }
+
+ if (pairs != NULL) {
+ leftpair = pairs->first;
+ *querydp3 = leftpair->querypos - 1;
+ *genomedp3 = leftpair->genomepos - 1;
+ } else if (peeled != NULL) {
+ leftpair = peeled->first;
+ *querydp3 = leftpair->querypos;
+ *genomedp3 = leftpair->genomepos;
+ } else {
+ /* fprintf(stderr,"In peel_rightward, pairs and peeled are both NULL\n"); */
+ /* abort(); */
+ }
+
+ debug(
+ if (pairs == NULL) {
+ printf(" => Top of pairs is NULL.");
+ } else {
+ pair = pairs->first;
+ printf(" => Top of pairs is ");
+ Pair_dump_one(pair,/*zerobasedp*/true);
+ }
+ printf("\n => querydp3 = %d, genomedp3 = %d\n",*querydp3,*genomedp3);
+ );
+
+ *peeled_pairs = peeled;
+ return pairs;
+}
+
+/* Instead of maxpeelback, follow the 5' intron until we get enough mismatches */
+static List_T
+peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
+ Chrpos_T genomedp5, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
+ int minpeelback, int min_mismatches) {
+ List_T peeled = NULL;
+ Pair_T pair, leftpair;
+ int npeelback = 0, nmismatches = 0, niter;
+ char cdna, intron_nt, intron_nt_alt;
#if 0
- /* Put back 2 */
- if ((pairptr = peeled) != NULL) {
- peeled = Pairpool_pop(peeled,&pair);
- pairs = List_push_existing(pairs,pairptr);
- debug(printf(" Putback [");
- Pair_dump_one(pair,/*zerobasedp*/true);
- printf("]"));
- }
+ int incursion = 0;
#endif
- }
+ int maxpeelback = 12;
- leftpair = pairs->first;
+ *n_peeled_indels = 0;
+ /* *protectedp = false; -- set by calling procedure */
+
+ debug(printf("Peeling rightward with genomedp5 %d and stop_at_indels_p %d:",genomedp5,stop_at_indels_p));
+
+ /* Remove initial gaps */
+ while (pairs != NULL &&
+ ( ((Pair_T) pairs->first)->gapp == true ||
+ ((Pair_T) pairs->first)->comp == INDEL_COMP ||
+ ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
+ pairs = Pairpool_pop(pairs,&pair);
+ }
+
+ if (pairs == NULL) {
+ debug(printf(" pairs is empty\n"));
+
+ } else if (stop_at_indels_p == true) {
+ pair = pairs->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&pairs);
}
- if (pairs != NULL) {
- leftpair = pairs->first;
- *querydp3 = leftpair->querypos - 1;
- *genomedp3 = leftpair->genomepos - 1;
- } else if (peeled != NULL) {
- leftpair = peeled->first;
- *querydp3 = leftpair->querypos;
- *genomedp3 = leftpair->genomepos;
- } else {
- fprintf(stderr,"In peel_leftward, pairs and peeled are both NULL\n");
- abort();
+ /* Peel initial indels anyway */
+ while (pairs != NULL && ( ((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == INDEL_COMP )) {
+ debug(printf(" Peel [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
+ printf("]"));
+ peeled = List_transfer_one(peeled,&pairs);
}
- }
- if (endgappairs != NULL) {
- if (pairs == NULL || (pair = pairs->first) == NULL || (pair->gapp == false && pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP)) {
- *endgappairs = NULL;
- *querydp3_medialgap = *querydp3;
- *genomedp3_medialgap = *genomedp3;
- } else {
- pairptr = pairs;
- pairs = Pairpool_pop(pairs,&pair);
-#ifdef WASTE
- *endgappairs = Pairpool_push_existing(NULL,pairpool,pair);
-#else
- *endgappairs = List_push_existing(NULL,pairptr);
-#endif
- debug(printf(" Peeling gap [");
- Pair_dump_one(pair,/*zerobasedp*/true);
+ while (/*npeelback < maxpeelback &&*/
+ (npeelback < minpeelback || nmismatches < min_mismatches) && pairs != NULL &&
+ ((Pair_T) pairs->first)->gapp == false &&
+ ((Pair_T) pairs->first)->comp != INDEL_COMP &&
+ ((Pair_T) pairs->first)->comp != SHORTGAP_COMP) {
+ debug(printf(" Peel [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
printf("]"));
- *gappair = pair;
- debug(printf(" gapcomp: '%c'",pair->comp));
- nmatches = 0;
- while (pairs != NULL && nmatches < 3) {
- pairptr = pairs;
- pairs = Pairpool_pop(pairs,&pair);
-#ifdef WASTE
- *endgappairs = Pairpool_push_existing(*endgappairs,pairpool,pair);
-#else
- *endgappairs = List_push_existing(*endgappairs,pairptr);
-#endif
- debug(printf(" Peeling after gap [");
- Pair_dump_one(pair,/*zerobasedp*/true);
- printf("]"));
- if (uppercaseCode[(int) pair->cdna] == uppercaseCode[(int) pair->genome]) {
- nmatches++;
- }
+ intron_nt = get_genomic_nt(&intron_nt_alt,genomedp5++,chroffset,chrhigh,watsonp);
+ if ((cdna = ((Pair_T) pairs->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
+ nmismatches++;
+ debug(printf(" (3) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp5-1));
}
- leftpair = (*endgappairs)->first;
- if (leftpair->gapp == true || leftpair->comp == INDEL_COMP || leftpair->comp == SHORTGAP_COMP) {
- debug(printf("Ran into gap; undoing peel, case 4\n"));
- pairs = Pairpool_transfer(pairs,*endgappairs);
- *endgappairs = (List_T) NULL;
+ if (((Pair_T) pairs->first)->protectedp == true) {
+ *protectedp = true;
+ }
+ peeled = List_transfer_one(peeled,&pairs);
+ npeelback++;
+ }
- if (quit_on_gap_p == true) {
- pairs = Pairpool_transfer(pairs,peeled);
- *peeled_pairs = (List_T) NULL;
- return pairs;
+ } else {
+ /* Don't stop at indels, but do stop at gaps */
+ pair = pairs->first;
+ if (pair->gapp == true) {
+ /* Peel known gap */
+ debug(printf(" Known_gap"));
+ peeled = List_transfer_one(peeled,&pairs);
+ }
- } else {
- /* Put back 1 */
- if ((pairptr = peeled) != NULL) {
- peeled = Pairpool_pop(peeled,&pair);
- pairs = List_push_existing(pairs,pairptr);
- debug(printf(" Putback [");
- Pair_dump_one(pair,/*zerobasedp*/true);
- printf("]"));
- }
+ niter = 0;
+ while (/*npeelback < maxpeelback &&*/
+ (npeelback < minpeelback || nmismatches < min_mismatches) && niter < MAXITER && pairs != NULL &&
+ ((Pair_T) pairs->first)->gapp == false) {
+ debug(printf(" Peel [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
+ printf("]"));
- /* Put back 2 */
- if ((pairptr = peeled) != NULL) {
- peeled = Pairpool_pop(peeled,&pair);
- pairs = List_push_existing(pairs,pairptr);
- debug(printf(" Putback [");
- Pair_dump_one(pair,/*zerobasedp*/true);
- printf("]"));
- }
- }
+ intron_nt = get_genomic_nt(&intron_nt_alt,genomedp5++,chroffset,chrhigh,watsonp);
+ if ((cdna = ((Pair_T) pairs->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
+ nmismatches++;
+ debug(printf(" (4) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp5-1));
}
- if (pairs != NULL) {
- leftpair = pairs->first;
- *querydp3_medialgap = leftpair->querypos - 1;
- *genomedp3_medialgap = leftpair->genomepos - 1;
- } else if (peeled != NULL) {
- leftpair = peeled->first;
- *querydp3_medialgap = leftpair->querypos;
- *genomedp3_medialgap = leftpair->genomepos;
+ if (((Pair_T) pairs->first)->comp == MATCH_COMP || ((Pair_T) pairs->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) pairs->first)->comp == AMBIGUOUS_COMP) {
+ npeelback++;
+ } else if (((Pair_T) pairs->first)->comp == INDEL_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
+ } else if (((Pair_T) pairs->first)->comp == SHORTGAP_COMP) {
+ *n_peeled_indels += 1;
+ npeelback--;
} else {
- fprintf(stderr,"In peel_leftward for medialgap, pairs and peeled are both NULL\n");
- abort();
+ npeelback--;
+ }
+ if (((Pair_T) pairs->first)->protectedp == true) {
+ *protectedp = true;
}
+ niter++;
+ peeled = List_transfer_one(peeled,&pairs);
+ }
+
+ if (pairs != NULL && ((Pair_T) pairs->first)->gapp == true) {
+ debug(printf(" Hit gap [");
+ Pair_dump_one(pairs->first,/*zerobasedp*/true);
+ printf("]"));
+ }
+ }
+
+ if (pairs != NULL &&
+ ( ((Pair_T) pairs->first)->gapp == true ||
+ ((Pair_T) pairs->first)->comp == INDEL_COMP ||
+ ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
+ /* Don't leave a gap or indel on the top of the pairs */
+ while (peeled != NULL &&
+ ( ((Pair_T) peeled->first)->gapp == true ||
+ ((Pair_T) peeled->first)->comp == INDEL_COMP ||
+ ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ pairs = List_transfer_one(pairs,&peeled);
+ }
+ if (peeled != NULL) {
+ debug(printf(" Putback [");
+ Pair_dump_one(peeled->first,/*zerobasedp*/true);
+ printf("]"));
+ pairs = List_transfer_one(pairs,&peeled); /* This should be match or mismatch */
}
}
- /* assert(peeled == NULL || pairs == NULL || ((Pair_T) pairs->first)->comp != INDEL_COMP); */
+ if (pairs != NULL) {
+ leftpair = pairs->first;
+ *querydp3 = leftpair->querypos - 1;
+ *genomedp3 = leftpair->genomepos - 1;
+ } else if (peeled != NULL) {
+ leftpair = peeled->first;
+ *querydp3 = leftpair->querypos;
+ *genomedp3 = leftpair->genomepos;
+ } else {
+ /* fprintf(stderr,"In peel_rightward, pairs and peeled are both NULL\n"); */
+ /* abort(); */
+ }
+
debug(
if (pairs == NULL) {
printf(" => Top of pairs is NULL.");
@@ -7416,18 +8100,22 @@ peel_rightward_old (bool *mismatchp, List_T *peeled_pairs, List_T pairs, int *qu
*peeled_pairs = peeled;
return pairs;
}
-#endif
+#if 0
+/* Not sure if we need this, or if it causes GMAP to fail on some alignments */
static List_T
-peel_rightward (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
- int maxpeelback, bool stop_at_indels_p) {
+peel_rightward_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
+ int maxpeelback, bool stop_at_indels_p) {
List_T peeled = NULL;
Pair_T pair, leftpair;
int npeelback = 0, niter;
#if 0
int incursion = 0;
#endif
+ int last_querypos;
+ Chrpos_T last_genomepos;
+
*n_peeled_indels = 0;
/* *protectedp = false; -- set by calling procedure */
@@ -7461,16 +8149,24 @@ peel_rightward (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, Li
peeled = List_transfer_one(peeled,&pairs);
}
+ if (pairs != NULL) {
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
+ }
while (npeelback < maxpeelback && pairs != NULL &&
((Pair_T) pairs->first)->gapp == false &&
((Pair_T) pairs->first)->comp != INDEL_COMP &&
- ((Pair_T) pairs->first)->comp != SHORTGAP_COMP) {
+ ((Pair_T) pairs->first)->comp != SHORTGAP_COMP &&
+ ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
+ ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
debug(printf(" Peel [");
Pair_dump_one(pairs->first,/*zerobasedp*/true);
printf("]"));
if (((Pair_T) pairs->first)->protectedp == true) {
*protectedp = true;
}
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
peeled = List_transfer_one(peeled,&pairs);
npeelback++;
}
@@ -7485,8 +8181,14 @@ peel_rightward (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, Li
}
niter = 0;
+ if (pairs != NULL) {
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
+ }
while (npeelback < maxpeelback && niter < MAXITER && pairs != NULL &&
- ((Pair_T) pairs->first)->gapp == false) {
+ ((Pair_T) pairs->first)->gapp == false &&
+ ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
+ ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
debug(printf(" Peel [");
Pair_dump_one(pairs->first,/*zerobasedp*/true);
printf("]"));
@@ -7505,6 +8207,8 @@ peel_rightward (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, Li
*protectedp = true;
}
niter++;
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
peeled = List_transfer_one(peeled,&pairs);
}
@@ -7564,13 +8268,16 @@ peel_rightward (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, Li
*peeled_pairs = peeled;
return pairs;
}
+#endif
+#if 0
+/* Not sure if we need this, or if it causes GMAP to fail on some alignments */
/* Instead of maxpeelback, follow the 5' intron until we get enough mismatches */
static List_T
-peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
- Chrpos_T genomedp5, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
- int minpeelback, int min_mismatches) {
+peel_rightward_intron_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
+ Chrpos_T genomedp5, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
+ int minpeelback, int min_mismatches) {
List_T peeled = NULL;
Pair_T pair, leftpair;
int npeelback = 0, nmismatches = 0, niter;
@@ -7579,6 +8286,9 @@ peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pa
int incursion = 0;
#endif
int maxpeelback = 12;
+ int last_querypos;
+ Chrpos_T last_genomepos;
+
*n_peeled_indels = 0;
/* *protectedp = false; -- set by calling procedure */
@@ -7612,11 +8322,17 @@ peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pa
peeled = List_transfer_one(peeled,&pairs);
}
+ if (pairs != NULL) {
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
+ }
while (/*npeelback < maxpeelback &&*/
(npeelback < minpeelback || nmismatches < min_mismatches) && pairs != NULL &&
((Pair_T) pairs->first)->gapp == false &&
((Pair_T) pairs->first)->comp != INDEL_COMP &&
- ((Pair_T) pairs->first)->comp != SHORTGAP_COMP) {
+ ((Pair_T) pairs->first)->comp != SHORTGAP_COMP &&
+ ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
+ ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
debug(printf(" Peel [");
Pair_dump_one(pairs->first,/*zerobasedp*/true);
printf("]"));
@@ -7630,6 +8346,8 @@ peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pa
if (((Pair_T) pairs->first)->protectedp == true) {
*protectedp = true;
}
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
peeled = List_transfer_one(peeled,&pairs);
npeelback++;
}
@@ -7644,9 +8362,15 @@ peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pa
}
niter = 0;
+ if (pairs != NULL) {
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
+ }
while (/*npeelback < maxpeelback &&*/
(npeelback < minpeelback || nmismatches < min_mismatches) && niter < MAXITER && pairs != NULL &&
- ((Pair_T) pairs->first)->gapp == false) {
+ ((Pair_T) pairs->first)->gapp == false &&
+ ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
+ ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
debug(printf(" Peel [");
Pair_dump_one(pairs->first,/*zerobasedp*/true);
printf("]"));
@@ -7672,6 +8396,8 @@ peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pa
*protectedp = true;
}
niter++;
+ last_querypos = ((Pair_T) pairs->first)->querypos;
+ last_genomepos = ((Pair_T) pairs->first)->genomepos;
peeled = List_transfer_one(peeled,&pairs);
}
@@ -7731,6 +8457,7 @@ peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pa
*peeled_pairs = peeled;
return pairs;
}
+#endif
/************************************************************************
@@ -7823,6 +8550,8 @@ traverse_single_gap (bool *filledp, int *dynprogindex, List_T pairs, List_T *pat
}
debug(Pair_dump_list(gappairs,true));
}
+ debug(printf("Gap pairs:\n"));
+ debug(Pair_dump_list(gappairs,true));
debug(printf(" Final score: %d\n",finalscore));
#if 0
@@ -7839,8 +8568,10 @@ traverse_single_gap (bool *filledp, int *dynprogindex, List_T pairs, List_T *pat
}
#else
/* New behavior: Compares new score to orig score */
- if (forcep == true) {
+ if (0 && forcep == true) {
+ /* Don't do this. If gappairs is NULL, eliminates peeled pairs */
/* Intended for build_dual_breaks */
+ debug(printf("forcep is true, so transferring gappairs\n"));
pairs = Pairpool_transfer(pairs,gappairs);
*filledp = true;
} else {
@@ -10121,6 +10852,7 @@ traverse_dual_break (List_T pairs, List_T *path, Pair_T leftpair, Pair_T rightpa
chrend = (chrhigh - chroffset) - genomedp5;
}
+ assert(chrend >= chrstart);
debug14(printf("Starting stage2 with chrstart %u, chrend %u, watsonp %d\n",
chrstart,chrend,watsonp));
diff --git a/src/stage3hr.c b/src/stage3hr.c
index 8f5973e..60b288f 100644
--- a/src/stage3hr.c
+++ b/src/stage3hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3hr.c 210602 2017-10-14 00:33:11Z twu $";
+static char rcsid[] = "$Id: stage3hr.c 210868 2017-10-29 20:11:43Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -9481,10 +9481,15 @@ Stage3end_new_gmap (int nmismatches_whole, int nmatches_posttrim, int max_match_
debug0(printf("Entered Stage3end_new_gmap with orig_sensedir %d\n",orig_sensedir));
assert(orig_sensedir == SENSE_NULL || orig_sensedir == SENSE_ANTI || orig_sensedir == SENSE_FORWARD);
+#if 0
+ /* This leads to H on both ends automatically */
start = &(pairarray[0]);
end = &(pairarray[npairs-1]);
hardclip_start = start->querypos;
hardclip_end = (querylength - 1) - end->querypos;
+#else
+ hardclip_start = hardclip_end = 0;
+#endif
cigar_tokens = Pair_compute_cigar(&intronp,&hardclip_start,&hardclip_end,pairarray,npairs,querylength,
/*watsonp*/plusp,orig_sensedir,/*chimera_part*/0);
diff --git a/src/substring.c b/src/substring.c
index 4733adc..1d44704 100644
--- a/src/substring.c
+++ b/src/substring.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: substring.c 207556 2017-06-21 00:52:32Z twu $";
+static char rcsid[] = "$Id: substring.c 210869 2017-10-29 20:12:18Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1739,21 +1739,24 @@ Substring_setup (bool print_nsnpdiffs_p_in, bool print_snplabels_p_in,
+/* The result variable should be an existing genomic_bothdiff or genomic_refdiff, and then reassigned to that */
static char *
-embellish_genomic (char *genomic_diff, char *query, int querystart, int queryend, int querylength,
+embellish_genomic (char *result, char *genomic_diff, char *query, int querystart, int queryend, int querylength,
int mandatory_trim_left, int mandatory_trim_right, int extraleft, int extraright, int genestrand) {
- char *result;
int i, j, k;
debug1(printf("Entered embellish_genomic with querystart %d, queryend %d, querylength %d, genomic_diff %s\n",
querystart,queryend,querylength,genomic_diff));
+ if (result == NULL) {
+ result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
+ }
+ result[querylength] = '\0';
#ifdef DEBUG1
- result = (char *) CALLOC_OUT(querylength+1,sizeof(char));
-#else
- result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
+ for (i = 0; i < querylength; i++) {
+ result[0] = '?';
+ }
#endif
- result[querylength] = '\0';
/* Add aligned region with lower-case diffs, surrounded by dashes */
fill_w_dashes(result,0,querystart);
@@ -1802,12 +1805,13 @@ embellish_genomic (char *genomic_diff, char *query, int querystart, int queryend
static char *
-embellish_genomic_sam (char *genomic_diff, char *query, int querystart, int queryend, int querylength,
+embellish_genomic_sam (char *result, char *genomic_diff, char *query, int querystart, int queryend, int querylength,
int genestrand, bool exactp) {
- char *result;
int i, j, k;
- result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
+ if (result == NULL) {
+ result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
+ }
result[querylength] = '\0';
strncpy(&(result[querystart]),&(genomic_diff[querystart]),queryend-querystart);
@@ -2909,9 +2913,10 @@ Substring_display_prep (T this, char *queryuc_ptr, int querylength,
}
/* Need to perform embellish to put dashes in */
- this->genomic_bothdiff = embellish_genomic(genomic_diff,queryuc_ptr,this->querystart,this->queryend,
- querylength,this->mandatory_trim_left,this->mandatory_trim_right,
- extraleft,extraright,this->genestrand);
+ this->genomic_bothdiff =
+ embellish_genomic(this->genomic_bothdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
+ querylength,this->mandatory_trim_left,this->mandatory_trim_right,
+ extraleft,extraright,this->genestrand);
if (snps_iit == NULL) {
this->genomic_refdiff = this->genomic_bothdiff;
@@ -2928,15 +2933,17 @@ Substring_display_prep (T this, char *queryuc_ptr, int querylength,
/*pos5*/this->querystart,/*pos3*/this->queryend,
/*plusp*/true,this->genestrand);
if (output_sam_p == false) {
- this->genomic_refdiff = embellish_genomic(genomic_diff,queryuc_ptr,this->querystart,this->queryend,
- querylength,this->mandatory_trim_left,this->mandatory_trim_right,
- extraleft,extraright,this->genestrand);
+ this->genomic_refdiff =
+ embellish_genomic(this->genomic_refdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
+ querylength,this->mandatory_trim_left,this->mandatory_trim_right,
+ extraleft,extraright,this->genestrand);
}
}
if (output_sam_p == true) {
- this->genomic_refdiff = embellish_genomic_sam(genomic_diff,queryuc_ptr,this->querystart,this->queryend,
- querylength,this->genestrand,this->exactp);
+ this->genomic_refdiff =
+ embellish_genomic_sam(this->genomic_refdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
+ querylength,this->genestrand,this->exactp);
}
if (0 && this->exactp == true && extraleft == 0 && extraright == 0) {
@@ -2987,9 +2994,10 @@ Substring_display_prep (T this, char *queryuc_ptr, int querylength,
}
/* Need to perform embellish to put dashes in */
- this->genomic_bothdiff = embellish_genomic(genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
- querylength,this->mandatory_trim_left,this->mandatory_trim_right,
- extraleft,extraright,this->genestrand);
+ this->genomic_bothdiff =
+ embellish_genomic(this->genomic_bothdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
+ querylength,this->mandatory_trim_left,this->mandatory_trim_right,
+ extraleft,extraright,this->genestrand);
if (snps_iit == NULL) {
this->genomic_refdiff = this->genomic_bothdiff;
@@ -3008,15 +3016,17 @@ Substring_display_prep (T this, char *queryuc_ptr, int querylength,
/*plusp*/false,this->genestrand);
if (output_sam_p == false) {
- this->genomic_refdiff = embellish_genomic(genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
- querylength,this->mandatory_trim_left,this->mandatory_trim_right,
- extraleft,extraright,this->genestrand);
+ this->genomic_refdiff =
+ embellish_genomic(this->genomic_refdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
+ querylength,this->mandatory_trim_left,this->mandatory_trim_right,
+ extraleft,extraright,this->genestrand);
}
}
if (output_sam_p == true) {
- this->genomic_refdiff = embellish_genomic_sam(genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
- querylength,this->genestrand,this->exactp);
+ this->genomic_refdiff =
+ embellish_genomic_sam(this->genomic_refdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
+ querylength,this->genestrand,this->exactp);
}
if (0 && this->exactp == true && extraleft == 0 && extraright == 0) {
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git
More information about the debian-med-commit
mailing list