[med-svn] [gmap] 02/05: New upstream version 2017-10-12
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Fri Oct 20 22:23:42 UTC 2017
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository gmap.
commit 7f5d44011681710749d0cae4d977d67876e156ec
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Fri Oct 20 23:37:42 2017 +0200
New upstream version 2017-10-12
---
ChangeLog | 39 ++++++++
VERSION | 2 +-
configure | 24 ++---
src/cellpool.c | 7 +-
src/cellpool.h | 9 +-
src/chrom.c | 8 +-
src/gmap.c | 11 ++-
src/gsnap.c | 11 ++-
src/outbuffer.c | 8 +-
src/pair.c | 75 ++++++++-------
src/samflags.h | 6 +-
src/samheader.c | 14 ++-
src/samprint.c | 16 ++-
src/stage2.c | 283 +++++++++++++++++++++++++++++++++++++++++++-----------
src/stage3.c | 27 +++---
src/stage3hr.c | 28 +++++-
src/translation.c | 8 +-
src/translation.h | 4 +-
18 files changed, 434 insertions(+), 146 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index e5cd7b1..58234e3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,42 @@
+2017-10-14 twu
+
+ * VERSION, src: Updated version number
+
+ * stage3hr.c: Merged revision 210536 from trunk. Removed use of minimum of
+ npaths and maxpaths
+
+ * samprint.c: Merged revision 210535 from trunk. Handling dinucleotides
+ gracefully if genomic sequence is NULL.
+
+ * gsnap.c: Merged revision 210534 from trunk. Removed --maxsearch option.
+ Increased default expected_pairlength value.
+
+2017-10-12 twu
+
+ * outbuffer.c, samflags.h, samheader.c: Merged revision 210516 from trunk.
+ Fixed problem introduced in 2017-05 which caused the --output-file option
+ to produce a NULL file pointer.
+
+ * chrom.c: Merged revision 210513 from trunk. Dereferencing a character
+ pointer.
+
+ * pair.c: Merged revision 210506 from trunk. Fixed problem with
+ initialization of endi in determining gff3 coordinates. Avoiding reading
+ the pair at index of npairs.
+
+ * cellpool.c, cellpool.h, stage2.c: Merged revision 210504 from trunk.
+ Adding non-overlapping paths, as well as high-scoring paths
+
+ * gmap.c, translation.c, translation.h: Merged revision 210493 from trunk.
+ Adding option for only ATG as initiation codon. Making this the default.
+
+ * stage3.c: Merged revision 210492 from trunk. For trimming end exons,
+ using a percentage of the querylength instead of a fixed length
+
+ * stage3hr.c: Merged revision 210278 from trunk. In
+ resolve_ambiguous_splice procedure, when ambiguity is resolved, setting
+ genomicstart or genomicend fields accordingly.
+
2017-09-29 twu
* VERSION, public-2017-09-05, src: Updated version number
diff --git a/VERSION b/VERSION
index 2079994..d549256 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2017-09-30
\ No newline at end of file
+2017-10-12
\ No newline at end of file
diff --git a/configure b/configure
index 1400e2e..7d3415b 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for gmap 2017-09-30.
+# Generated by GNU Autoconf 2.69 for gmap 2017-10-12.
#
# Report bugs to <Thomas Wu <twu at gene.com>>.
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='gmap'
PACKAGE_TARNAME='gmap'
-PACKAGE_VERSION='2017-09-30'
-PACKAGE_STRING='gmap 2017-09-30'
+PACKAGE_VERSION='2017-10-12'
+PACKAGE_STRING='gmap 2017-10-12'
PACKAGE_BUGREPORT='Thomas Wu <twu at gene.com>'
PACKAGE_URL=''
@@ -1369,7 +1369,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures gmap 2017-09-30 to adapt to many kinds of systems.
+\`configure' configures gmap 2017-10-12 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1440,7 +1440,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of gmap 2017-09-30:";;
+ short | recursive ) echo "Configuration of gmap 2017-10-12:";;
esac
cat <<\_ACEOF
@@ -1577,7 +1577,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-gmap configure 2017-09-30
+gmap configure 2017-10-12
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2183,7 +2183,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by gmap $as_me 2017-09-30, which was
+It was created by gmap $as_me 2017-10-12, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2533,8 +2533,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking package version" >&5
$as_echo_n "checking package version... " >&6; }
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 2017-09-30" >&5
-$as_echo "2017-09-30" >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 2017-10-12" >&5
+$as_echo "2017-10-12" >&6; }
### Read defaults
@@ -4401,7 +4401,7 @@ fi
# Define the identity of the package.
PACKAGE='gmap'
- VERSION='2017-09-30'
+ VERSION='2017-10-12'
cat >>confdefs.h <<_ACEOF
@@ -19978,7 +19978,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by gmap $as_me 2017-09-30, which was
+This file was extended by gmap $as_me 2017-10-12, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -20044,7 +20044,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-gmap config.status 2017-09-30
+gmap config.status 2017-10-12
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/src/cellpool.c b/src/cellpool.c
index 5b2c4ba..538f87e 100644
--- a/src/cellpool.c
+++ b/src/cellpool.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: cellpool.c 135447 2014-05-07 22:25:45Z twu $";
+static char rcsid[] = "$Id: cellpool.c 210511 2017-10-12 21:11:21Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -163,7 +163,8 @@ Cellpool_reset (T this) {
}
List_T
-Cellpool_push (List_T list, T this, int rootposition, int querypos, int hit, bool fwdp, int score) {
+Cellpool_push (List_T list, T this, int rootposition, int endposition,
+ int querypos, int hit, bool fwdp, int score) {
List_T listcell;
Cell_T new;
List_T p;
@@ -182,10 +183,12 @@ Cellpool_push (List_T list, T this, int rootposition, int querypos, int hit, boo
new->rootposition = rootposition;
+ new->endposition = endposition;
new->querypos = querypos;
new->hit = hit;
new->fwdp = fwdp;
new->score = score;
+ new->pushedp = false;
if (this->listcellctr >= this->nlistcells) {
diff --git a/src/cellpool.h b/src/cellpool.h
index 64bd10a..1d96190 100644
--- a/src/cellpool.h
+++ b/src/cellpool.h
@@ -1,4 +1,4 @@
-/* $Id: cellpool.h 135447 2014-05-07 22:25:45Z twu $ */
+/* $Id: cellpool.h 210511 2017-10-12 21:11:21Z twu $ */
#ifndef CELLPOOL_INCLUDED
#define CELLPOOL_INCLUDED
@@ -8,11 +8,13 @@
typedef struct Cell_T *Cell_T;
struct Cell_T {
- int rootposition;
+ int rootposition; /* Want to allow for -1 */
+ int endposition;
int querypos;
int hit;
bool fwdp;
int score;
+ int pushedp;
};
@@ -30,7 +32,8 @@ Cellpool_new (void);
extern void
Cellpool_reset (T this);
extern List_T
-Cellpool_push (List_T list, T this, int rootposition, int querypos, int hit, bool fwdp, int score);
+Cellpool_push (List_T list, T this, int rootposition, int endposition,
+ int querypos, int hit, bool fwdp, int score);
extern List_T
Cellpool_pop (List_T list, Cell_T *x);
diff --git a/src/chrom.c b/src/chrom.c
index 11da4e6..32e4e4a 100644
--- a/src/chrom.c
+++ b/src/chrom.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: chrom.c 207849 2017-06-29 19:20:50Z twu $";
+static char rcsid[] = "$Id: chrom.c 210514 2017-10-12 21:31:32Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -131,7 +131,7 @@ Chrom_from_string (char *string, char *mitochondrial_string, Univcoord_T order,
}
p = string;
- while (p != '\0' && *p >= '0' && *p <= '9') {
+ while (*p != '\0' && *p >= '0' && *p <= '9') {
ndigits++;
p++;
}
@@ -165,11 +165,11 @@ Chrom_from_string (char *string, char *mitochondrial_string, Univcoord_T order,
} else if (sex_p == true) {
new->chromtype = SEX_CHROMOSOME;
} else {
- while (p != '\0' && (*p < '1' || *p > '9')) {
+ while (*p != '\0' && (*p < '1' || *p > '9')) {
/* Stop at initial '1' through '9'. An initial '0' must be alphabetic, not numeric. */
p++;
}
- if (p != '\0') {
+ if (*p != '\0') {
new->chromtype = ALPHA_NUMERIC;
new->num = atoi(p);
new->alpha = (char *) MALLOC((p - string + 1)*sizeof(char));
diff --git a/src/gmap.c b/src/gmap.c
index f649c12..c9099ff 100644
--- a/src/gmap.c
+++ b/src/gmap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gmap.c 210067 2017-09-23 00:16:06Z twu $";
+static char rcsid[] = "$Id: gmap.c 210510 2017-10-12 21:09:38Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -183,6 +183,7 @@ static char rcsid[] = "$Id: gmap.c 210067 2017-09-23 00:16:06Z twu $";
************************************************************************/
static int translation_code = 1;
+static bool alt_initiation_codons_p = false;
static Univ_IIT_T chromosome_iit = NULL;
static Univ_IIT_T altscaffold_iit = NULL;
@@ -530,6 +531,7 @@ static struct option long_options[] = {
{"localsplicedist", required_argument, 0, 'w'}, /* shortsplicedist */
#endif
{"translation-code", required_argument, 0, 0}, /* translation_code */
+ {"alt-start-codons", no_argument, 0, 0}, /* alt_initiation_codons_p */
{"nthreads", required_argument, 0, 't'}, /* nworkers */
{"splicingdir", required_argument, 0, 0}, /* user_splicingdir */
@@ -5419,6 +5421,9 @@ parse_command_line (int argc, char *argv[], int optind) {
} else if (!strcmp(long_name,"translation-code")) {
translation_code = atoi(check_valid_int(optarg));
+ } else if (!strcmp(long_name,"alt-start-codons")) {
+ alt_initiation_codons_p = true;
+
} else if (!strcmp(long_name,"min-intronlength")) {
min_intronlength = atoi(check_valid_int(optarg));
@@ -6787,7 +6792,7 @@ main (int argc, char *argv[]) {
}
- Translation_setup(translation_code);
+ Translation_setup(translation_code,alt_initiation_codons_p);
if (user_pairalign_p == true) {
/* Creation of genomebits/genomecomp and initialization done within single_thread() for each input sequence */
@@ -7445,6 +7450,8 @@ Output options\n\
--translation-code=INT Genetic code used for translating codons to amino acids and computing CDS\n\
Integer value (default=1) corresponds to an available code at\n\
http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi\n\
+ --alt-start-codons Also, use the alternate initiation codons shown in the above Web site\n\
+ By default, without this option, only ATG is considered an initiation codon\n\
");
#ifdef PMAP
diff --git a/src/gsnap.c b/src/gsnap.c
index cd61704..7ea468c 100644
--- a/src/gsnap.c
+++ b/src/gsnap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gsnap.c 210070 2017-09-23 00:17:54Z twu $";
+static char rcsid[] = "$Id: gsnap.c 210600 2017-10-14 00:32:02Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -304,7 +304,7 @@ static int pairmax_linear;
static int pairmax_circular;
static int pairmax_dna = 1000;
static int pairmax_rna = 200000;
-static int expected_pairlength = 200;
+static int expected_pairlength = 500;
static int pairlength_deviation = 100;
#ifdef USE_MPI
@@ -639,7 +639,9 @@ static struct option long_options[] = {
{"extend-soft-clips", no_argument, 0, 0}, /* hide_soft_clips_p */
{"noexceptions", no_argument, 0, '0'}, /* exception_raise_p */
+#if 0
{"maxsearch", required_argument, 0, 0}, /* maxpaths_search */
+#endif
{"npaths", required_argument, 0, 'n'}, /* maxpaths_report */
{"add-paired-nomappers", no_argument, 0, 0}, /* add_paired_nomappers_p */
{"paired-flag-means-concordant", required_argument, 0, 0}, /* paired_flag_means_concordant_p */
@@ -1751,8 +1753,10 @@ parse_command_line (int argc, char *argv[], int optind) {
} else if (!strcmp(long_name,"unload")) {
unloadp = true;
+#if 0
} else if (!strcmp(long_name,"maxsearch")) {
maxpaths_search = atoi(optarg);
+#endif
} else if (!strcmp(long_name,"mode")) {
if (!strcmp(optarg,"standard")) {
@@ -4295,12 +4299,15 @@ is still designed to be fast.\n\
--genome-unk-mismatch=INT Whether to count unknown (N) characters in the genome as a mismatch\n\
(0=no, 1=yes (default))\n\
");
+
+#if 0
fprintf(stdout,"\
--maxsearch=INT Maximum number of alignments to find (default %d).\n\
Must be larger than --npaths, which is the number to report.\n\
Keeping this number large will allow for random selection among multiple alignments.\n\
Reducing this number can speed up the program.\n\
",maxpaths_search);
+#endif
#if 0
fprintf(stdout,"\
diff --git a/src/outbuffer.c b/src/outbuffer.c
index ce4bcf6..ddc2d14 100644
--- a/src/outbuffer.c
+++ b/src/outbuffer.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: outbuffer.c 200473 2016-11-14 20:54:20Z twu $";
+static char rcsid[] = "$Id: outbuffer.c 210517 2017-10-12 22:41:05Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -341,7 +341,7 @@ Outbuffer_setup (int argc_in, char **argv_in, int optind_in,
}
} else if (output_file != NULL) {
- outputs[0] = SAM_header_open_file(OUTPUT_NONE,/*split_output_root*/output_file,appendp);
+ outputs[0] = SAM_header_open_file(OUTPUT_FILE,/*split_output_root*/output_file,appendp);
#ifdef SAM_HEADERS_ON_EMPTY_FILES
print_file_headers(outputs[0]);
#endif
@@ -923,7 +923,7 @@ Outbuffer_print_filestrings (Filestring_T fp, Filestring_T fp_failedinput, Files
output = outputs[0] = stdout;
print_file_headers(stdout);
} else {
- output = outputs[0] = SAM_header_open_file(/*split_output*/OUTPUT_NONE,output_file,appendp);
+ output = outputs[0] = SAM_header_open_file(/*split_output*/OUTPUT_FILE,output_file,appendp);
print_file_headers(output);
}
}
@@ -995,7 +995,7 @@ Outbuffer_print_filestrings (Filestring_T fp, Filestring_T fp_failedinput) {
output = outputs[0] = stdout;
print_file_headers(stdout);
} else {
- output = outputs[0] = SAM_header_open_file(/*split_output*/OUTPUT_NONE,output_file,appendp);
+ output = outputs[0] = SAM_header_open_file(/*split_output*/OUTPUT_FILE,output_file,appendp);
print_file_headers(output);
}
}
diff --git a/src/pair.c b/src/pair.c
index d75929e..84672a3 100644
--- a/src/pair.c
+++ b/src/pair.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: pair.c 210070 2017-09-23 00:17:54Z twu $";
+static char rcsid[] = "$Id: pair.c 210512 2017-10-12 21:12:19Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -2727,7 +2727,7 @@ print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int path
int last_querypos = -1;
Chrpos_T last_genomepos = -1U;
- endi = npairs;
+ endi = npairs - 1;
if (cds_p == false) {
starti = 0;
@@ -2792,7 +2792,6 @@ print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int path
abort();
}
-
ptr = &(pairs[starti]);
for (i = starti; i <= endi; i++) {
/* prev = this; */
@@ -3034,7 +3033,7 @@ print_gff3_exons_backward (Filestring_T fp, struct T *pairs, int npairs, int pat
int last_querypos = -1;
Chrpos_T last_genomepos = -1U;
- endi = npairs;
+ endi = npairs - 1;
if (cds_p == false) {
starti = 0;
@@ -4141,14 +4140,16 @@ Pair_print_gsnap (Filestring_T fp, struct T *pairs_querydir, int npairs, int nse
FPRINTF(fp,"\n,");
- this = ptr;
- exon_querystart = this->querypos + 1;
- exon_genomestart = this->genomepos + 1;
- nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
-
- /* Start of next line */
- for (querypos = 1; querypos < exon_querystart; querypos++) {
- FPRINTF(fp,"-");
+ if (i < npairs) {
+ this = ptr;
+ exon_querystart = this->querypos + 1;
+ exon_genomestart = this->genomepos + 1;
+ nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
+
+ /* Start of next line */
+ for (querypos = 1; querypos < exon_querystart; querypos++) {
+ FPRINTF(fp,"-");
+ }
}
} else if (this->cdna == ' ') {
@@ -4219,14 +4220,16 @@ Pair_print_gsnap (Filestring_T fp, struct T *pairs_querydir, int npairs, int nse
FPRINTF(fp,"\n,");
- this = ptr;
- exon_querystart = this->querypos + 1;
- exon_genomestart = this->genomepos + 1;
- nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
-
- /* Start of next line */
- for (querypos = 1; querypos < exon_querystart; querypos++) {
- FPRINTF(fp,"-");
+ if (i < npairs) {
+ this = ptr;
+ exon_querystart = this->querypos + 1;
+ exon_genomestart = this->genomepos + 1;
+ nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
+
+ /* Start of next line */
+ for (querypos = 1; querypos < exon_querystart; querypos++) {
+ FPRINTF(fp,"-");
+ }
}
} else {
@@ -4464,9 +4467,9 @@ Pair_print_bedpe (Filestring_T fp, struct T *pairarray, int npairs,
if (i < npairs) {
ptr--;
i--;
+ this = ptr;
}
- this = ptr;
/* exon_querystart = this->querypos + 1; */
exon_genomestart = this->genomepos + 1;
@@ -4506,9 +4509,9 @@ Pair_print_bedpe (Filestring_T fp, struct T *pairarray, int npairs,
if (i < npairs) {
ptr--;
i--;
+ this = ptr;
}
- this = ptr;
/* exon_querystart = this->querypos + 1; */
exon_genomestart = this->genomepos + 1;
@@ -4707,12 +4710,12 @@ Pair_print_m8 (Filestring_T fp, struct T *pairs_querydir, int npairs, bool inver
if (i < npairs) {
ptr--;
i--;
- }
- this = ptr;
- exon_querystart = this->querypos + 1;
- exon_genomestart = this->genomepos + 1;
- nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
+ this = ptr;
+ exon_querystart = this->querypos + 1;
+ exon_genomestart = this->genomepos + 1;
+ nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
+ }
} else if (this->cdna == ' ') {
/* DELETION */
@@ -4733,10 +4736,12 @@ Pair_print_m8 (Filestring_T fp, struct T *pairs_querydir, int npairs, bool inver
print_m8_line(fp,exon_querystart,exon_queryend,chr,exon_genomestart,exon_genomeend,
nmismatches_bothdiff,headerseq,acc_suffix);
- this = ptr;
- exon_querystart = this->querypos + 1;
- exon_genomestart = this->genomepos + 1;
- nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
+ if (i < npairs) {
+ this = ptr;
+ exon_querystart = this->querypos + 1;
+ exon_genomestart = this->genomepos + 1;
+ nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
+ }
} else {
fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
@@ -4847,9 +4852,9 @@ Pair_min_evalue (struct T *pairarray, int npairs) {
if (i < npairs) {
ptr--;
i--;
+ this = ptr;
}
- this = ptr;
exon_querystart = this->querypos + 1;
nmismatches_bothdiff = 0;
@@ -4865,6 +4870,7 @@ Pair_min_evalue (struct T *pairarray, int npairs) {
if (i < npairs) {
ptr--;
i--;
+ this = ptr;
}
/* Finish rest of this line */
@@ -4874,7 +4880,6 @@ Pair_min_evalue (struct T *pairarray, int npairs) {
min_evalue = evalue;
}
- this = ptr;
exon_querystart = this->querypos + 1;
nmismatches_bothdiff = 0;
@@ -5030,13 +5035,13 @@ Pair_guess_cdna_direction_array (int *sensedir, struct T *pairs_querydir, int np
if (i < npairs) {
ptr--;
i--;
+ this = ptr;
}
splice_site_probs(&sense_prob,&antisense_prob,
prev_splicesitep,splicesitep,chroffset,
exon_genomestart,exon_genomeend,watsonp);
- this = ptr;
exon_genomestart = this->genomepos + 1;
} else if (this->cdna == ' ') {
@@ -5052,13 +5057,13 @@ Pair_guess_cdna_direction_array (int *sensedir, struct T *pairs_querydir, int np
if (i < npairs) {
ptr--;
i--;
+ this = ptr;
}
splice_site_probs(&sense_prob,&antisense_prob,
prev_splicesitep,splicesitep,chroffset,
exon_genomestart,exon_genomeend,watsonp);
- this = ptr;
exon_genomestart = this->genomepos + 1;
} else {
diff --git a/src/samflags.h b/src/samflags.h
index ef1984a..4cde5dc 100644
--- a/src/samflags.h
+++ b/src/samflags.h
@@ -1,4 +1,4 @@
-/* $Id: samflags.h 205263 2017-04-13 00:02:34Z twu $ */
+/* $Id: samflags.h 210517 2017-10-12 22:41:05Z twu $ */
#ifndef SAMFLAGS_INCLUDED
#define SAMFLAGS_INCLUDED
@@ -59,7 +59,9 @@
#define ABBREV_CONCORDANT_MULT_XS "CX"
-typedef enum {OUTPUT_NONE,
+typedef enum {OUTPUT_FILE, /* specified by the --output-file option */
+
+ OUTPUT_NONE, /* used when omit_concordant_uniq_p or omit_concordant_mult_p is set */
OUTPUT_NM, /* nomapping */
diff --git a/src/samheader.c b/src/samheader.c
index 9b63bc3..22b0cac 100644
--- a/src/samheader.c
+++ b/src/samheader.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: samheader.c 207321 2017-06-14 19:37:40Z twu $";
+static char rcsid[] = "$Id: samheader.c 210517 2017-10-12 22:41:05Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -26,9 +26,7 @@ SAM_header_open_file (SAM_split_output_type split_output, char *split_output_roo
#endif
char *filename, *suffix;
- if (split_output == OUTPUT_NONE) {
- /* Don't open a file */
- return (FILE *) NULL;
+ if (split_output == OUTPUT_FILE) {
#ifdef USE_MPI
/* output file name is passed in through split_output_root */
@@ -52,6 +50,7 @@ SAM_header_open_file (SAM_split_output_type split_output, char *split_output_roo
write_mode = "w";
}
+ /* split_output_root here is output_file */
filename = (char *) CALLOC(strlen(split_output_root)+1,sizeof(char));
sprintf(filename,"%s",split_output_root);
@@ -65,9 +64,14 @@ SAM_header_open_file (SAM_split_output_type split_output, char *split_output_roo
}
#endif
+ } else if (split_output == OUTPUT_NONE) {
+ /* Don't open a file */
+ return (FILE *) NULL;
+
} else {
switch (split_output) {
- case OUTPUT_NONE: /* Handled above */ abort();
+ case OUTPUT_FILE: /* Handled above */ abort(); break;
+ case OUTPUT_NONE: /* Handled above */ abort(); break;
case OUTPUT_NM: suffix = "nomapping"; break;
diff --git a/src/samprint.c b/src/samprint.c
index 2619c88..bae8906 100644
--- a/src/samprint.c
+++ b/src/samprint.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: samprint.c 207409 2017-06-16 00:45:35Z twu $";
+static char rcsid[] = "$Id: samprint.c 210601 2017-10-14 00:32:35Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1365,10 +1365,15 @@ halfdonor_dinucleotide (char *donor1, char *donor2, Substring_T donor, int sense
int substring_start, substring_end;
genomic = Substring_genomic_refdiff(donor);
- if (sensedir == SENSE_FORWARD) {
+
+ if (genomic == NULL) {
+ *donor1 = *donor2 = 'X';
+
+ } else if (sensedir == SENSE_FORWARD) {
substring_end = Substring_queryend(donor);
*donor1 = toupper(genomic[substring_end]);
*donor2 = toupper(genomic[substring_end+1]);
+
} else {
substring_start = Substring_querystart(donor);
*donor2 = toupper(complCode[(int) genomic[substring_start-2]]);
@@ -1384,10 +1389,15 @@ halfacceptor_dinucleotide (char *acceptor2, char *acceptor1, Substring_T accepto
int substring_start, substring_end;
genomic = Substring_genomic_refdiff(acceptor);
- if (sensedir == SENSE_FORWARD) {
+
+ if (genomic == NULL) {
+ *acceptor1 = *acceptor2 = 'X';
+
+ } else if (sensedir == SENSE_FORWARD) {
substring_start = Substring_querystart(acceptor);
*acceptor2 = toupper(genomic[substring_start-2]);
*acceptor1 = toupper(genomic[substring_start-1]);
+
} else {
substring_end = Substring_queryend(acceptor);
*acceptor1 = toupper(complCode[(int) genomic[substring_end]]);
diff --git a/src/stage2.c b/src/stage2.c
index 65d4f23..2d96613 100644
--- a/src/stage2.c
+++ b/src/stage2.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage2.c 207199 2017-06-12 18:31:34Z twu $";
+static char rcsid[] = "$Id: stage2.c 210511 2017-10-12 21:11:21Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -59,7 +59,8 @@ static char rcsid[] = "$Id: stage2.c 207199 2017-06-12 18:31:34Z twu $";
#define NON_CANONICAL_PENALTY_MIDDLE 4
#define MISS_BEHIND 16
#define GREEDY_ADVANCE 6
-#define FINAL_SCORE_TOLERANCE 8
+#define FINAL_SCORE_TOLERANCE 20 /* Was 8, but missed some paths on Y chromosome */
+#define NONOVERLAPPING_SCORE_TOLERANCE 0.5
#define ENOUGH_CONSECUTIVE 32
@@ -3087,6 +3088,7 @@ intmatrix_2d_free (int ***matrix, int length1) {
typedef struct Cell_T *Cell_T;
struct Cell_T {
int rootposition;
+ int endposition;
int querypos;
int hit;
bool fwdp;
@@ -3102,10 +3104,11 @@ Cell_free (Cell_T *old) {
static Cell_T
-Cell_new (int rootposition, int querypos, int hit, bool fwdp, int score) {
+Cell_new (int rootposition, int endposition, int querypos, int hit, bool fwdp, int score) {
Cell_T new = (Cell_T) MALLOC(sizeof(*new));
new->rootposition = rootposition;
+ new->endposition = endposition;
new->querypos = querypos;
new->hit = hit;
new->fwdp = fwdp;
@@ -3115,6 +3118,29 @@ Cell_new (int rootposition, int querypos, int hit, bool fwdp, int score) {
#endif
+/* Used for the final set of cells, to see if we have non-overlapping paths */
+static int
+Cell_interval_cmp (const void *a, const void *b) {
+ Cell_T x = * (Cell_T *) a;
+ Cell_T y = * (Cell_T *) b;
+
+ if (x->rootposition < y->rootposition) {
+ return -1;
+ } else if (y->rootposition < x->rootposition) {
+ return +1;
+
+ } else if (x->endposition > y->endposition) {
+ return -1;
+ } else if (y->endposition > x->endposition) {
+ return +1;
+
+ } else {
+ return 0;
+ }
+}
+
+
+/* Used for the initial set of cells, to get the end cell for each rootposition */
static int
Cell_rootposition_left_cmp (const void *a, const void *b) {
Cell_T x = * (Cell_T *) a;
@@ -3124,6 +3150,15 @@ Cell_rootposition_left_cmp (const void *a, const void *b) {
return -1;
} else if (y->rootposition < x->rootposition) {
return +1;
+
+#if 0
+ /* Want score ranking, rather than interval ranking here. Otherwise, we don't get the final endposition */
+ } else if (x->endposition < y->endposition) {
+ return -1;
+ } else if (y->endposition < x->endposition) {
+ return +1;
+#endif
+
#if 0
} else if (x->tracei < y->tracei) {
return -1;
@@ -3151,6 +3186,8 @@ Cell_rootposition_left_cmp (const void *a, const void *b) {
}
}
+
+/* Used for the initial set of cells, to get the end cell for each rootposition */
static int
Cell_rootposition_right_cmp (const void *a, const void *b) {
Cell_T x = * (Cell_T *) a;
@@ -3160,6 +3197,15 @@ Cell_rootposition_right_cmp (const void *a, const void *b) {
return -1;
} else if (y->rootposition < x->rootposition) {
return +1;
+
+#if 0
+ /* Want score ranking, rather than interval ranking here. Otherwise, we don't get the final endposition */
+ } else if (x->endposition < y->endposition) {
+ return -1;
+ } else if (y->endposition < x->endposition) {
+ return +1;
+#endif
+
#if 0
} else if (x->tracei < y->tracei) {
return -1;
@@ -3239,8 +3285,9 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
if (links[querypos][hit].fwd_score >= threshold_score) {
rootposition = links[querypos][hit].fwd_rootposition;
/* tracei = links[querypos][hit].fwd_tracei; */
- celllist = Cellpool_push(celllist,cellpool,rootposition,querypos,hit,/*fwdp*/true,
- links[querypos][hit].fwd_score);
+ celllist = Cellpool_push(celllist,cellpool,rootposition,
+ /*endposition*/(int) mappings[querypos][hit],
+ querypos,hit,/*fwdp*/true,links[querypos][hit].fwd_score);
ncells++;
}
}
@@ -3260,6 +3307,7 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
if (favor_right_p == true) {
qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_right_cmp);
} else {
+ /* favor_right_p is always false for GMAP */
qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_left_cmp);
}
@@ -3301,7 +3349,7 @@ Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, i
#else
static Cell_T *
-get_cells_fwd (int *nunique, struct Link_T **links, int **fwd_scores,
+get_cells_fwd (int *nunique, struct Link_T **links, int **fwd_scores, Chrpos_T **mappings,
int querystart, int queryend, int *npositions,
bool favor_right_p, Cellpool_T cellpool) {
Cell_T *sorted, *cells;
@@ -3317,8 +3365,9 @@ get_cells_fwd (int *nunique, struct Link_T **links, int **fwd_scores,
if (fwd_scores[querypos][hit] > 0) {
rootposition = links[querypos][hit].fwd_rootposition;
/* tracei = links[querypos][hit].fwd_tracei; */
- celllist = Cellpool_push(celllist,cellpool,rootposition,querypos,hit,/*fwdp*/true,
- fwd_scores[querypos][hit]);
+ celllist = Cellpool_push(celllist,cellpool,rootposition,
+ /*endposition*/(int) mappings[querypos][hit],
+ querypos,hit,/*fwdp*/true,fwd_scores[querypos][hit]);
ncells++;
}
}
@@ -3338,6 +3387,7 @@ get_cells_fwd (int *nunique, struct Link_T **links, int **fwd_scores,
if (favor_right_p == true) {
qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_right_cmp);
} else {
+ /* favor_right_p is always false for GMAP */
qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_left_cmp);
}
@@ -3420,16 +3470,18 @@ Linkmatrix_get_cells_both (int *nunique, struct Link_T **links, int querystart,
if (links[querypos][hit].fwd_score >= threshold_score) {
rootposition = links[querypos][hit].fwd_rootposition;
/* tracei = links[querypos][hit].fwd_tracei; */
- celllist = Cellpool_push(celllist,cellpool,rootposition,querypos,hit,/*fwdp*/true,
- links[querypos][hit].fwd_score);
+ celllist = Cellpool_push(celllist,cellpool,rootposition,
+ /*endposition*/(int) mappings[querypos][hit],
+ querypos,hit,/*fwdp*/true,links[querypos][hit].fwd_score);
ncells++;
}
#ifdef SEPARATE_FWD_REV
if (links[querypos][hit].rev_score >= threshold_score) {
rootposition = links[querypos][hit].rev_rootposition;
/* tracei = links[querypos][hit].rev_tracei; */
- celllist = Cellpool_push(celllist,cellpool,rootposition,querypos,hit,/*fwdp*/false,
- links[querypos][hit].rev_score);
+ celllist = Cellpool_push(celllist,cellpool,rootposition,
+ /*endposition*/(int) mappings[querypos][hit],
+ querypos,hit,/*fwdp*/false,links[querypos][hit].rev_score);
ncells++;
}
#endif
@@ -3451,6 +3503,7 @@ Linkmatrix_get_cells_both (int *nunique, struct Link_T **links, int querystart,
if (favor_right_p == true) {
qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_right_cmp);
} else {
+ /* favor_right_p is always false for GMAP */
qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_left_cmp);
}
@@ -3541,7 +3594,7 @@ align_compute_scores_lookback (int *ncells, struct Link_T **links, int **fwd_sco
bool localp, bool skip_repetitive_p,
bool use_canonical_p, int non_canonical_penalty, bool debug_graphic_p, bool favor_right_p) {
Cell_T *cells;
- Link_T currlink, prevlink;
+ Link_T currlink;
int curr_querypos, indexsize_nt, indexsize_query, hit, nhits, low_hit, high_hit;
int nskipped, min_hits, specific_querypos, specific_low_hit, specific_high_hit, next_querypos;
Intlist_T processed = NULL;
@@ -3923,7 +3976,7 @@ align_compute_scores_lookback (int *ncells, struct Link_T **links, int **fwd_sco
indexsize,best_overall_score,favor_right_p,cellpool);
}
#else
- cells = get_cells_fwd(&(*ncells),links,fwd_scores,querystart,queryend,npositions,
+ cells = get_cells_fwd(&(*ncells),links,fwd_scores,mappings,querystart,queryend,npositions,
favor_right_p,cellpool);
#endif
@@ -4166,6 +4219,7 @@ align_compute_lookback (Chrpos_T **mappings, int *npositions, int totalpositions
bool fwdp;
int querypos, hit;
int bestscore;
+ int last_endposition;
if (oned_matrix_p == true) {
@@ -4195,6 +4249,8 @@ align_compute_lookback (Chrpos_T **mappings, int *npositions, int totalpositions
anchoredp,anchor_querypos,anchor_position,localp,
skip_repetitive_p,use_canonical_p,non_canonical_penalty,
debug_graphic_p,favor_right_p);
+ /* cells are currently sorted by Cell_score_cmp in get_cells_fwd */
+
#ifdef SEPARATE_FWD_REV
debug1(Linkmatrix_print_both(links,mappings,querylength,npositions,queryseq_ptr,indexsize));
@@ -4206,9 +4262,9 @@ align_compute_lookback (Chrpos_T **mappings, int *npositions, int totalpositions
all_paths = (List_T) NULL;
} else {
+ /* High-scoring paths */
bestscore = cells[0]->score;
-
- debug11(printf("Looping on %d cells, allowing up to %d alignments, plus any with best score %d\n",
+ debug11(printf("** Looping on %d cells, allowing up to %d alignments, plus any with best score %d\n",
ncells,max_nalignments,bestscore));
if (snps_p == true) {
@@ -4218,27 +4274,27 @@ align_compute_lookback (Chrpos_T **mappings, int *npositions, int totalpositions
querypos = cell->querypos;
hit = cell->hit;
fwdp = cell->fwdp;
- debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, position %u\n",
- i,cell->rootposition,cell->score,querypos,hit,mappings[querypos][hit]));
-
+ debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
+
all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
chroffset,chrhigh,/*watsonp*/plusp,
#ifdef DEBUG0
fwd_scores,indexsize,
#endif
pairpool,fwdp));
+ cell->pushedp = true;
}
} else {
for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
&& cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
-
cell = cells[i];
querypos = cell->querypos;
hit = cell->hit;
fwdp = cell->fwdp;
- debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, position %u\n",
- i,cell->rootposition,cell->score,querypos,hit,mappings[querypos][hit]));
+ debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
#if 0
if (debug_graphic_p == true) {
@@ -4260,8 +4316,58 @@ align_compute_lookback (Chrpos_T **mappings, int *npositions, int totalpositions
#endif
pairpool,fwdp));
}
+ }
+
+ /* Non-overlapping paths */
+ debug11(printf("** Looping on %d cells, looking for non-overlapping paths\n",ncells));
+ qsort(cells,ncells,sizeof(Cell_T),Cell_interval_cmp);
+ last_endposition = 0;
+ if (snps_p == true) {
+ for (i = 0; i < ncells; i++) {
+ cell = cells[i];
+ if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
+ cell->rootposition > last_endposition && cell->pushedp == false) {
+ debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
+ querypos = cell->querypos;
+ hit = cell->hit;
+ fwdp = cell->fwdp;
+ all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
+ chroffset,chrhigh,/*watsonp*/plusp,
+#ifdef DEBUG0
+ fwd_scores,indexsize,
+#endif
+ pairpool,fwdp));
+ cell->pushedp = true;
+ last_endposition = cell->endposition;
+ }
+ }
+
+ } else {
+ for (i = 0; i < ncells; i++) {
+ cell = cells[i];
+ if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
+ cell->rootposition > last_endposition && cell->pushedp == false) {
+ debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
+ querypos = cell->querypos;
+ hit = cell->hit;
+ fwdp = cell->fwdp;
+ all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
+#ifdef PMAP
+ chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/true,
+#endif
+#ifdef DEBUG0
+ fwd_scores,indexsize,
+#endif
+ pairpool,fwdp));
+ cell->pushedp = true;
+ last_endposition = cell->endposition;
+ }
+ }
}
+
debug11(printf("\n"));
#if 0
@@ -4313,7 +4419,7 @@ align_compute_scores_lookforward (int *ncells, struct Link_T **links, int **fwd_
bool use_canonical_p, int non_canonical_penalty,
bool debug_graphic_p, bool favor_right_p) {
Cell_T *cells;
- Link_T currlink, prevlink;
+ Link_T currlink;
int curr_querypos, indexsize_nt, indexsize_query, hit, nhits, low_hit, high_hit;
int nskipped, min_hits, specific_querypos, specific_low_hit, specific_high_hit, next_querypos;
Intlist_T processed = NULL;
@@ -4695,7 +4801,7 @@ align_compute_scores_lookforward (int *ncells, struct Link_T **links, int **fwd_
indexsize,best_overall_score,favor_right_p,cellpool);
}
#else
- cells = get_cells_fwd(&(*ncells),links,fwd_scores,querystart,queryend,npositions,
+ cells = get_cells_fwd(&(*ncells),links,fwd_scores,mappings,querystart,queryend,npositions,
favor_right_p,cellpool);
#endif
@@ -4727,6 +4833,7 @@ align_compute_lookforward (Chrpos_T **mappings, int *npositions, int totalpositi
bool fwdp;
int querypos, hit;
int bestscore;
+ int last_endposition;
if (oned_matrix_p == true) {
links = Linkmatrix_1d_new(querylength,npositions,totalpositions);
@@ -4755,6 +4862,7 @@ align_compute_lookforward (Chrpos_T **mappings, int *npositions, int totalpositi
anchoredp,anchor_querypos,anchor_position,localp,
skip_repetitive_p,use_canonical_p,non_canonical_penalty,
debug_graphic_p,favor_right_p);
+ /* cells are currently sorted by Cell_score_cmp in get_cells_fwd */
#ifdef SEPARATE_FWD_REV
debug1(Linkmatrix_print_both(links,mappings,querylength,npositions,queryseq_ptr,indexsize));
@@ -4766,49 +4874,116 @@ align_compute_lookforward (Chrpos_T **mappings, int *npositions, int totalpositi
all_paths = (List_T) NULL;
} else {
+ /* High-scoring paths */
bestscore = cells[0]->score;
-
- debug11(printf("Looping on %d cells, allowing up to %d alignments, plus any with best score %d\n",
+ debug11(printf("** Looping on %d cells, allowing up to %d alignments, plus any with best score %d\n",
ncells,max_nalignments,bestscore));
- for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
- && cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
- cell = cells[i];
- querypos = cell->querypos;
- hit = cell->hit;
- fwdp = cell->fwdp;
- debug11(printf("Starting subpath %d at rootposition %d with score %d, querypos %d, hit %d, position %u\n",
- i,cell->rootposition,cell->score,querypos,hit,mappings[querypos][hit]));
-
-
- if (debug_graphic_p == true) {
- /* best_path_dump_R(links,mappings,querypos,hit,fwdp,"best.path"); */
- printf("plot(all.mers,col=\"black\",pch=\".\",xlab=\"Query\",ylab=\"Genomic\")\n");
- printf("points(active.mers,col=\"red\",pch=\".\")\n");
- printf("points(best.path,col=\"green\",pch=\".\")\n");
- printf("lines(querypos,minactive,col=\"blue\")\n");
- printf("lines(querypos,maxactive,col=\"blue\")\n");
+ if (snps_p == true) {
+ for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
+ && cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
+ cell = cells[i];
+ if (cell->pushedp == false) {
+ querypos = cell->querypos;
+ hit = cell->hit;
+ fwdp = cell->fwdp;
+ debug11(printf("Starting subpath %d at rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
+ all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
+ chroffset,chrhigh,/*watsonp*/plusp,
+#ifdef DEBUG0
+ fwd_scores,indexsize,
+#endif
+ pairpool,fwdp));
+ cell->pushedp = true;
+ }
}
- if (snps_p == true) {
- all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
- chroffset,chrhigh,/*watsonp*/plusp,
-#ifdef DEBUG0
- fwd_scores,indexsize,
+ } else {
+ for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
+ && cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
+ cell = cells[i];
+ if (cell->pushedp == false) {
+ querypos = cell->querypos;
+ hit = cell->hit;
+ fwdp = cell->fwdp;
+ debug11(printf("Starting subpath %d at rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
+
+#if 0
+ if (debug_graphic_p == true) {
+ /* best_path_dump_R(links,mappings,querypos,hit,fwdp,"best.path"); */
+ printf("plot(all.mers,col=\"black\",pch=\".\",xlab=\"Query\",ylab=\"Genomic\")\n");
+ printf("points(active.mers,col=\"red\",pch=\".\")\n");
+ printf("points(best.path,col=\"green\",pch=\".\")\n");
+ printf("lines(querypos,minactive,col=\"blue\")\n");
+ printf("lines(querypos,maxactive,col=\"blue\")\n");
+ }
#endif
- pairpool,fwdp));
- } else {
- all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
+
+ all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
#ifdef PMAP
- chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/false,
+ chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/false,
#endif
#ifdef DEBUG0
- fwd_scores,indexsize,
+ fwd_scores,indexsize,
#endif
- pairpool,fwdp));
+ pairpool,fwdp));
+ cell->pushedp = true;
+ }
}
}
+
+ /* Non-overlapping paths */
+ debug11(printf("** Looping on %d cells, looking for non-overlapping paths\n",ncells));
+ qsort(cells,ncells,sizeof(Cell_T),Cell_interval_cmp);
+ last_endposition = 0;
+ if (snps_p == true) {
+ for (i = 0; i < ncells; i++) {
+ cell = cells[i];
+ if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
+ cell->rootposition > last_endposition && cell->pushedp == false) {
+ debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
+ querypos = cell->querypos;
+ hit = cell->hit;
+ fwdp = cell->fwdp;
+ all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
+ chroffset,chrhigh,/*watsonp*/plusp,
+#ifdef DEBUG0
+ fwd_scores,indexsize,
+#endif
+ pairpool,fwdp));
+ cell->pushedp = true;
+ last_endposition = cell->endposition;
+ }
+ }
+
+ } else {
+ for (i = 0; i < ncells; i++) {
+ cell = cells[i];
+ if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
+ cell->rootposition > last_endposition && cell->pushedp == false) {
+ debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
+ i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
+ querypos = cell->querypos;
+ hit = cell->hit;
+ fwdp = cell->fwdp;
+ all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
+#ifdef PMAP
+ chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/false,
+#endif
+#ifdef DEBUG0
+ fwd_scores,indexsize,
+#endif
+ pairpool,fwdp));
+ cell->pushedp = true;
+ last_endposition = cell->endposition;
+ }
+ }
+ }
+
debug11(printf("\n"));
#if 0
diff --git a/src/stage3.c b/src/stage3.c
index 58d2638..247dc04 100644
--- a/src/stage3.c
+++ b/src/stage3.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3.c 210195 2017-09-29 15:12:33Z twu $";
+static char rcsid[] = "$Id: stage3.c 210509 2017-10-12 21:08:43Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -102,7 +102,7 @@ static char rcsid[] = "$Id: stage3.c 210195 2017-09-29 15:12:33Z twu $";
#define MICROEXON_PROB_MISMATCH 0.80
#define END_MIN_EXONLENGTH 12
-#define END_SUFFICIENT_EXONLENGTH 24 /* Defines length beyond which we can ignore maxintronlen_ends */
+#define END_SUFFICIENT_EXONLENGTH_PCT 0.10 /* Defines length (in terms of percentage of query beyond which we can ignore maxintronlen_ends */
#if 0
/* No longer used. Not sure why it was used before */
@@ -3883,7 +3883,7 @@ trim_end5_indels (List_T pairs, int ambig_end_length,
static List_T
trim_end5_exons (bool *indelp, bool *trim5p, int ambig_end_length, List_T pairs,
Dynprog_T dynprog, Univcoord_T chroffset, Univcoord_T chrhigh,
- char *queryseq_ptr, char *queryuc_ptr,
+ char *queryseq_ptr, char *queryuc_ptr, int querylength,
int cdna_direction, bool watsonp, bool jump_late_p,
Pairpool_T pairpool, double defect_rate) {
List_T path, exon, pairptr, p;
@@ -4023,7 +4023,8 @@ trim_end5_exons (bool *indelp, bool *trim5p, int ambig_end_length, List_T pairs,
#endif
} else {
- if (nmatches < END_SUFFICIENT_EXONLENGTH && splice->genomejump > maxintronlen_ends) {
+ if (nmatches < (int) (END_SUFFICIENT_EXONLENGTH_PCT * (double) querylength) &&
+ splice->genomejump > maxintronlen_ends) {
debug3(printf("End intron is too long, so trimming it\n"));
path = (List_T) NULL;
*trim5p = true;
@@ -4451,7 +4452,8 @@ trim_end3_exons (bool *indelp, bool *trim3p, int ambig_end_length, List_T path,
#endif
} else {
- if (nmatches < END_SUFFICIENT_EXONLENGTH && splice->genomejump > maxintronlen_ends) {
+ if (nmatches < (int) (END_SUFFICIENT_EXONLENGTH_PCT * (double) querylength) &&
+ splice->genomejump > maxintronlen_ends) {
debug3(printf("End intron is too long, so trimming it\n"));
pairs = (List_T) NULL;
*trim3p = true;
@@ -12575,7 +12577,7 @@ path_compute_dir (double *defect_rate, List_T pairs,
static List_T
path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, double *ambig_prob_5,
double defect_rate, List_T pairs, int cdna_direction,
- bool watsonp, bool jump_late_p, char *queryseq_ptr, char *queryuc_ptr,
+ bool watsonp, bool jump_late_p, char *queryseq_ptr, char *queryuc_ptr, int querylength,
Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
int maxpeelback, Pairpool_T pairpool, Dynprog_T dynprogR) {
@@ -12663,7 +12665,7 @@ path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, do
iter1 = 0;
while (iter1 < 5 && trim5p == true) {
pairs = trim_end5_exons(&indelp,&trim5p,*ambig_end_length_5,pairs,dynprogR,chroffset,chrhigh,
- queryseq_ptr,queryuc_ptr,
+ queryseq_ptr,queryuc_ptr,querylength,
cdna_direction,watsonp,jump_late_p,pairpool,defect_rate);
if (indelp == true) {
pairs = trim_end5_indels(pairs,*ambig_end_length_5,dynprogR,chroffset,chrhigh,
@@ -14292,7 +14294,8 @@ path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
maxpeelback,defect_rate,pairpool,dynprogR,
/*extendp*/true,/*endalign*/QUERYEND_NOGAPS);
pairs = trim_end5_exons(&indelp,&trim5p,*ambig_end_length_5,pairs,dynprogR,chroffset,chrhigh,
- queryseq_ptr,queryuc_ptr,*cdna_direction,watsonp,jump_late_p,pairpool,defect_rate);
+ queryseq_ptr,queryuc_ptr,querylength,*cdna_direction,watsonp,
+ jump_late_p,pairpool,defect_rate);
if (indelp == true) {
pairs = trim_end5_indels(pairs,*ambig_end_length_5,dynprogR,chroffset,chrhigh,
queryseq_ptr,queryuc_ptr,
@@ -14614,7 +14617,7 @@ Stage3_compute (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *np
if (all_stage2_starts == NULL) {
best_pairs = path_compute_end5(&fwd_ambig_end_length_5,&fwd_ambig_splicetype_5,&fwd_ambig_prob_5,
defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,watsonp,jump_late_p,
- queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
+ queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,pairpool,dynprogR);
} else {
@@ -14649,7 +14652,7 @@ Stage3_compute (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *np
temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
defect_rate_temp,/*pairs*/List_reverse(path_fwd),
/*cdna_direction*/+1,watsonp,jump_late_p,
- queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
+ queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,pairpool,dynprogR);
if (temp_pairs != NULL && end_compare(best_pairs,temp_pairs,/*cdna_direction*/+1,watsonp,
@@ -14731,7 +14734,7 @@ Stage3_compute (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *np
if (all_stage2_starts == NULL) {
best_pairs = path_compute_end5(&rev_ambig_end_length_5,&rev_ambig_splicetype_5,&rev_ambig_prob_5,
defect_rate_rev,pairs_rev,/*cdna_direction*/-1,watsonp,jump_late_p,
- queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
+ queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,pairpool,dynprogR);
@@ -14758,7 +14761,7 @@ Stage3_compute (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *np
temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
defect_rate_temp,/*pairs*/List_reverse(path_rev),
/*cdna_direction*/-1,watsonp,jump_late_p,
- queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
+ queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
knownsplice_limit_low,knownsplice_limit_high,
maxpeelback,pairpool,dynprogR);
if (temp_pairs != NULL && end_compare(best_pairs,temp_pairs,/*cdna_direction*/-1,watsonp,
diff --git a/src/stage3hr.c b/src/stage3hr.c
index b701757..8f5973e 100644
--- a/src/stage3hr.c
+++ b/src/stage3hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3hr.c 209126 2017-08-15 19:34:28Z twu $";
+static char rcsid[] = "$Id: stage3hr.c 210602 2017-10-14 00:33:11Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -10352,7 +10352,9 @@ Stage3end_eval_and_sort (int npaths, int *first_absmq, int *second_absmq,
stage3array[i]->mapq_loglik -= maxlik;
}
+#if 0
/* Save on computation if possible */
+ /* Not possible, since we are going to select randomly from among all npaths */
if (npaths < maxpaths) {
compute_npaths = npaths;
} else {
@@ -10361,6 +10363,9 @@ Stage3end_eval_and_sort (int npaths, int *first_absmq, int *second_absmq,
if (compute_npaths < 2) {
compute_npaths = 2;
}
+#else
+ compute_npaths = npaths;
+#endif
/* Compute absolute mapq */
for (i = 0; i < compute_npaths; i++) {
@@ -10591,7 +10596,9 @@ Stage3end_eval_and_sort_guided (int npaths, int *first_absmq, int *second_absmq,
stage3array[i]->mapq_loglik -= maxlik;
}
+#if 0
/* Save on computation if possible */
+ /* Not possible, since we are going to select randomly from among all paths */
if (npaths < maxpaths) {
compute_npaths = npaths;
} else {
@@ -10600,6 +10607,9 @@ Stage3end_eval_and_sort_guided (int npaths, int *first_absmq, int *second_absmq,
if (compute_npaths < 2) {
compute_npaths = 2;
}
+#else
+ compute_npaths = npaths;
+#endif
/* Compute absolute mapq */
for (i = 0; i < compute_npaths; i++) {
@@ -15202,6 +15212,8 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, int *amb_resol
*amb_resolve_5 = bingoi5;
*amb_resolve_3 = bingoi3;
*amb_status_inside = AMB_RESOLVED_BYLENGTH;
+ hit5->genomicend = end_ambcoords[bingoi5] + end_amb_length_5;
+ hit3->genomicstart = start_ambcoords[bingoi3] - start_amb_length_3;
} else if (nbest == 0) {
debug9(printf("\nnbest is zero: amb_length = %d...%d",
@@ -15214,6 +15226,8 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, int *amb_resol
*amb_resolve_5 = besti5;
*amb_resolve_3 = besti3;
*amb_status_inside = AMB_RESOLVED_BYMATCHES;
+ hit5->genomicend = end_ambcoords[besti5] + end_amb_length_5;
+ hit3->genomicstart = start_ambcoords[besti3] - start_amb_length_3;
} else {
*amb_resolve_5 = -1; /* Signifies cannot resolve */
@@ -15259,6 +15273,7 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, int *amb_resol
debug9(printf("\nnbingo is 1\n"));
*amb_resolve_5 = bingoi5;
*amb_status_inside = AMB_RESOLVED_BYLENGTH;
+ hit5->genomicend = end_ambcoords[bingoi5] + end_amb_length_5;
} else if (nbest == 0) {
debug9(printf("\nnbest is zero: amb_length = %d...%d",
@@ -15270,6 +15285,7 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, int *amb_resol
debug9(printf("\nnbest is 1, with nmismatches %d\n",best_nmismatches));
*amb_resolve_5 = besti5;
*amb_status_inside = AMB_RESOLVED_BYMATCHES;
+ hit5->genomicend = end_ambcoords[besti5] + end_amb_length_5;
} else {
*amb_resolve_5 = -1;
@@ -15312,6 +15328,7 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, int *amb_resol
debug9(printf("\nnbingo is 1\n"));
*amb_resolve_3 = bingoi3;
*amb_status_inside = AMB_RESOLVED_BYLENGTH;
+ hit3->genomicstart = start_ambcoords[bingoi3] - start_amb_length_3;
} else if (nbest == 0) {
debug9(printf("\nnbest is zero: amb_length = %d...%d",
@@ -15323,6 +15340,7 @@ resolve_inside_ambiguous_splice_plus (int *unresolved_amb_length, int *amb_resol
debug9(printf("\nnbest is 1, with nmismatches %d\n",best_nmismatches));
*amb_resolve_3 = besti3;
*amb_status_inside = AMB_RESOLVED_BYMATCHES;
+ hit3->genomicstart = start_ambcoords[besti3] - start_amb_length_3;
} else {
*amb_resolve_3 = -1;
@@ -15417,6 +15435,8 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, int *amb_reso
*amb_resolve_5 = bingoi5;
*amb_resolve_3 = bingoi3;
*amb_status_inside = AMB_RESOLVED_BYLENGTH;
+ hit5->genomicend = end_ambcoords[bingoi5] - end_amb_length_5;
+ hit3->genomicstart = start_ambcoords[bingoi3] + start_amb_length_3;
} else if (nbest == 0) {
debug9(printf("\nnbest is zero: amb_length = %d...%d",
@@ -15431,6 +15451,8 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, int *amb_reso
*amb_resolve_5 = besti5;
*amb_resolve_3 = besti3;
*amb_status_inside = AMB_RESOLVED_BYMATCHES;
+ hit5->genomicend = end_ambcoords[besti5] - end_amb_length_5;
+ hit3->genomicstart = start_ambcoords[besti3] + start_amb_length_3;
} else {
*amb_resolve_5 = -1; /* Signifies cannot resolve */
@@ -15475,6 +15497,7 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, int *amb_reso
debug9(printf("\nnbingo is 1\n"));
*amb_resolve_5 = bingoi5;
*amb_status_inside = AMB_RESOLVED_BYLENGTH;
+ hit5->genomicend = end_ambcoords[bingoi5] - end_amb_length_5;
} else if (nbest == 0) {
debug9(printf("\nnbest is zero: amb_length = %d...%d",
@@ -15487,6 +15510,7 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, int *amb_reso
debug9(printf("\nnbest is 1, with nmismatches %d\n",best_nmismatches));
*amb_resolve_5 = besti5;
*amb_status_inside = AMB_RESOLVED_BYMATCHES;
+ hit5->genomicend = end_ambcoords[besti5] - end_amb_length_5;
} else {
*amb_resolve_5 = -1;
@@ -15529,6 +15553,7 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, int *amb_reso
debug9(printf("\nnbingo is 1\n"));
*amb_resolve_3 = bingoi3;
*amb_status_inside = AMB_RESOLVED_BYLENGTH;
+ hit3->genomicstart = start_ambcoords[bingoi3] + start_amb_length_3;
} else if (nbest == 0) {
debug9(printf("\nnbest is zero: amb_length = %d...%d",
@@ -15541,6 +15566,7 @@ resolve_inside_ambiguous_splice_minus (int *unresolved_amb_length, int *amb_reso
debug9(printf("\nnbest is 1, with nmismatches %d\n",best_nmismatches));
*amb_resolve_3 = besti3;
*amb_status_inside = AMB_RESOLVED_BYMATCHES;
+ hit3->genomicstart = start_ambcoords[besti3] + start_amb_length_3;
} else {
*amb_resolve_3 = -1;
diff --git a/src/translation.c b/src/translation.c
index d81e423..9ad729f 100644
--- a/src/translation.c
+++ b/src/translation.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: translation.c 210069 2017-09-23 00:16:39Z twu $";
+static char rcsid[] = "$Id: translation.c 210510 2017-10-12 21:09:38Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -198,7 +198,7 @@ static char *initiation_table;
/* Taken from http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi" on April 2016 */
void
-Translation_setup (int translation_code) {
+Translation_setup (int translation_code, bool alt_initiation_codons_p) {
switch (translation_code) {
case 1: /* The Standard Code */
translation_table = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
@@ -300,6 +300,10 @@ Translation_setup (int translation_code) {
exit(9);
}
+ if (alt_initiation_codons_p == false) {
+ initiation_table = "-----------------------------------M----------------------------";
+ }
+
return;
}
diff --git a/src/translation.h b/src/translation.h
index 0ce43b0..f5ad949 100644
--- a/src/translation.h
+++ b/src/translation.h
@@ -1,4 +1,4 @@
-/* $Id: translation.h 188718 2016-04-30 01:53:47Z twu $ */
+/* $Id: translation.h 210510 2017-10-12 21:09:38Z twu $ */
#ifndef TRANSLATION_INCLUDED
#define TRANSLATION_INCLUDED
@@ -12,7 +12,7 @@
typedef struct T *T;
extern void
-Translation_setup (int translation_code);
+Translation_setup (int translation_code, bool alt_initiation_codons_p);
#ifdef PMAP
extern void
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git
More information about the debian-med-commit
mailing list