[med-svn] [gmap] 01/04: Imported Upstream version 2015-12-31.v9
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Fri Mar 18 09:29:32 UTC 2016
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository gmap.
commit e3408d5a15b373f1241d461a0ab57b5a631fcf7d
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Fri Mar 18 09:47:27 2016 +0100
Imported Upstream version 2015-12-31.v9
---
ChangeLog | 15 ++
src/gsnap.c | 20 +-
src/pair.c | 60 ++++--
src/sam_sort.c | 5 +-
src/samread.c | 457 +++++++++++++++++++++++++-------------------
src/stage1hr.c | 18 +-
src/stage1hr.h | 4 +-
src/stage2.c | 37 +++-
src/uniqscan.c | 4 +-
util/gff3_genes.pl.in | 57 ++++--
util/gff3_introns.pl.in | 112 ++++++++---
util/gff3_splicesites.pl.in | 145 +++++++++-----
12 files changed, 615 insertions(+), 319 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 37cee49..af65f81 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2016-03-17 twu
+
+ * gff3_genes.pl.in, gff3_introns.pl.in, gff3_splicesites.pl.in: Merged
+ revision 186095 from trunk to handle parsing of latest NCBI gff3 files
+
+ * pair.c, stage2.c: Changed occurrences of abs() to explicit conditional
+ statements, since abs() can give large integers with -m64 compiler flag
+
+ * gsnap.c, stage1hr.c, stage1hr.h, uniqscan.c: Added option --max-anchors
+
+ * samread.c: Made version consistent with trunk, adding breaks after cases
+ and handling EOF. Added debugging statements.
+
+ * sam_sort.c: Added todo comment
+
2016-02-18 twu
* stage3hr.c: Applied revision 184528 from trunk to check for stage2pairs
diff --git a/src/gsnap.c b/src/gsnap.c
index bf91e19..add5511 100644
--- a/src/gsnap.c
+++ b/src/gsnap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gsnap.c 181923 2016-01-08 00:43:56Z twu $";
+static char rcsid[] = "$Id: gsnap.c 186090 2016-03-17 22:21:17Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -330,6 +330,9 @@ static Width_T required_index1interval = 0;
static Width_T spansize;
static int indexdb_size_threshold;
+/* Option for GSNAP complete set algorithm. Affects speed significantly. */
+static int max_anchors = 10;
+
/* Genes IIT */
static char *genes_file = (char *) NULL;
@@ -555,6 +558,7 @@ static struct option long_options[] = {
{"runlengthdir", required_argument, 0, 0}, /* user_runlengthdir */
{"use-runlength", required_argument, 0, 0}, /* runlength_root */
+ {"max-anchors", required_argument, 0, 0}, /* max_anchors */
{"gmap-mode", required_argument, 0, 0}, /* gmap_mode */
{"trigger-score-for-gmap", required_argument, 0, 0}, /* trigger_score_for_gmap */
{"gmap-min-match-length", required_argument, 0, 0}, /* gmap_min_nconsecutive */
@@ -1728,6 +1732,9 @@ parse_command_line (int argc, char *argv[], int optind) {
} else if (!strcmp(long_name,"use-runlength")) {
runlength_root = optarg;
+ } else if (!strcmp(long_name,"max-anchors")) {
+ max_anchors = atoi(check_valid_int(optarg));
+
} else if (!strcmp(long_name,"gmap-mode")) {
gmap_mode = 0; /* Initialize */
string = strtok(optarg,",");
@@ -3230,7 +3237,7 @@ worker_setup (char *genomesubdir, char *fileroot) {
/*snpp*/snps_iit ? true : false,amb_closest_p,amb_clip_p,min_shortend);
Splice_setup(min_shortend);
Indel_setup(min_indel_end_matches,indel_penalty_middle);
- Stage1hr_setup(use_sarray_p,use_only_sarray_p,index1part,index1interval,spansize,chromosome_iit,nchromosomes,
+ Stage1hr_setup(use_sarray_p,use_only_sarray_p,index1part,index1interval,spansize,max_anchors,chromosome_iit,nchromosomes,
genomecomp,genomecomp_alt,mode,maxpaths_search,
splicesites,splicetypes,splicedists,nsplicesites,
novelsplicingp,knownsplicingp,find_dna_chimeras_p,distances_observed_p,
@@ -4120,6 +4127,15 @@ is still designed to be fast.\n\
");
#endif
+ fprintf(stdout,"\
+ --max-anchors=INT Controls number of candidate segments returned by the complete set algorithm\n\
+ Default is 10. Can be increased to higher values to solve alignments with\n\
+ evenly spaced mismatches at close distances. However, higher values will\n\
+ cause GSNAP to run more slowly. A value of 1000, for example, slows down\n\
+ the program by a factor of 10 or so. Therefore, change this value only if\n\
+ absolutely necessary.\n\
+");
+
fprintf(stdout,"\n");
fprintf(stdout,"Options for GMAP alignment within GSNAP\n");
fprintf(stdout,"\
diff --git a/src/pair.c b/src/pair.c
index 3fd1649..56e3894 100644
--- a/src/pair.c
+++ b/src/pair.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: pair.c 179365 2015-11-20 22:15:56Z twu $";
+static char rcsid[] = "$Id: pair.c 186092 2016-03-17 22:28:37Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -853,7 +853,13 @@ add_intronlengths (struct T *pairs, int npairs) {
}
if (comp == DUALBREAK_COMP || comp == EXTRAEXON_COMP) {
- sprintf(cdnabreak,"%d",abs(this->querypos - last_querypos)-1);
+ /* abs() gives a large value when flag -m64 is specified */
+ /* sprintf(cdnabreak,"%d",abs(this->querypos - last_querypos)-1); */
+ if (this->querypos > last_querypos) {
+ sprintf(cdnabreak,"%d",(this->querypos - last_querypos) - 1);
+ } else {
+ sprintf(cdnabreak,"%d",(last_querypos - this->querypos) - 1);
+ }
if (this->genomepos < last_genomepos) {
sprintf(genomicbreak,"%d",last_genomepos - this->genomepos - 1);
} else {
@@ -2561,7 +2567,7 @@ print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int path
int pctidentity, num = 0, den = 0, exonno = 0, i;
int Mlength = 0, Ilength = 0, Dlength = 0;
List_T tokens = NULL;
- char token[10];
+ char token[11];
#if 0
int intronno = 0;
#endif
@@ -2631,7 +2637,14 @@ print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int path
}
if (gff_estmatch_format_p == true && i > 0) {
- sprintf(token,"N%u",abs(intron_end - intron_start) + 1);
+ /* abs() gives a large value when flag -m64 is specified */
+ /* sprintf(token,"N%u",abs(intron_end - intron_start) + 1); */
+ if (intron_end > intron_start) {
+ sprintf(token,"N%u",(intron_end - intron_start) + 1);
+ } else {
+ sprintf(token,"N%u",(intron_start - intron_end) + 1);
+ }
+
tokens = push_token(tokens,token);
} else if (gff_introns_p == true) {
if (i > 0) {
@@ -5137,7 +5150,7 @@ hardclip_pairs (int *clipped_npairs, int hardclip_start, int hardclip_end,
List_T
Pair_clean_cigar (List_T tokens, bool watsonp) {
List_T clean, unique = NULL, p;
- char token[10], *curr_token, *last_token;
+ char token[11], *curr_token, *last_token;
int length = 0;
char type, last_type = ' ';
bool duplicatep = false;
@@ -5222,7 +5235,7 @@ List_T
Pair_compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
bool watsonp, int sensedir, int chimera_part) {
List_T tokens = NULL;
- char token[10];
+ char token[11];
int Mlength = 0, Ilength = 0, Dlength = 0;
bool in_exon = false, deletionp;
struct T *ptr, *prev, *this = NULL;
@@ -5322,7 +5335,13 @@ Pair_compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struc
if (prev != NULL) {
/* Gap */
- genome_gap = abs(intron_end - intron_start) + 1;
+ /* abs() gives a large value when flag -m64 is specified */
+ /* genome_gap = abs(intron_end - intron_start) + 1; */
+ if (intron_end > intron_start) {
+ genome_gap = (intron_end - intron_start) + 1;
+ } else {
+ genome_gap = (intron_start - intron_end) + 1;
+ }
deletionp = false;
#ifdef CONVERT_INTRONS_TO_DELETIONS
@@ -5843,7 +5862,7 @@ state_print (MD_state_T state) {
static List_T
compute_md_string_old (int *nmismatches, struct T *pairs, int npairs, bool watsonp) {
List_T tokens = NULL;
- char token[10], *first_token;
+ char token[11], *first_token;
int nmatches = 0;
struct T *ptr, *prev, *this = NULL;
MD_state_T state = IN_MISMATCHES;
@@ -6016,7 +6035,7 @@ static List_T
compute_md_string (int *nmismatches_refdiff, int *nmismatches_bothdiff, int *nindels,
struct T *pairs, int npairs, bool watsonp, List_T cigar_tokens) {
List_T md_tokens = NULL, p;
- char *cigar_token, token[10], *first_token, type;
+ char *cigar_token, token[11], *first_token, type;
Pair_T this;
int nmatches = 0, length;
MD_state_T state = IN_MISMATCHES;
@@ -6508,7 +6527,12 @@ count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tSta
nblocks++;
block_queryend = last_querypos;
debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
- *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1);
+ /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
+ if (block_queryend > block_querystart) {
+ *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
+ } else {
+ *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
+ }
in_block = false;
}
} else if (this->comp == INTRONGAP_COMP) {
@@ -6519,7 +6543,12 @@ count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tSta
nblocks++;
block_queryend = last_querypos;
debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
- *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1);
+ /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
+ if (block_queryend > block_querystart) {
+ *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
+ } else {
+ *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
+ }
in_block = false;
}
@@ -6555,7 +6584,12 @@ count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tSta
nblocks++;
block_queryend = last_querypos;
debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
- *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1);
+ /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
+ if (block_queryend > block_querystart) {
+ *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
+ } else {
+ *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
+ }
}
*blockSizes = Intlist_reverse(*blockSizes);
@@ -8140,7 +8174,7 @@ Pair_print_compressed (Filestring_T fp, int pathnum, int npaths, T start, T end,
Chrpos_T exon_genomestart = -1, exon_genomeend, intron_start, intron_end;
int num = 0, den = 0, runlength = 0, i;
int print_dinucleotide_p;
- char token[10], donor[3], acceptor[3], *chr;
+ char token[11], donor[3], acceptor[3], *chr;
double coverage;
/* double trimmed_coverage; */
int last_querypos = -1;
diff --git a/src/sam_sort.c b/src/sam_sort.c
index 9438ad7..a49097b 100644
--- a/src/sam_sort.c
+++ b/src/sam_sort.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: sam_sort.c 155408 2014-12-16 07:00:44Z twu $";
+static char rcsid[] = "$Id: sam_sort.c 186087 2016-03-17 22:12:51Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -27,6 +27,9 @@ static char rcsid[] = "$Id: sam_sort.c 155408 2014-12-16 07:00:44Z twu $";
#include "getopt.h"
+/* To do: Check of reverse complements of paired-reads work properly.
+ May want to use low/high instead of first/second read */
+
/************************************************************************
*
* Check for correctness:
diff --git a/src/samread.c b/src/samread.c
index 440a380..a3253b0 100644
--- a/src/samread.c
+++ b/src/samread.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: samread.c 154570 2014-12-03 22:06:33Z twu $";
+static char rcsid[] = "$Id: samread.c 186088 2016-03-17 22:13:46Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -25,6 +25,13 @@ static char rcsid[] = "$Id: samread.c 154570 2014-12-03 22:06:33Z twu $";
#define debug(x)
#endif
+/* Prints location of parsing error for Unexpected output type */
+#ifdef DEBUG1
+#define debug1(x) x
+#else
+#define debug1(x)
+#endif
+
static int
cigar_string_readlength (int *hardclip_low, int *hardclip_high, char *cigar) {
@@ -149,10 +156,10 @@ Samread_get_acc (unsigned int *flag, char *line) {
/* Takes advantage of the fact that sam_sort knows the linelength */
char *
Samread_get_acc_fromfile (int *acclength, FILE *fp, int linelength) {
- char *acc, *p;
+ char *acc, *p, c;
p = acc = MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0'; /* terminating char */
*acclength = (p - acc)/sizeof(char);
@@ -179,9 +186,11 @@ parse_XO_fromfile (FILE *fp) {
if (abbrev1 == 'M') {
return OUTPUT_NM;
} else {
+ debug1(fprintf(stderr,"parse_XO_from_file 1: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
return OUTPUT_NONE;
}
+ break;
case 'C':
switch (abbrev1) {
case 'U': return OUTPUT_CU;
@@ -189,8 +198,11 @@ parse_XO_fromfile (FILE *fp) {
case 'T': return OUTPUT_CT;
case 'M': return OUTPUT_CM;
case 'X': return OUTPUT_CX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 2: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
}
+ break;
case 'H':
switch (abbrev1) {
case 'U': return OUTPUT_HU;
@@ -198,8 +210,11 @@ parse_XO_fromfile (FILE *fp) {
case 'T': return OUTPUT_HT;
case 'M': return OUTPUT_HM;
case 'X': return OUTPUT_HX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 3: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
}
+ break;
case 'U':
switch (abbrev1) {
case 'U': return OUTPUT_UU;
@@ -207,8 +222,11 @@ parse_XO_fromfile (FILE *fp) {
case 'T': return OUTPUT_UT;
case 'M': return OUTPUT_UM;
case 'X': return OUTPUT_UX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 4: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
}
+ break;
case 'P':
switch (abbrev1) {
case 'C': return OUTPUT_PC;
@@ -217,72 +235,91 @@ parse_XO_fromfile (FILE *fp) {
case 'L': return OUTPUT_PL;
case 'M': return OUTPUT_PM;
case 'X': return OUTPUT_PX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 5: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
}
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 6: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
}
}
- while (c != '\0') {
- while ((c = fgetc(fp)) != '\0' && c != '\t') ;
- if (c == '\0') {
- return OUTPUT_NONE;
- } else {
- c0 = fgetc(fp);
- c1 = fgetc(fp);
- if (c0 == 'X' && c1 == 'O') {
- fgetc(fp); /* : */
- fgetc(fp); /* type */
- fgetc(fp); /* : */
- abbrev0 = fgetc(fp);
- abbrev1 = fgetc(fp);
- switch (abbrev0) {
- case 'N':
- if (abbrev1 == 'M') {
- return OUTPUT_NM;
- } else {
- fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
- return OUTPUT_NONE;
- }
- case 'C':
- switch (abbrev1) {
- case 'U': return OUTPUT_CU;
- case 'C': return OUTPUT_CC;
- case 'T': return OUTPUT_CT;
- case 'M': return OUTPUT_CM;
- case 'X': return OUTPUT_CX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
- }
- case 'H':
- switch (abbrev1) {
- case 'U': return OUTPUT_HU;
- case 'C': return OUTPUT_HC;
- case 'T': return OUTPUT_HT;
- case 'M': return OUTPUT_HM;
- case 'X': return OUTPUT_HX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
- }
- case 'U':
- switch (abbrev1) {
- case 'U': return OUTPUT_UU;
- case 'C': return OUTPUT_UC;
- case 'T': return OUTPUT_UT;
- case 'M': return OUTPUT_UM;
- case 'X': return OUTPUT_UX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
- }
- case 'P':
- switch (abbrev1) {
- case 'C': return OUTPUT_PC;
- case 'I': return OUTPUT_PI;
- case 'S': return OUTPUT_PS;
- case 'L': return OUTPUT_PL;
- case 'M': return OUTPUT_PM;
- case 'X': return OUTPUT_PX;
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
- }
- default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ if (c == EOF || c == '\0') {
+ return OUTPUT_NONE;
+ } else {
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == 'X' && c1 == 'O') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ abbrev0 = fgetc(fp);
+ abbrev1 = fgetc(fp);
+ switch (abbrev0) {
+ case 'N':
+ if (abbrev1 == 'M') {
+ return OUTPUT_NM;
+ } else {
+ debug1(fprintf(stderr,"parse_XO_from_file 7: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ return OUTPUT_NONE;
}
+ break;
+ case 'C':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_CU;
+ case 'C': return OUTPUT_CC;
+ case 'T': return OUTPUT_CT;
+ case 'M': return OUTPUT_CM;
+ case 'X': return OUTPUT_CX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 8: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'H':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_HU;
+ case 'C': return OUTPUT_HC;
+ case 'T': return OUTPUT_HT;
+ case 'M': return OUTPUT_HM;
+ case 'X': return OUTPUT_HX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 9: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'U':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_UU;
+ case 'C': return OUTPUT_UC;
+ case 'T': return OUTPUT_UT;
+ case 'M': return OUTPUT_UM;
+ case 'X': return OUTPUT_UX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 10: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'P':
+ switch (abbrev1) {
+ case 'C': return OUTPUT_PC;
+ case 'I': return OUTPUT_PI;
+ case 'S': return OUTPUT_PS;
+ case 'L': return OUTPUT_PL;
+ case 'M': return OUTPUT_PM;
+ case 'X': return OUTPUT_PX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 11: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 12: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
}
}
}
@@ -310,7 +347,7 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
fgetc(fp); /* : */
p = *hiti;
- while ((c = *p++ = fgetc(fp)) != '\0' && c != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
*--p = '\0'; /* terminating char */
} else if (c0 == 'X' && c1 == 'O') {
@@ -324,9 +361,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
if (abbrev1 == 'M') {
split_output = OUTPUT_NM;
} else {
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 1: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
split_output = OUTPUT_NONE;
}
+ break;
case 'C':
switch (abbrev1) {
case 'U': split_output = OUTPUT_CU; break;
@@ -335,9 +374,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
case 'M': split_output = OUTPUT_CM; break;
case 'X': split_output = OUTPUT_CX; break;
default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 2: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
split_output = OUTPUT_NONE;
}
+ break;
case 'H':
switch (abbrev1) {
case 'U': split_output = OUTPUT_HU; break;
@@ -346,9 +387,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
case 'M': split_output = OUTPUT_HM; break;
case 'X': split_output = OUTPUT_HX; break;
default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 3: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
split_output = OUTPUT_NONE;
}
+ break;
case 'U':
switch (abbrev1) {
case 'U': split_output = OUTPUT_UU; break;
@@ -357,9 +400,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
case 'M': split_output = OUTPUT_UM; break;
case 'X': split_output = OUTPUT_UX; break;
default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 4: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
split_output = OUTPUT_NONE;
}
+ break;
case 'P':
switch (abbrev1) {
case 'C': split_output = OUTPUT_PC; break;
@@ -369,94 +414,106 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
case 'M': split_output = OUTPUT_PM; break;
case 'X': split_output = OUTPUT_PX; break;
default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 5: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
split_output = OUTPUT_NONE;
}
+ break;
default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 6: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
split_output = OUTPUT_NONE;
}
}
- while (c != '\0') {
- while ((c = fgetc(fp)) != '\0' && c != '\t') ;
- if (c == '\0') {
- return split_output;
- } else {
- c0 = fgetc(fp);
- c1 = fgetc(fp);
- if (c0 == 'H' && c1 == 'I') {
- fgetc(fp); /* : */
- fgetc(fp); /* type */
- fgetc(fp); /* : */
-
- p = *hiti;
- while ((c = *p++ = fgetc(fp)) != '\0' && c != '\t') ;
- *--p = '\0'; /* terminating char */
-
- } else if (c0 == 'X' && c1 == 'O') {
- fgetc(fp); /* : */
- fgetc(fp); /* type */
- fgetc(fp); /* : */
- abbrev0 = fgetc(fp);
- abbrev1 = fgetc(fp);
- switch (abbrev0) {
- case 'N':
- if (abbrev1 == 'M') {
- split_output = OUTPUT_NM;
- } else {
- fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
- split_output = OUTPUT_NONE;
- }
- case 'C':
- switch (abbrev1) {
- case 'U': split_output = OUTPUT_CU; break;
- case 'C': split_output = OUTPUT_CC; break;
- case 'T': split_output = OUTPUT_CT; break;
- case 'M': split_output = OUTPUT_CM; break;
- case 'X': split_output = OUTPUT_CX; break;
- default:
- fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
- split_output = OUTPUT_NONE;
- }
- case 'H':
- switch (abbrev1) {
- case 'U': split_output = OUTPUT_HU; break;
- case 'C': split_output = OUTPUT_HC; break;
- case 'T': split_output = OUTPUT_HT; break;
- case 'M': split_output = OUTPUT_HM; break;
- case 'X': split_output = OUTPUT_HX; break;
- default:
- fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
- split_output = OUTPUT_NONE;
- }
- case 'U':
- switch (abbrev1) {
- case 'U': split_output = OUTPUT_UU; break;
- case 'C': split_output = OUTPUT_UC; break;
- case 'T': split_output = OUTPUT_UT; break;
- case 'M': split_output = OUTPUT_UM; break;
- case 'X': split_output = OUTPUT_UX; break;
- default:
- fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
- split_output = OUTPUT_NONE;
- }
- case 'P':
- switch (abbrev1) {
- case 'C': split_output = OUTPUT_PC; break;
- case 'I': split_output = OUTPUT_PI; break;
- case 'S': split_output = OUTPUT_PS; break;
- case 'L': split_output = OUTPUT_PL; break;
- case 'M': split_output = OUTPUT_PM; break;
- case 'X': split_output = OUTPUT_PX; break;
- default:
- fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
- split_output = OUTPUT_NONE;
- }
+ while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ if (c == EOF || c == '\0') {
+ return split_output;
+ } else {
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == 'H' && c1 == 'I') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+
+ p = *hiti;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ *--p = '\0'; /* terminating char */
+
+ } else if (c0 == 'X' && c1 == 'O') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ abbrev0 = fgetc(fp);
+ abbrev1 = fgetc(fp);
+ switch (abbrev0) {
+ case 'N':
+ if (abbrev1 == 'M') {
+ split_output = OUTPUT_NM;
+ } else {
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 7: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'C':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_CU; break;
+ case 'C': split_output = OUTPUT_CC; break;
+ case 'T': split_output = OUTPUT_CT; break;
+ case 'M': split_output = OUTPUT_CM; break;
+ case 'X': split_output = OUTPUT_CX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 8: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'H':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_HU; break;
+ case 'C': split_output = OUTPUT_HC; break;
+ case 'T': split_output = OUTPUT_HT; break;
+ case 'M': split_output = OUTPUT_HM; break;
+ case 'X': split_output = OUTPUT_HX; break;
default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 9: "));
fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
split_output = OUTPUT_NONE;
}
+ break;
+ case 'U':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_UU; break;
+ case 'C': split_output = OUTPUT_UC; break;
+ case 'T': split_output = OUTPUT_UT; break;
+ case 'M': split_output = OUTPUT_UM; break;
+ case 'X': split_output = OUTPUT_UX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 10: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'P':
+ switch (abbrev1) {
+ case 'C': split_output = OUTPUT_PC; break;
+ case 'I': split_output = OUTPUT_PI; break;
+ case 'S': split_output = OUTPUT_PS; break;
+ case 'L': split_output = OUTPUT_PL; break;
+ case 'M': split_output = OUTPUT_PM; break;
+ case 'X': split_output = OUTPUT_PX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 11: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 12: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
}
}
}
@@ -471,7 +528,7 @@ char *
Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM_split_output_type *split_output,
char **hiti, Univcoord_T *genomicpos, int *initial_softclip, bool *query_lowp,
FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) {
- char *acc, *p;
+ char *acc, *p, c;
char *substring;
Chrnum_T chrnum, mate_chrnum;
Chrpos_T chrpos, mate_chrpos;
@@ -481,13 +538,13 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM
/* 1. QNAME */
p = acc = MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0'; /* terminating char */
*acclength = (p - acc)/sizeof(char);
/* 2. FLAG. Skip */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&(*flag)) != 1) {
@@ -499,13 +556,15 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM
/* 3. RNAME: chr. Skip */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (!strcmp(substring,"*")) {
*genomicpos = 0;
} else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
exit(9);
} else {
*genomicpos = chroffsets[chrnum - 1];
@@ -513,7 +572,7 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM
/* 4. POS: chrpos */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&chrpos) != 1) {
@@ -524,11 +583,11 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM
}
/* 5. MAPQ: Mapping quality. Skip */
- while (fgetc(fp) != '\t') ;
+ while ((c = fgetc(fp)) != EOF && c != '\t') ;
/* 6. CIGAR. Parse for initial_softclip */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
*initial_softclip = cigar_string_initial_softclip(substring);
@@ -536,7 +595,7 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM
/* 7. MRNM: Mate chr */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (!strcmp(substring,"*")) {
@@ -546,6 +605,8 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM
mate_genomicpos = chroffsets[mate_chrnum - 1];
} else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
exit(9);
} else {
mate_genomicpos = chroffsets[mate_chrnum - 1];
@@ -553,7 +614,7 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM
/* 8. MPOS: Mate chrpos */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%d",&mate_chrpos) != 1) {
@@ -779,7 +840,7 @@ void
Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq, char **chr, Chrpos_T *chrpos, char **cigar,
char **mate_chr, Chrpos_T *mate_chrpos_low,
int *readlength, char **read, char **quality_string, int linelength) {
- char *p;
+ char *p, c;
int length, i;
char *substring;
@@ -789,12 +850,12 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq
/* 1. QNAME */
p = *acc = (char *) MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
/* 2. FLAG */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&(*flag)) != 1) {
@@ -806,12 +867,12 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq
/* 3. RNAME: chr */
p = *chr = (char *) MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
/* 4. POS: chrpos */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&(*chrpos)) != 1) {
@@ -823,7 +884,7 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq
/* 5. MAPQ: Mapping quality */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%d",&(*mapq)) != 1) {
@@ -835,17 +896,17 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq
/* 5. CIGAR */
p = *cigar = (char *) MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
/* 7. MRNM: Mate chr */
p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
/* 8. MPOS: Mate chrpos */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%d",&(*mate_chrpos_low)) != 1) {
@@ -860,13 +921,13 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq
/* 10. SEQ: queryseq */
p = *read = (char *) MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
*readlength = (p - *read)/sizeof(char);
/* 11. QUAL: quality scores */
p = *quality_string = (char *) MALLOC((*readlength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
length = (p - *quality_string)/sizeof(char);
@@ -914,7 +975,7 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu
Univcoord_T genomicpos;
Chrnum_T chrnum;
Chrpos_T chrpos;
- char *substring, *p;
+ char *substring, *p, c;
substring = MALLOCA((linelength + 1) * sizeof(char));
@@ -923,7 +984,7 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu
/* 2. FLAG */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&(*flag)) != 1) {
@@ -935,13 +996,15 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu
/* 3. RNAME: chr */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (!strcmp(substring,"*")) {
genomicpos = 0;
} else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
exit(9);
} else {
genomicpos = chroffsets[chrnum - 1];
@@ -949,7 +1012,7 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu
/* 4. POS: chrpos */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&chrpos) != 1) {
@@ -992,22 +1055,20 @@ char *
Samread_parse_aux_fromfile (FILE *fp, char *auxfield, int linelength) {
char *value, *p, c = 1, c0, c1;
- while (c != '\0') {
- while ((c = fgetc(fp)) != '\0' && c != '\t') ;
- if (c == '\0') {
- return (char *) NULL;
- } else {
- c0 = fgetc(fp);
- c1 = fgetc(fp);
- if (c0 == auxfield[0] && c1 == auxfield[1]) {
- fgetc(fp); /* : */
- fgetc(fp); /* type */
- fgetc(fp); /* : */
- p = value = MALLOC((linelength+1) * sizeof(char));
- while ((c = *p++ = fgetc(fp)) != '\0' && c != '\t') ;
- *--p = '\0'; /* terminating char */
- return value;
- }
+ while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ if (c == '\0') {
+ return (char *) NULL;
+ } else {
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == auxfield[0] && c1 == auxfield[1]) {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ p = value = MALLOC((linelength+1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ *--p = '\0'; /* terminating char */
+ return value;
}
}
@@ -1032,7 +1093,7 @@ Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **m
/* 2. FLAG */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&(*flag)) != 1) {
@@ -1053,7 +1114,7 @@ Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **m
/* 6. CIGAR. Parse for cigar_readlength. */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
/* For a nomapper, this readlength is incorrect */
@@ -1062,12 +1123,12 @@ Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **m
/* 7. MRNM: Mate chr */
p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char));
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
/* 8. MPOS: Mate chrpos */
w p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%d",&(*mate_chrpos)) != 1) {
@@ -1085,7 +1146,7 @@ w p = substring;
p = &((*read)[hardclip_low]);
subseq_length = 0;
- while ((*p++ = fgetc(fp)) != '\t') {
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
subseq_length++;
}
@@ -1143,7 +1204,7 @@ Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char
/* 2. FLAG. Skip */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%u",&(*flag)) != 1) {
@@ -1164,7 +1225,7 @@ Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char
/* 6. CIGAR. Parse for cigar_readlength. */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
/* For a nomapper, this readlength is incorrect */
@@ -1185,7 +1246,7 @@ Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char
p = &((*read)[hardclip_low]);
subseq_length = 0;
- while ((*p++ = fgetc(fp)) != '\t') {
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
subseq_length++;
}
@@ -1235,7 +1296,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni
Univcoord_T mate_genomicpos;
Chrpos_T mate_chrpos;
Chrnum_T chrnum, mate_chrnum;
- char *p;
+ char *p, c;
char *substring;
substring = MALLOCA((linelength + 1) * sizeof(char));
@@ -1248,7 +1309,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni
/* 3. RNAME: chr */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (!strcmp(substring,"*")) {
@@ -1268,7 +1329,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni
/* 7. MRNM: Mate chr */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (!strcmp(substring,"*")) {
@@ -1278,6 +1339,8 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni
mate_genomicpos = chroffsets[mate_chrnum - 1];
} else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
exit(9);
} else {
mate_genomicpos = chroffsets[mate_chrnum - 1];
@@ -1285,7 +1348,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni
/* 8. MPOS: Mate chrpos */
p = substring;
- while ((*p++ = fgetc(fp)) != '\t') ;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
*--p = '\0';
if (sscanf(substring,"%d",&mate_chrpos) != 1) {
@@ -1406,7 +1469,7 @@ Samread_print_as_duplicate_fromfile (FILE *fp, int linelength) {
unsigned int flag;
/* 1. QNAME */
- while ((c = fgetc(fp)) != '\t') {
+ while ((c = fgetc(fp)) != EOF && c != '\t') {
putchar(c);
nread++;
}
@@ -1415,7 +1478,7 @@ Samread_print_as_duplicate_fromfile (FILE *fp, int linelength) {
/* 2. FLAG */
p = buffer;
- while ((*p++ = fgetc(fp)) != '\t') {
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
nread++;
}
nread++;
diff --git a/src/stage1hr.c b/src/stage1hr.c
index bf6a34f..c5dfbeb 100644
--- a/src/stage1hr.c
+++ b/src/stage1hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage1hr.c 182407 2016-01-15 17:41:06Z twu $";
+static char rcsid[] = "$Id: stage1hr.c 186090 2016-03-17 22:21:17Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -96,7 +96,6 @@ static char rcsid[] = "$Id: stage1hr.c 182407 2016-01-15 17:41:06Z twu $";
#define MAX_NTERMINALS 100
#define MAX_ALLOCATION 200
-#define MAX_ANCHORS 1000
static bool use_sarray_p = true;
static bool use_only_sarray_p = true;
@@ -188,6 +187,8 @@ static int minendexon = 9;
static int index1part;
static int index1interval;
static int spansize;
+static int max_anchors;
+
static int two_index1intervals;
static int min_kmer_readlength;
static Univ_IIT_T chromosome_iit;
@@ -4487,12 +4488,12 @@ identify_all_segments (int *nsegments, List_T *anchor_segments, Segment_T **spli
debug(printf("%d all segments\n",List_length(all_segments)));
debug(printf("%d anchor segments\n",List_length(*anchor_segments)));
- if (List_length(all_segments) <= MAX_ANCHORS) {
+ if (List_length(all_segments) <= max_anchors) {
/* Might as well use all segments */
List_free(&(*anchor_segments));
*anchor_segments = List_reverse(all_segments);
- } else if (List_length(*anchor_segments) <= MAX_ANCHORS) {
+ } else if (List_length(*anchor_segments) <= max_anchors) {
/* Use only the good anchor segments */
List_free(&all_segments);
*anchor_segments = List_reverse(*anchor_segments);
@@ -4506,9 +4507,9 @@ identify_all_segments (int *nsegments, List_T *anchor_segments, Segment_T **spli
List_free(&(*anchor_segments));
*anchor_segments = (List_T) NULL;
- length_threshold = array[MAX_ANCHORS]->querypos3 - array[MAX_ANCHORS]->querypos5;
- n = MAX_ANCHORS;
- while (n < nanchors && n < MAX_ANCHORS + /*ties*/100 && array[n]->querypos3 - array[n]->querypos5 == length_threshold) {
+ length_threshold = array[max_anchors]->querypos3 - array[max_anchors]->querypos5;
+ n = max_anchors;
+ while (n < nanchors && n < max_anchors + /*ties*/100 && array[n]->querypos3 - array[n]->querypos5 == length_threshold) {
n++;
}
@@ -21081,7 +21082,7 @@ Stage1hr_cleanup () {
void
Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_in, int index1interval_in,
- int spansize_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
+ int spansize_in, int max_anchors_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
Genome_T genome_in, Genome_T genomealt, Mode_T mode_in, int maxpaths_search_in,
Univcoord_T *splicesites_in, Splicetype_T *splicetypes_in,
@@ -21109,6 +21110,7 @@ Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_
index1interval = index1interval_in;
two_index1intervals = index1interval_in + index1interval_in;
spansize = spansize_in;
+ max_anchors = max_anchors_in;
min_kmer_readlength = index1part_in + index1interval_in - 1;
chromosome_iit = chromosome_iit_in;
diff --git a/src/stage1hr.h b/src/stage1hr.h
index 84ba163..5bd43f6 100644
--- a/src/stage1hr.h
+++ b/src/stage1hr.h
@@ -1,4 +1,4 @@
-/* $Id: stage1hr.h 173896 2015-09-12 00:11:40Z twu $ */
+/* $Id: stage1hr.h 186090 2016-03-17 22:21:17Z twu $ */
#ifndef STAGE1HR_INCLUDED
#define STAGE1HR_INCLUDED
@@ -89,7 +89,7 @@ Stage1hr_cleanup ();
extern void
Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_in, int index1interval_in,
- int spansize_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
+ int spansize_in, int max_anchors_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
Genome_T genome_in, Genome_T genomealt, Mode_T mode_in, int maxpaths_search_in,
Univcoord_T *splicesites_in, Splicetype_T *splicetypes_in,
diff --git a/src/stage2.c b/src/stage2.c
index c0e0e63..5298112 100644
--- a/src/stage2.c
+++ b/src/stage2.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage2.c 183727 2016-02-04 00:40:29Z twu $";
+static char rcsid[] = "$Id: stage2.c 186092 2016-03-17 22:28:37Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -684,7 +684,12 @@ while (prevhit != -1 && (prevposition = mappings[prev_querypos][prevhit]) + inde
prevlink = &(links[prev_querypos][prevhit]);
gendistance = position - prevposition - indexsize_nt;
- diffdistance = abs(gendistance - querydistance);
+ /* diffdistance = abs(gendistance - querydistance); */
+ if (gendistance > querydistance) {
+ diffdistance = gendistance - querydistance;
+ } else {
+ diffdistance = querydistance - gendistance;
+ }
if (diffdistance < maxintronlen) {
if (diffdistance <= EQUAL_DISTANCE_NOT_SPLICING) {
@@ -1054,7 +1059,12 @@ score_querypos_lookback_one (
prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
gendistance = position - prevposition;
- diffdistance = abs(gendistance - querydistance);
+ /* diffdistance = abs(gendistance - querydistance); */
+ if (gendistance > querydistance) {
+ diffdistance = gendistance - querydistance;
+ } else {
+ diffdistance = querydistance - gendistance;
+ }
#ifdef BAD_GMAX
fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
@@ -1557,7 +1567,12 @@ score_querypos_lookback_mult (
prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
gendistance = position - prevposition;
- diffdistance = abs(gendistance - querydistance);
+ /* diffdistance = abs(gendistance - querydistance); */
+ if (gendistance > querydistance) {
+ diffdistance = gendistance - querydistance;
+ } else {
+ diffdistance = querydistance - gendistance;
+ }
#ifdef BAD_GMAX
fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
@@ -1921,7 +1936,12 @@ score_querypos_lookforward_one (
prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
gendistance = prevposition - position;
- diffdistance = abs(gendistance - querydistance);
+ /* diffdistance = abs(gendistance - querydistance); */
+ if (gendistance > querydistance) {
+ diffdistance = gendistance - querydistance;
+ } else {
+ diffdistance = querydistance - gendistance;
+ }
#ifdef BAD_GMAX
fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
@@ -2419,7 +2439,12 @@ score_querypos_lookforward_mult (
prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
gendistance = prevposition - position;
- diffdistance = abs(gendistance - querydistance);
+ /* diffdistance = abs(gendistance - querydistance); */
+ if (gendistance > querydistance) {
+ diffdistance = gendistance - querydistance;
+ } else {
+ diffdistance = querydistance - gendistance;
+ }
#ifdef BAD_GMAX
fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
diff --git a/src/uniqscan.c b/src/uniqscan.c
index 58c9cc6..77699cc 100644
--- a/src/uniqscan.c
+++ b/src/uniqscan.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: uniqscan.c 173896 2015-09-12 00:11:40Z twu $";
+static char rcsid[] = "$Id: uniqscan.c 186090 2016-03-17 22:21:17Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1260,7 +1260,7 @@ main (int argc, char *argv[]) {
spansize = Spanningelt_setup(index1part,index1interval);
Indel_setup(min_indel_end_matches,indel_penalty_middle);
Stage1hr_setup(/*use_sarray_p*/false,/*use_only_sarray_p*/false,index1part,index1interval,
- spansize,chromosome_iit,nchromosomes,
+ spansize,/*max_anchors*/10,chromosome_iit,nchromosomes,
genome,genomealt,mode,/*maxpaths_search*/10,
splicesites,splicetypes,splicedists,nsplicesites,
novelsplicingp,knownsplicingp,/*find_dna_chimeras_p*/false,distances_observed_p,
diff --git a/util/gff3_genes.pl.in b/util/gff3_genes.pl.in
index eb0fe73..017c010 100644
--- a/util/gff3_genes.pl.in
+++ b/util/gff3_genes.pl.in
@@ -5,6 +5,8 @@ use warnings;
$gene_name = "";
$last_transcript_id = "";
+$last_cds_id = "";
+$cds_transcript_id = "";
@exons = ();
@CDS_regions = ();
while (defined($line = <>)) {
@@ -32,7 +34,10 @@ while (defined($line = <>)) {
print "$gene_name\n";
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
- } elsif ($#CDS_regions >= 0) {
+ }
+ @exons = ();
+
+ if ($#CDS_regions >= 0) {
# Handle last mRNA of previous gene
if ($strand eq "+") {
($start,$end) = get_gene_bounds_plus(\@CDS_regions);
@@ -44,15 +49,14 @@ while (defined($line = <>)) {
die "strand $strand";
}
print "$gene_name\n";
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand);
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand);
}
+ @CDS_regions = ();
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
- } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
+ } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "ncRNA") {
if ($#exons >= 0) {
if ($strand eq "+") {
($start,$end) = get_gene_bounds_plus(\@exons);
@@ -66,7 +70,10 @@ while (defined($line = <>)) {
print "$gene_name\n";
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
- } elsif ($#CDS_regions >= 0) {
+ }
+ @exons = ();
+
+ if ($#CDS_regions >= 0) {
if ($strand eq "+") {
($start,$end) = get_gene_bounds_plus(\@CDS_regions);
printf ">$last_transcript_id $chr:%u..%u\n",$start,$end;
@@ -77,8 +84,9 @@ while (defined($line = <>)) {
die "strand $strand";
}
print "$gene_name\n";
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand);
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand);
}
+ @CDS_regions = ();
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
$strand = $fields[6];
@@ -87,14 +95,34 @@ while (defined($line = <>)) {
if (!defined($gene_name) || $gene_name !~ /\S/) {
$gene_name = $last_transcript_id;
}
- $chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
+ if (!defined($chr) || $chr !~ /\S/) {
+ $chr = $fields[0];
+ }
} elsif ($fields[2] eq "exon") {
- push @exons,"$fields[3] $fields[4]";
+ ($parent) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($parent) || $parent eq $last_transcript_id) {
+ push @exons,"$fields[3] $fields[4]";
+ }
+
} elsif ($fields[2] eq "CDS") {
- push @CDS_regions,"$fields[3] $fields[4]";
+ ($cds_id) = $fields[8] =~ /ID=([^;]+)/;
+ if (!defined($cds_id)) {
+ push @CDS_regions,"$fields[3] $fields[4]";
+ } else {
+ if ($cds_id ne $last_cds_id) {
+ if ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand);
+ }
+ @CDS_regions = (); # Need to clear single CDS regions
+ }
+ push @CDS_regions,"$fields[3] $fields[4]";
+ $last_cds_id = $cds_id;
+ }
+ ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($cds_transcript_id)) {
+ $cds_transcript_id = $last_transcript_id;
+ }
}
}
}
@@ -112,7 +140,9 @@ if ($#exons >= 0) {
print "$gene_name\n";
print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
-} elsif ($#CDS_regions >= 0) {
+}
+ at exons = ();
+if ($#CDS_regions >= 0) {
if ($strand eq "+") {
($start,$end) = get_gene_bounds_plus(\@CDS_regions);
printf ">$last_transcript_id $chr:%u..%u\n",$start,$end;
@@ -125,6 +155,7 @@ if ($#exons >= 0) {
print "$gene_name\n";
print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand);
}
+ at CDS_regions = ();
exit;
diff --git a/util/gff3_introns.pl.in b/util/gff3_introns.pl.in
index 73699e3..77acd71 100755
--- a/util/gff3_introns.pl.in
+++ b/util/gff3_introns.pl.in
@@ -28,6 +28,7 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
+ $last_cds_id = "";
@exons = ();
@CDS_regions = ();
while (defined($line = <>)) {
@@ -46,25 +47,29 @@ if (defined($opt_d)) {
# Handle last mRNA of previous gene
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
# Handle last mRNA of previous gene
query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
#print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
+ @CDS_regions = (); # Need to clear single CDS regions
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
- } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
+ } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
#print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
+ @CDS_regions = (); # Need to clear single CDS regions
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
$strand = $fields[6];
@@ -73,14 +78,34 @@ if (defined($opt_d)) {
if (!defined($gene_name) || $gene_name !~ /\S/) {
$gene_name = $last_transcript_id;
}
- $chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
+ if (!defined($chr) || $chr !~ /\S/) {
+ $chr = $fields[0];
+ }
} elsif ($fields[2] eq "exon") {
- push @exons,"$fields[3] $fields[4]";
+ ($parent) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($parent) || $parent eq $last_transcript_id) {
+ push @exons,"$fields[3] $fields[4]";
+ }
+
} elsif ($fields[2] eq "CDS") {
- push @CDS_regions,"$fields[3] $fields[4]";
+ ($cds_id) = $fields[8] =~ /ID=([^;]+)/;
+ if (!defined($cds_id)) {
+ push @CDS_regions,"$fields[3] $fields[4]";
+ } else {
+ if ($cds_id ne $last_cds_id) {
+ if ($#CDS_regions > 0) {
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
+ }
+ @CDS_regions = (); # Need to clear single CDS regions
+ }
+ push @CDS_regions,"$fields[3] $fields[4]";
+ $last_cds_id = $cds_id;
+ }
+ ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($cds_transcript_id)) {
+ $cds_transcript_id = $last_transcript_id;
+ }
}
}
}
@@ -88,9 +113,12 @@ if (defined($opt_d)) {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
}
+ @CDS_regions = (); # Need to clear single CDS regions
close($FP);
@@ -115,6 +143,8 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
+$last_cds_id = "";
+$cds_transcript_id = "";
@exons = ();
@CDS_regions = ();
while (defined($line = get_line())) {
@@ -130,23 +160,27 @@ while (defined($line = get_line())) {
if ($fields[2] eq "gene") {
if ($#exons > 0) {
# Handle last mRNA of previous gene
- print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon");
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
# Handle last mRNA of previous gene
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS");
}
+ @CDS_regions = (); # Need to clear single CDS regions
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
- } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
+ } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") {
if ($#exons > 0) {
- print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon");
}
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS");
+ }
+ @CDS_regions = (); # Need to clear single CDS regions
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
$strand = $fields[6];
@@ -159,22 +193,42 @@ while (defined($line = get_line())) {
$chr = $fields[0];
}
- @exons = ();
- @CDS_regions = ();
-
} elsif ($fields[2] eq "exon") {
- push @exons,"$fields[3] $fields[4]";
+ ($parent) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($parent) || $parent eq $last_transcript_id) {
+ push @exons,"$fields[3] $fields[4]";
+ }
+
} elsif ($fields[2] eq "CDS") {
- push @CDS_regions,"$fields[3] $fields[4]";
+ ($cds_id) = $fields[8] =~ /ID=([^;]+)/;
+ if (!defined($cds_id)) {
+ push @CDS_regions,"$fields[3] $fields[4]";
+ } else {
+ if ($cds_id ne $last_cds_id) {
+ if ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS");
+ }
+ @CDS_regions = (); # Need to clear single CDS regions
+ }
+ push @CDS_regions,"$fields[3] $fields[4]";
+ $last_cds_id = $cds_id;
+ }
+ ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($cds_transcript_id)) {
+ $cds_transcript_id = $last_transcript_id;
+ }
}
}
}
if ($#exons > 0) {
- print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
-} elsif ($#CDS_regions > 0) {
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon");
}
+ at exons = (); # Need to clear single exons
+if ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP,"CDS");
+}
+ at CDS_regions = (); # Need to clear single CDS regions
if (defined($opt_d)) {
close($FP);
@@ -336,7 +390,7 @@ sub acceptor_okay_p {
sub print_exons {
- my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP) = @_;
+ my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP, $transcript_type) = @_;
$nexons = $#{$exons} + 1;
if ($strand eq "+") {
diff --git a/util/gff3_splicesites.pl.in b/util/gff3_splicesites.pl.in
index 93ba453..592075b 100755
--- a/util/gff3_splicesites.pl.in
+++ b/util/gff3_splicesites.pl.in
@@ -28,6 +28,7 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
+ $last_cds_id = "";
@exons = ();
@CDS_regions = ();
while (defined($line = <>)) {
@@ -46,25 +47,29 @@ if (defined($opt_d)) {
# Handle last mRNA of previous gene
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
# Handle last mRNA of previous gene
query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
#print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
+ @CDS_regions = (); # Need to clear single CDS regions
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
- } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
+ } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
#print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
#print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
}
+ @CDS_regions = (); # Need to clear single CDS regions
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
$strand = $fields[6];
@@ -73,16 +78,35 @@ if (defined($opt_d)) {
if (!defined($gene_name) || $gene_name !~ /\S/) {
$gene_name = $last_transcript_id;
}
- $chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
+ if (!defined($chr) || $chr !~ /\S/) {
+ $chr = $fields[0];
+ }
} elsif ($fields[2] eq "exon") {
- push @exons,"$fields[3] $fields[4]";
+ ($parent) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($parent) || $parent eq $last_transcript_id) {
+ push @exons,"$fields[3] $fields[4]";
+ }
} elsif ($fields[2] eq "CDS") {
- push @CDS_regions,"$fields[3] $fields[4]";
-
+ ($cds_id) = $fields[8] =~ /ID=([^;]+)/;
+ if (!defined($cds_id)) {
+ push @CDS_regions,"$fields[3] $fields[4]";
+ } else {
+ if ($cds_id ne $last_cds_id) {
+ if ($#CDS_regions > 0) {
+ query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
+ #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ }
+ @CDS_regions = (); # Need to clear single CDS regions
+ }
+ push @CDS_regions,"$fields[3] $fields[4]";
+ $last_cds_id = $cds_id;
+ }
+ ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($cds_transcript_id)) {
+ $cds_transcript_id = $last_transcript_id;
+ }
}
}
}
@@ -90,9 +114,12 @@ if (defined($opt_d)) {
if ($#exons > 0) {
query_dinucleotides(\@exons,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
query_dinucleotides(\@CDS_regions,$chr,$strand,$FP);
}
+ @CDS_regions = (); # Need to clear single CDS regions
close($FP);
@@ -117,6 +144,8 @@ if (defined($opt_d)) {
$gene_name = "";
$last_transcript_id = "";
+$last_cds_id = "";
+$cds_transcript_id = "";
@exons = ();
@CDS_regions = ();
while (defined($line = get_line())) {
@@ -132,23 +161,27 @@ while (defined($line = get_line())) {
if ($fields[2] eq "gene") {
if ($#exons > 0) {
# Handle last mRNA of previous gene
- print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
+ print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon");
+ }
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
# Handle last mRNA of previous gene
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS");
}
+ @CDS_regions = (); # Need to clear single CDS regions
($gene_name) = $fields[8] =~ /ID=([^;]+)/;
$chr = $fields[0];
- @exons = ();
- @CDS_regions = ();
- } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") {
+ } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") {
if ($#exons > 0) {
- print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
- } elsif ($#CDS_regions > 0) {
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon");
}
+ @exons = (); # Need to clear single exons
+ if ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS");
+ }
+ @CDS_regions = (); # Need to clear single CDS regions
($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
$strand = $fields[6];
@@ -161,22 +194,42 @@ while (defined($line = get_line())) {
$chr = $fields[0];
}
- @exons = ();
- @CDS_regions = ();
-
} elsif ($fields[2] eq "exon") {
- push @exons,"$fields[3] $fields[4]";
+ ($parent) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($parent) || $parent eq $last_transcript_id) {
+ push @exons,"$fields[3] $fields[4]";
+ }
+
} elsif ($fields[2] eq "CDS") {
- push @CDS_regions,"$fields[3] $fields[4]";
+ ($cds_id) = $fields[8] =~ /ID=([^;]+)/;
+ if (!defined($cds_id)) {
+ push @CDS_regions,"$fields[3] $fields[4]";
+ } else {
+ if ($cds_id ne $last_cds_id) {
+ if ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS");
+ }
+ @CDS_regions = (); # Need to clear single CDS regions
+ }
+ push @CDS_regions,"$fields[3] $fields[4]";
+ $last_cds_id = $cds_id;
+ }
+ ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/;
+ if (!defined($cds_transcript_id)) {
+ $cds_transcript_id = $last_transcript_id;
+ }
}
}
}
if ($#exons > 0) {
- print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP);
-} elsif ($#CDS_regions > 0) {
- print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP);
+ print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon");
+}
+ at exons = (); # Need to clear single exons
+if ($#CDS_regions > 0) {
+ print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP,"CDS");
}
+ at CDS_regions = (); # Need to clear single CDS regions
if (defined($opt_d)) {
close($FP);
@@ -338,7 +391,7 @@ sub acceptor_okay_p {
sub print_exons {
- my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP) = @_;
+ my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP, $transcript_type) = @_;
$nexons = $#{$exons} + 1;
if ($strand eq "+") {
@@ -347,8 +400,8 @@ sub print_exons {
if (($intron_length = $ {$starts}[$i] - $ {$ends}[$i] - 1) < 0) {
printf STDERR "gene %s, transcript %s, intron %d with donor at %s:%u and acceptor at %s:%u implies a negative intron length of %d...skipping\n",$gene_name,$transcript_id,$i+1,$chr,$ {$ends}[$i],$chr,$ {$starts}[$i],$intron_length;
} elsif (!defined($opt_d)) {
- printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1;
- printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i];
+ printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1;
+ printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i];
} else {
$query = sprintf("%s:%u..%u",$chr,$ends[$i]+1,$ends[$i]+2);
$donor_dinucl = get_dinucleotide($query,$FP);
@@ -358,17 +411,17 @@ sub print_exons {
if (defined($opt_C) && donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
if (!defined($opt_Q)) {
- printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.exon%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$i+1,$nexons;
+ printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.%s%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+1,$nexons;
}
} elsif (defined($opt_R)) {
if (donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
- printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1;
+ printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1;
print " $donor_dinucl";
print "\n";
}
} else {
- printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1;
+ printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1;
if (defined($opt_2)) {
print " $donor_dinucl";
}
@@ -377,17 +430,17 @@ sub print_exons {
if (defined($opt_C) && acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
if (!defined($opt_Q)) {
- printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.exon%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$i+2,$nexons;
+ printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.%s%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+2,$nexons;
}
} elsif (defined($opt_R)) {
if (acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
- printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i];
+ printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i];
print " $acceptor_dinucl";
print "\n";
}
} else {
- printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i];
+ printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i];
if (defined($opt_2)) {
print " $acceptor_dinucl";
}
@@ -402,8 +455,8 @@ sub print_exons {
if (($intron_length = $ {$starts}[$i] - $ {$ends}[$i] - 1) < 0) {
printf STDERR "gene %s, transcript %s, intron %d with donor at %s:%u and acceptor at %s:%u implies a negative intron length of %d...skipping\n",$gene_name,$transcript_id,$i+1,$chr,$ {$starts}[$i],$chr,$ {$ends}[$i],$intron_length;
} elsif (!defined($opt_d)) {
- printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1;
- printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i];
+ printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1;
+ printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i];
} else {
$query = sprintf("%s:%u..%u",$chr,$starts[$i]-1,$starts[$i]-2);
$donor_dinucl = get_dinucleotide($query,$FP);
@@ -413,17 +466,17 @@ sub print_exons {
if (defined($opt_C) && donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
if (!defined($opt_Q)) {
- printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.exon%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$i+1,$nexons;
+ printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.%s%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+1,$nexons;
}
} elsif (defined($opt_R)) {
if (donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
- printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1;
+ printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1;
print " $donor_dinucl";
print "\n";
}
} else {
- printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1;
+ printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1;
if (defined($opt_2)) {
print " $donor_dinucl";
}
@@ -432,17 +485,17 @@ sub print_exons {
if (defined($opt_C) && acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
if (!defined($opt_Q)) {
- printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.exon%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$i+2,$nexons;
+ printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.%s%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+2,$nexons;
}
} elsif (defined($opt_R)) {
if (acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) {
- printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i];
+ printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i];
print " $acceptor_dinucl";
print "\n";
}
} else {
- printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i];
+ printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i];
if (defined($opt_2)) {
print " $acceptor_dinucl";
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git
More information about the debian-med-commit
mailing list