[med-svn] [Git][med-team/gmap][upstream] New upstream version 2020-09-12+ds
Alexandre Mestiashvili
gitlab at salsa.debian.org
Fri Sep 18 12:42:05 BST 2020
Alexandre Mestiashvili pushed to branch upstream at Debian Med / gmap
Commits:
56daea0c by Alexandre Mestiashvili at 2020-09-18T12:00:58+02:00
New upstream version 2020-09-12+ds
- - - - -
13 changed files:
- ChangeLog
- VERSION
- src/Makefile.am
- src/distant-rna.c
- src/path-solve.c
- + src/sam_sort.c
- + src/samread.c
- + src/samread.h
- src/stage3hr.c
- src/substring.c
- src/substring.h
- src/terminal.c
- util/gmap_build.pl.in
Changes:
=====================================
ChangeLog
=====================================
@@ -1,3 +1,19 @@
+2020-09-17 twu
+
+ * Makefile.gsnaptoo.am: Including sam_sort again
+
+2020-09-13 twu
+
+ * VERSION, index.html: Updated version number
+
+ * path-solve.c, distant-rna.c, stage3hr.c, terminal.c: Using new interfaces
+ to Substring trim functions
+
+ * gmap_build.pl.in: Fixed flag in help output
+
+ * substring.c, substring.h: For DNA-seq, not allowing extension of last
+ mismatch at end if it extends beyond chromosomal bounds
+
2020-07-10 twu
* pair.c: Added semicolon between Dir and coverage in gff3 output
=====================================
VERSION
=====================================
@@ -1 +1 @@
-2020-06-30
\ No newline at end of file
+2020-09-12
\ No newline at end of file
=====================================
src/Makefile.am
=====================================
@@ -42,10 +42,10 @@ EXTRA_DIST = mpidebug.c mpidebug.h master.c master.h
#endif
-# sam_sort
bin_PROGRAMS = cpuid gmap gmapl get-genome gmapindex indexdb_cat \
iit_store iit_get iit_dump \
- gsnap gsnapl snpindex cmetindex atoiindex trindex
+ gsnap gsnapl snpindex cmetindex atoiindex trindex \
+ sam_sort
bin_PROGRAMS += gmap.nosimd
bin_PROGRAMS += gmapl.nosimd
=====================================
src/distant-rna.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: distant-rna.c 222804 2020-06-03 21:58:26Z twu $";
+static char rcsid[] = "$Id: distant-rna.c 223081 2020-09-13 14:21:03Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -435,8 +435,8 @@ find_spliceends_rna_plus (List_T **distant_donors, List_T **distant_antidonors,
/* SENSE FORWARD */
splice_queryend_p = Substring_qend_trim(&splice_pos_qend,&splicetype_queryend,&ambig_prob_queryend,
segment_univdiagonal,/*pos5*/elt->qstart,querylength,
- /*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,chroffset,
- /*sensedir*/SENSE_FORWARD);
+ /*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,
+ chroffset,chrhigh,/*sensedir*/SENSE_FORWARD);
debug4e(printf("sense: splice_pos_qend %d with splice_queryend_p %d\n",splice_pos_qend,splice_queryend_p));
if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_queryend_p == true)) {
/* genomic right anchor or middle anchor */
@@ -538,8 +538,8 @@ find_spliceends_rna_plus (List_T **distant_donors, List_T **distant_antidonors,
/* SENSE ANTI */
splice_queryend_p = Substring_qend_trim(&splice_pos_qend,&splicetype_queryend,&ambig_prob_queryend,
segment_univdiagonal,/*pos5*/elt->qstart,querylength,
- /*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,chroffset,
- /*sensedir*/SENSE_ANTI);
+ /*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,
+ chroffset,chrhigh,/*sensedir*/SENSE_ANTI);
debug4e(printf("antisense: splice_pos_qend %d with splice_queryend_p %d\n",splice_pos_qend,splice_queryend_p));
if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_queryend_p == true)) {
/* genomic right anchor or middle anchor */
@@ -953,8 +953,8 @@ find_spliceends_rna_minus (List_T **distant_donors, List_T **distant_antidonors,
/* SENSE ANTI */
splice_querystart_p = Substring_qend_trim(&splice_pos_qend,&splicetype_querystart,&ambig_prob_querystart,
segment_univdiagonal,/*pos5*/elt->qstart,querylength,
- /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,chroffset,
- /*sensedir*/SENSE_ANTI);
+ /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,
+ chroffset,chrhigh,/*sensedir*/SENSE_ANTI);
debug4e(printf("antisense: splice_pos_qend %d with splice_querystart_p %d\n",splice_pos_qend,splice_querystart_p));
if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_querystart_p == true)) {
/* genomic right anchor or middle anchor */
@@ -1057,8 +1057,8 @@ find_spliceends_rna_minus (List_T **distant_donors, List_T **distant_antidonors,
/* SENSE FORWARD */
splice_querystart_p = Substring_qend_trim(&splice_pos_qend,&splicetype_querystart,&ambig_prob_querystart,
segment_univdiagonal,/*pos5*/elt->qstart,querylength,
- /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,chroffset,
- /*sensedir*/SENSE_FORWARD);
+ /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,
+ chroffset,chrhigh,/*sensedir*/SENSE_FORWARD);
debug4e(printf("sense: splice_pos_qend %d with splice_querystart_p %d\n",splice_pos_qend,splice_querystart_p));
if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_querystart_p == true)) {
/* genomic right anchor or middle anchor */
=====================================
src/path-solve.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: path-solve.c 222797 2020-06-03 21:05:08Z twu $";
+static char rcsid[] = "$Id: path-solve.c 223083 2020-09-13 15:26:10Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1455,7 +1455,7 @@ Path_new_for_qend_extension (Univcoord_T univdiagonal, int qstart, int qend, boo
be attached. All Path_T objects are copies of the original path */
static List_T
attach_qend_diagonal (T path, Univcoord_T high_univdiagonal, int high_qstart, int high_qend,
- Chrnum_T chrnum, Univcoord_T chroffset, int querylength,
+ Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, int querylength,
Spliceinfo_T spliceinfo, int *mismatch_positions_alloc, Compress_T query_compress,
bool plusp, int genestrand, int max_mismatches_allowed, int max_insertionlen, int max_deletionlen,
Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
@@ -1515,7 +1515,7 @@ attach_qend_diagonal (T path, Univcoord_T high_univdiagonal, int high_qstart, in
/*univdiagonal*/high_univdiagonal,/*qstart*/high_qstart,
/*qend:querylength,*/querylength,plusp,genestrand,
mismatch_positions_alloc,query_compress,
- chroffset,sensedir);
+ chroffset,chrhigh,sensedir);
if (result < 0) {
nspliceends = 0;
@@ -2139,7 +2139,7 @@ compute_qend_paths (int depth, bool *terminalp, T path, List_T diagonals,
} else if ((child_paths = attach_qend_diagonal(path,/*high_univdiagonal*/diagonal->univdiagonal,
/*high_qstart*/diagonal->qstart,
/*high_qend*/diagonal->qend,
- chrnum,chroffset,querylength,
+ chrnum,chroffset,chrhigh,querylength,
spliceinfo,mismatch_positions_alloc,query_compress,
plusp,genestrand,max_mismatches_allowed,
max_insertionlen,max_deletionlen,intlistpool,
@@ -2323,7 +2323,7 @@ add_qend_local (bool *addedp, T path, Univcoord_T *local_diagonals, int ndiagona
if ((paths = attach_qend_diagonal(path,/*high_univdiagonal*/local_diagonals[i],
/*high_qstart*/pos5,high_qend,
- chrnum,chroffset,querylength,
+ chrnum,chroffset,chrhigh,querylength,
spliceinfo,mismatch_positions_alloc,query_compress,
plusp,genestrand,max_mismatches_allowed,
max_insertionlen,max_deletionlen,intlistpool,
@@ -3803,7 +3803,7 @@ Path_solve_from_diagonals (bool *foundp, int *found_score_overall, int *found_sc
/*qend:middle_diagonal_qend,*/
querylength,plusp,genestrand,
mismatch_positions_alloc,query_compress,
- chroffset,/*sensedir*/SENSE_FORWARD);
+ chroffset,chrhigh,/*sensedir*/SENSE_FORWARD);
debug13(printf("splice3p %d, result %d\n",splice3p,result));
if (result < 0) {
@@ -3891,7 +3891,7 @@ Path_solve_from_diagonals (bool *foundp, int *found_score_overall, int *found_sc
/*qend:middle_diagonal_qend,*/
querylength,plusp,genestrand,
mismatch_positions_alloc,query_compress,
- chroffset,/*sensedir*/SENSE_ANTI);
+ chroffset,chrhigh,/*sensedir*/SENSE_ANTI);
debug13(printf("splice3p %d, result %d\n",splice3p,result));
if (result < 0) {
=====================================
src/sam_sort.c
=====================================
@@ -0,0 +1,1588 @@
+static char rcsid[] = "$Id: sam_sort.c 200239 2016-11-08 01:00:30Z twu $";
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h> /* For off_t */
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bool.h"
+#include "mem.h"
+#include "access.h"
+#include "complement.h"
+#include "littleendian.h"
+#include "genomicpos.h"
+#include "chrnum.h"
+#include "samheader.h"
+#include "samread.h"
+#include "samflags.h"
+#include "stopwatch.h"
+#include "datadir.h"
+#include "filestring.h"
+#include "getopt.h"
+
+
+#define MONITOR_INTERVAL 100000
+
+
+/************************************************************************
+ *
+ * Check for correctness:
+ *
+ * Run ./sam_sort -d <genome> --mark-dups --mark-first --dups-only --no-sam-headers on a SAM file
+ * Do "cut -f 1 | sort | uniq" to find all duplicate accessions
+ *
+ * Run the following Perl program on the original FASTA file
+ * Do "grep '^>' | sed -e 's/>//' | sort | uniq" to use as a gold standard
+ *
+ * use IO::File;
+ * $fastafile = $ARGV[0];
+ *
+ * $FP = new IO::File($fastafile);
+ * while (defined($header = <$FP>) && defined($queryseq5 = <$FP>) && defined($queryseq3 = <$FP>)) {
+ * chop $queryseq5; chop $queryseq3;
+ *
+ * if ($queryseq5 lt $queryseq3) {
+ * $nseen{$queryseq5 . $queryseq3} += 1;
+ * } else {
+ * $nseen{$queryseq3 . $queryseq5} += 1;
+ * }
+ * }
+ * close($FP);
+ *
+ * $FP = new IO::File($fastafile);
+ * while (defined($header = <$FP>) && defined($queryseq5 = <$FP>) && defined($queryseq3 = <$FP>)) {
+ * chop $queryseq5; chop $queryseq3;
+ *
+ * if ($queryseq5 lt $queryseq3) {
+ * $queryseq = $queryseq5 . $queryseq3;
+ * } else {
+ * $queryseq = $queryseq3 . $queryseq5;
+ * }
+ *
+ * if (($n = $nseen{$queryseq}) > 1) {
+ * print $header; print $queryseq5 . "\n"; print $queryseq3 . "\n";
+ * }
+ * }
+ * close($FP);
+ *
+ * exit;
+ *
+ ************************************************************************/
+
+
+#define CHUNK 1024
+typedef enum {NO_SECONDARY_SORT, ORIG_SECONDARY_SORT, ACC_SECONDARY_SORT, MATEFWD_SECONDARY_SORT, MATEREV_SECONDARY_SORT} Secondary_sort_T;
+
+#ifdef DEBUG
+#define debug(x) x
+#else
+#define debug(x)
+#endif
+
+/* Details of getting queryseqs */
+#ifdef DEBUG9
+#define debug9(x) x
+#else
+#define debug9(x)
+#endif
+
+/* Cell_binary_search */
+#ifdef DEBUG10
+#define debug10(x) x
+#else
+#define debug10(x)
+#endif
+
+/* Testing of linelen algorithm */
+#ifdef DEBUG14
+#define debug14(x) x
+#else
+#define debug14(x)
+#endif
+
+
+
+#ifdef HAVE_FSEEKO
+#define moveto(fp,offset) fseeko(fp,offset,SEEK_SET)
+#else
+#define moveto(fp,offset) fseek(fp,offset,SEEK_SET)
+#endif
+
+
+
+/* Program Options */
+static char *genomesubdir = NULL;
+static char *dbroot = NULL;
+static char *dbversion = NULL;
+static char *user_genomedir = NULL;
+
+static bool sam_headers_p = true;
+
+static bool mark_duplicates_p = false;
+static bool mark_first_p = false;
+static bool print_unique_p = true;
+static bool print_duplicates_p = true;
+static bool restore_original_order_p = false;
+
+static bool secondary_sort_method = NO_SECONDARY_SORT;
+static bool multiple_primaries_p = false;
+
+static Stopwatch_T stopwatch = NULL;
+
+static char *split_output_root = NULL;
+static bool appendp = false;
+static FILE **outputs = NULL;
+
+
+static struct option long_options[] = {
+ /* Input options */
+ {"dir", required_argument, 0, 'D'}, /* user_genomedir */
+ {"db", required_argument, 0, 'd'}, /* dbroot */
+ {"split-output", required_argument, 0, 0}, /* outputs */
+ {"append-output", no_argument, 0, 0}, /* appendp */
+
+ {"sort2", required_argument, 0, 0}, /* secondary_sort_method */
+
+ {"mark-dups", no_argument, 0, 0}, /* mark_duplicates_p, print_unique_p, print_duplicates_p */
+ {"mark-first", no_argument, 0, 0}, /* mark_first_p */
+ {"dups-only", no_argument, 0, 0}, /* mark_duplicates_p, print_unique_p, print_duplicates_p */
+ {"uniq-only", no_argument, 0, 0}, /* mark_duplicates_p, print_unique_p, print_duplicates_p */
+ {"restore-orig-order", no_argument, 0, 0}, /* restore_original_order_p */
+ {"multiple-primaries", no_argument, 0, 0}, /* multiple_primaries_p */
+ {"no-sam-headers", no_argument, 0, 0}, /* sam_headers_p */
+
+ /* Help options */
+ {"version", no_argument, 0, '^'}, /* print_program_version */
+ {"help", no_argument, 0, '?'}, /* print_program_usage */
+ {0, 0, 0, 0}
+};
+
+
+static void
+print_program_version () {
+ fprintf(stdout,"\n");
+ fprintf(stdout,"sam_sort: sorting and duplication-marking utility for SAM files\n");
+ fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
+ fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
+ fprintf(stdout,"Contact: twu at gene.com\n");
+ fprintf(stdout,"\n");
+ return;
+}
+
+static void
+print_program_usage () {
+ fprintf(stdout,"\
+Usage: sam_sort [OPTIONS...] -d genome <sam file>\n\
+\n\
+Input options\n\
+ -D, --dir=STRING Genome directory\n\
+ -d, --db=STRING Genome database. If argument is '?' (with\n\
+ the quotes), this command lists available databases.\n\
+Output file options\n\
+ --split-output=STRING Basename for multiple-file output, separately for nomapping,\n\
+ halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult,\n\
+ paired_uniq, paired_mult, concordant_uniq, and concordant_mult results\n\
+ --append-output When --split-output is given, this flag will append output\n\
+ to the existing files. Otherwise, the default is to create new files.\n\
+\n\
+Other options\n\
+ --sort2=STRING For positions with the same genomic position, sort secondarily by\n\
+ none: no guarantee about the secondary sort (default)\n\
+ orig: original order in the SAM output file (what samtools sort does)\n\
+ accession: alphabetically by accession name\n\
+ mate-fwd: by genomic position of the mate, in ascending order\n\
+ mate-rev: by genomic position of the mate, in descending order\n\
+ For sorting by mate genomic position, a nomapping mate is treated as genomic position 0\n\
+ --mark-dups Mark duplicate reads by altering the flag accordingly\n\
+ --mark-first Also mark the first occurrence of a read that has a subsequent duplicate\n\
+\n\
+ --dups-only Print only duplicate reads\n\
+ --uniq-only Print only unique reads\n\
+ --restore-orig-order Restore original order of SAM file. Useful when --mark-dups is specified\n\
+ and sorting is not desired\n\
+ --multiple-primaries Specify if GSNAP or GMAP was run with the --multiple-primaries flag\n\
+ --no-sam-headers Do not print SAM header lines\n\
+");
+ return;
+}
+
+
+static char complCode[128] = COMPLEMENT_LC;
+
+static void
+make_complement_inplace (char *sequence, unsigned int length) {
+ char temp;
+ unsigned int i, j;
+
+ for (i = 0, j = length-1; i < length/2; i++, j--) {
+ temp = complCode[(int) sequence[i]];
+ sequence[i] = complCode[(int) sequence[j]];
+ sequence[j] = temp;
+ }
+ if (i == j) {
+ sequence[i] = complCode[(int) sequence[i]];
+ }
+
+ return;
+}
+
+
+#define T Cell_T
+typedef struct T *T;
+struct T {
+ unsigned int flag;
+ SAM_split_output_type split_output;
+ bool low_read_p;
+
+ Chrnum_T chrnum;
+ Univcoord_T genomicpos;
+ Univcoord_T genomicpos_extend_softclip;
+ Univcoord_T mate_genomicpos;
+ int filei;
+ size_t linestart;
+ int linelen;
+
+ char *acc; /* Needed for ACC_SECONDARY_SORT */
+
+ int lineindex; /* Original line order */
+ int readindex; /* inputi or outputi. Grouped by accession. Needed for marking duplicates to find the other queryseq */
+ char *queryseq5;
+ char *queryseq3;
+
+ char *queryseq_alpha1; /* Pointers to queryseq5 and queryseq3. Needed for handling 5/3 swaps. */
+ char *queryseq_alpha2;
+
+};
+
+
+static void
+Cell_standardize_queryseqs (T this) {
+
+ if (this->queryseq5 == NULL && this->queryseq3 == NULL) {
+ this->queryseq_alpha1 = this->queryseq_alpha2 = NULL;
+ } else if (this->queryseq5 == NULL) {
+ this->queryseq_alpha1 = this->queryseq3;
+ this->queryseq_alpha2 = NULL;
+ } else if (this->queryseq3 == NULL) {
+ this->queryseq_alpha1 = this->queryseq5;
+ this->queryseq_alpha2 = NULL;
+ } else if (strcmp(this->queryseq5,this->queryseq3) < 0) {
+ this->queryseq_alpha1 = this->queryseq5;
+ this->queryseq_alpha2 = this->queryseq3;
+ } else {
+ this->queryseq_alpha1 = this->queryseq3;
+ this->queryseq_alpha2 = this->queryseq5;
+ }
+
+ return;
+}
+
+
+/* initial_softclip needs to be determined only if we are marking duplicates */
+static void
+Cell_fill (struct T *this, int lineindex, int readindex, unsigned int flag, SAM_split_output_type split_output,
+ bool query_lowp, int initial_softclip, Univcoord_T genomicpos,
+ int filei, size_t fileposition, int linelen) {
+
+ this->lineindex = lineindex;
+ this->readindex = readindex;
+
+ this->flag = flag;
+ this->split_output = split_output;
+ this->low_read_p = query_lowp;
+
+ this->genomicpos = genomicpos;
+ this->genomicpos_extend_softclip = this->genomicpos - initial_softclip;
+
+ this->filei = filei;
+ this->linestart = fileposition;
+ this->linelen = linelen;
+
+ this->queryseq5 = (char *) NULL;
+ this->queryseq3 = (char *) NULL;
+
+ return;
+}
+
+/* initial_softclip needs to be determined only if we are marking duplicates */
+static void
+Cell_fill_nodups (struct T *this, unsigned int flag, SAM_split_output_type split_output,
+ Univcoord_T genomicpos, int filei, size_t fileposition, int linelen) {
+
+ this->lineindex = 0;
+ this->readindex = 0;
+
+ this->flag = flag;
+ this->split_output = split_output;
+ this->low_read_p = true;
+
+ this->genomicpos = genomicpos;
+ this->genomicpos_extend_softclip = genomicpos;
+
+ this->filei = filei;
+ this->linestart = fileposition;
+ this->linelen = linelen;
+
+ this->queryseq5 = (char *) NULL;
+ this->queryseq3 = (char *) NULL;
+
+ return;
+}
+
+#if 0
+static void
+print_fromfile (FILE *fp, size_t fileposition, int linelength) {
+ char buffer[CHUNK];
+
+ moveto(fp,fileposition);
+
+ while (linelength > CHUNK) {
+ fread(buffer,sizeof(char),CHUNK,fp);
+ fwrite(buffer,sizeof(char),CHUNK,stdout);
+ linelength -= CHUNK;
+ }
+ if (linelength > 0) {
+ fread(buffer,sizeof(char),linelength,fp);
+ fwrite(buffer,sizeof(char),linelength,stdout);
+ }
+
+ return;
+}
+#endif
+
+
+
+static void
+Cell_print_fromfile (FILE *fp_input, T this, Filestring_T headers) {
+ char buffer[CHUNK];
+ int linelength = this->linelen;
+ FILE *fp_output;
+
+#if 0
+ if (nofailsp == true && this->split_output == OUTPUT_NM) {
+ /* Skip */
+ return;
+
+ } else if (failsonlyp == true && this->split_output != OUTPUT_NM &&
+ this->split_output != OUTPUT_HX && this->split_output != OUTPUT_UX &&
+ this->split_output != OUTPUT_PX && this->split_output != OUTPUT_CX) {
+ return;
+ }
+
+ if (failedinput_root != NULL && primaryp(this->flag) == true) {
+ /* Convert SAM line to FASTA or FASTQ and write to a failedinput file */
+ }
+#endif
+
+ if (split_output_root == NULL) {
+ if ((fp_output = outputs[0]) == NULL) {
+ fp_output = outputs[0] = stdout;
+ Filestring_print(fp_output,headers);
+ }
+
+ } else {
+ if ((fp_output = outputs[this->split_output]) == NULL) {
+ fp_output = outputs[this->split_output] = SAM_header_open_file(this->split_output,split_output_root,/*appendp*/false);
+ Filestring_print(fp_output,headers);
+ }
+ }
+
+ moveto(fp_input,this->linestart);
+
+#ifdef DEBUG
+ printf("readindex %d: ",this->readindex);
+#endif
+
+ while (linelength > CHUNK) {
+ fread(buffer,sizeof(char),CHUNK,fp_input);
+ fwrite(buffer,sizeof(char),CHUNK,fp_output);
+ linelength -= CHUNK;
+ }
+ if (linelength > 0) {
+ fread(buffer,sizeof(char),linelength,fp_input);
+ fwrite(buffer,sizeof(char),linelength,fp_output);
+ }
+
+ return;
+}
+
+
+
+static int
+Cell_lineindex_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+
+ if (x->lineindex < y->lineindex) {
+ return -1;
+ } else if (y->lineindex < x->lineindex) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+static int
+Cell_genomicpos_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+
+ if (x->genomicpos != 0 && y->genomicpos == 0) {
+ return -1;
+ } else if (y->genomicpos != 0 && x->genomicpos == 0) {
+ return +1;
+ } else if (x->genomicpos < y->genomicpos) {
+ return -1;
+ } else if (y->genomicpos < x->genomicpos) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+static int
+Cell_genomicpos_linestart_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+
+ if (x->genomicpos != 0 && y->genomicpos == 0) {
+ return -1;
+ } else if (y->genomicpos != 0 && x->genomicpos == 0) {
+ return +1;
+ } else if (x->genomicpos < y->genomicpos) {
+ return -1;
+ } else if (y->genomicpos < x->genomicpos) {
+ return +1;
+ } else if (x->linestart < y->linestart) {
+ return -1;
+ } else if (y->linestart < x->linestart) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+static int
+Cell_genomicpos_extend_softclip_lowhigh_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+
+ if (x->genomicpos_extend_softclip != 0 && y->genomicpos_extend_softclip == 0) {
+ return -1;
+ } else if (y->genomicpos_extend_softclip != 0 && x->genomicpos_extend_softclip == 0) {
+ return +1;
+ } else if (x->genomicpos_extend_softclip < y->genomicpos_extend_softclip) {
+ return -1;
+ } else if (y->genomicpos_extend_softclip < x->genomicpos_extend_softclip) {
+ return +1;
+
+ } else if (x->low_read_p == true && y->low_read_p == false) {
+ return -1;
+ } else if (y->low_read_p == true && x->low_read_p == false) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+static int
+Cell_accession_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+
+ return strcmp(x->acc,y->acc);
+}
+
+static int
+Cell_matefwd_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+
+ if (x->mate_genomicpos < y->mate_genomicpos) {
+ return -1;
+ } else if (y->mate_genomicpos < x->mate_genomicpos) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+static int
+Cell_materev_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+
+ if (x->mate_genomicpos > y->mate_genomicpos) {
+ return -1;
+ } else if (y->mate_genomicpos > x->mate_genomicpos) {
+ return +1;
+ } else {
+ return 0;
+ }
+}
+
+
+static int
+Cell_queryseq_cmp (const void *a, const void *b) {
+ T x = * (T *) a;
+ T y = * (T *) b;
+ int cmp;
+
+ if (x->queryseq_alpha1 != NULL && y->queryseq_alpha1 == NULL) {
+ return -1;
+ } else if (y->queryseq_alpha1 == NULL && x->queryseq_alpha1 != NULL) {
+ return +1;
+ } else if (x->queryseq_alpha1 == NULL && y->queryseq_alpha1 == NULL) {
+ return 0;
+ } else if ((cmp = strcmp(x->queryseq_alpha1,y->queryseq_alpha1)) != 0) {
+ return cmp;
+ } else if (x->queryseq_alpha2 != NULL && y->queryseq_alpha2 == NULL) {
+ return -1;
+ } else if (y->queryseq_alpha2 == NULL && x->queryseq_alpha2 != NULL) {
+ return +1;
+ } else if (x->queryseq_alpha2 == NULL && y->queryseq_alpha2 == NULL) {
+ return 0;
+ } else {
+ return strcmp(x->queryseq_alpha2,y->queryseq_alpha2);
+ }
+}
+
+
+#if 0
+static int
+Cell_binary_search (int lowi, int highi, T *cells, Univcoord_T goal) {
+ int middlei;
+
+ debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,goal));
+
+ while (lowi < highi) {
+ middlei = lowi + ((highi - lowi) / 2);
+ debug10(printf(" binary: %d:%u %d:%u %d:%u vs. %u\n",
+ lowi,cells[lowi]->genomicpos,middlei,cells[middlei]->genomicpos,
+ highi,cells[highi]->genomicpos,goal));
+ if (goal < cells[middlei]->genomicpos) {
+ highi = middlei;
+ } else if (goal > cells[middlei]->genomicpos) {
+ lowi = middlei + 1;
+ } else {
+ debug10(printf("binary search returns %d\n",middlei));
+ /* Rewind to first cell having the goal */
+ while (middlei - 1 >= lowi && cells[middlei - 1]->genomicpos == goal) {
+ middlei--;
+ }
+ return middlei;
+ }
+ }
+
+ debug10(printf("binary search returns %d\n",highi));
+ return highi;
+}
+#endif
+
+#if 0
+static int
+Cell_find (int lowi, int highi, T *cells, Univcoord_T goal, int readindex) {
+ int i;
+
+ i = Cell_binary_search(lowi,highi,cells,goal);
+ while (i < highi && cells[i]->genomicpos == goal) {
+ if (cells[i]->readindex == readindex) {
+ return i;
+ } else {
+ i++;
+ }
+ }
+
+ fprintf(stderr,"Cannot find cell in %d..%d with genomicpos %u and readindex %d\n",
+ lowi,highi,goal,readindex);
+ return -1;
+}
+#endif
+
+
+static void
+process_without_dups (FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
+ Intlist_T linelengths, int ncells_total, Univ_IIT_T chromosome_iit,
+ Univcoord_T *chroffsets, Filestring_T headers) {
+ T *cells;
+ FILE *fp_sam;
+ int filei, linei;
+ int n_mappers = 0, n_nomappers = 0;
+ Intlist_T l;
+ struct T *cells_allocated, *ptr;
+ int b, i, j, k;
+
+ size_t fileposition;
+ int linelen;
+ unsigned int flag;
+ SAM_split_output_type split_output;
+ Univcoord_T genomicpos;
+
+ int acclength;
+
+ ptr = cells_allocated = (struct T *) MALLOC(ncells_total * sizeof(struct T));
+ cells = (T *) MALLOC(ncells_total * sizeof(T));
+ for (i = 0; i < ncells_total; i++) {
+ cells[i] = &(ptr[i]);
+ }
+
+ fprintf(stderr,"Reading SAM files...\n");
+
+ k = 0;
+ l = linelengths;
+ for (filei = 0; filei < ninputs; filei++) {
+ fprintf(stderr," Reading file %d...",filei+1);
+ fp_sam = sam_inputs[filei];
+ fileposition = headerlengths[filei];
+ for (linei = 0; linei < ncells[filei]; linei++) {
+ linelen = Intlist_head(l);
+ moveto(fp_sam,fileposition);
+ genomicpos = Samread_parse_genomicpos_fromfile(fp_sam,&flag,&split_output,
+ chromosome_iit,chroffsets,linelen);
+ Cell_fill_nodups(cells[k++],flag,split_output,genomicpos,filei,fileposition,linelen);
+ if (flag & QUERY_UNMAPPED) {
+ n_nomappers++;
+ } else {
+ n_mappers++;
+ }
+ fileposition += linelen;
+ l = Intlist_next(l);
+ }
+
+ fprintf(stderr,"done\n");
+ }
+
+ /* Sort and print */
+ if (secondary_sort_method == NO_SECONDARY_SORT) {
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Sorting entries by genomicpos...");
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Printing entries...");
+ for (b = 0; b + MONITOR_INTERVAL < ncells_total; b += MONITOR_INTERVAL) {
+ fprintf(stderr,"Printed %d entries\n",b);
+ for (i = 0; i < MONITOR_INTERVAL; i++) {
+ k = b + i;
+ debug(printf("%u\t%u\t%d\n",cells[k]->genomicpos,cells[k]->linestart,cells[k]->linelen));
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+ }
+ for (k = b; k < ncells_total; k++) {
+ debug(printf("%u\t%u\t%d\n",cells[k]->genomicpos,cells[k]->linestart,cells[k]->linelen));
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ } else if (secondary_sort_method == ORIG_SECONDARY_SORT) {
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Sorting entries by genomicpos and original file position...");
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_linestart_cmp);
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Printing entries...");
+ for (b = 0; b + MONITOR_INTERVAL < ncells_total; b += MONITOR_INTERVAL) {
+ fprintf(stderr,"Printed %d entries\n",b);
+ for (i = 0; i < MONITOR_INTERVAL; i++) {
+ k = b + i;
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+ }
+ for (k = b; k < ncells_total; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ } else if (secondary_sort_method == ACC_SECONDARY_SORT) {
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Sorting entries by genomicpos...");
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Subsorting by accession and printing entries...");
+ i = 0;
+ while (i < n_mappers) {
+ j = i + 1;
+ while (j < n_mappers && cells[j]->genomicpos == cells[i]->genomicpos) {
+ j++;
+ }
+
+ if (j > i + 1) {
+ for (k = i; k < j; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ moveto(fp_sam,cells[k]->linestart);
+ cells[k]->acc = Samread_get_acc_fromfile(&acclength,fp_sam,cells[k]->linelen);
+ }
+
+ qsort(&(cells[i]),j - i,sizeof(T),Cell_accession_cmp);
+ for (k = i; k < j; k++) {
+ FREE(cells[k]->acc);
+ }
+ }
+
+ for (k = i; k < j; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+
+ i = j;
+ }
+
+ if (ncells_total > n_mappers + 1) {
+ for (k = n_mappers; k < ncells_total; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ moveto(fp_sam,cells[k]->linestart);
+ cells[k]->acc = Samread_get_acc_fromfile(&acclength,fp_sam,cells[k]->linelen);
+ }
+
+ qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_accession_cmp);
+ for (k = n_mappers; k < ncells_total; k++) {
+ FREE(cells[k]->acc);
+ }
+ }
+
+ for (k = n_mappers; k < ncells_total; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ } else if (secondary_sort_method == MATEFWD_SECONDARY_SORT || secondary_sort_method == MATEREV_SECONDARY_SORT) {
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Sorting entries by genomicpos...");
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Subsorting by mate position and printing entries...");
+ i = 0;
+ while (i < n_mappers) {
+ j = i + 1;
+ while (j < n_mappers && cells[j]->genomicpos == cells[i]->genomicpos) {
+ j++;
+ }
+
+ if (j > i + 1) {
+ for (k = i; k < j; k++) {
+ moveto(fp_sam,cells[k]->linestart);
+ cells[k]->mate_genomicpos = Samread_parse_mate_genomicpos_fromfile(fp_sam,chromosome_iit,chroffsets,cells[k]->linelen);
+ }
+
+ if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
+ qsort(&(cells[i]),j - i,sizeof(T),Cell_matefwd_cmp);
+ } else {
+ qsort(&(cells[i]),j - i,sizeof(T),Cell_materev_cmp);
+ }
+ }
+
+ for (k = i; k < j; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+
+ i = j;
+ }
+
+ if (ncells_total > n_mappers + 1) {
+ for (k = n_mappers; k < ncells_total; k++) {
+ moveto(fp_sam,cells[k]->linestart);
+ cells[k]->mate_genomicpos = Samread_parse_mate_genomicpos_fromfile(fp_sam,chromosome_iit,chroffsets,cells[k]->linelen);
+ }
+
+ if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
+ qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_matefwd_cmp);
+ } else {
+ qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_materev_cmp);
+ }
+ }
+
+ for (k = n_mappers; k < ncells_total; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ } else {
+ fprintf(stderr,"Secondary sort method not recognized\n");
+ abort();
+ }
+
+ FREE(cells);
+ FREE(cells_allocated);
+
+ return;
+}
+
+
+static int
+process_with_dups (FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
+ Intlist_T linelengths, int ncells_total, Univ_IIT_T chromosome_iit,
+ Univcoord_T *chroffsets, Filestring_T headers) {
+ FILE *fp_sam;
+ int filei, linei;
+ int nmarked = 0;
+ int n_mappers = 0, n_nomappers = 0;
+ T *cells, mate;
+ struct T *cells_allocated, *ptr;
+ int *queryseq5_index, *queryseq3_index;
+ int i, k;
+ int j, j_low, j_high, mate_allocated;
+
+ size_t fileposition;
+ int linelen;
+ Univcoord_T genomicpos;
+ char *hiti;
+
+ Intlist_T l;
+ unsigned int flag;
+ SAM_split_output_type split_output;
+ int initial_softclip;
+ char *acc, *last_acc, *read;
+ int lineindex, readindex, nreads;
+ int acclength, last_acclength, readlength;
+ bool query_lowp;
+ bool *duplicatep = NULL;
+ bool all_duplicates_p;
+
+
+
+ /* Actually, array lengths should be nreads, but we don't know that yet */
+ queryseq5_index = (int *) CALLOC(ncells_total,sizeof(int));
+ queryseq3_index = (int *) CALLOC(ncells_total,sizeof(int));
+
+ ptr = cells_allocated = (struct T *) MALLOC(ncells_total * sizeof(struct T));
+ cells = (T *) MALLOC(ncells_total * sizeof(T));
+ for (i = 0; i < ncells_total; i++) {
+ cells[i] = &(ptr[i]);
+ }
+
+
+ last_acc = MALLOC(sizeof(char));
+ last_acc[0] = '\0';
+ last_acclength = 0;
+ lineindex = -1;
+ readindex = -1; /* readindex is 0-based */
+
+ fprintf(stderr,"Reading SAM files...\n");
+
+ k = 0;
+ l = linelengths;
+ for (filei = 0; filei < ninputs; filei++) {
+ fprintf(stderr," Reading file %d...",filei+1);
+ fp_sam = sam_inputs[filei];
+ fileposition = headerlengths[filei];
+ for (linei = 0; linei < ncells[filei]; linei++) {
+ linelen = Intlist_head(l);
+ moveto(fp_sam,fileposition);
+ acc = Samread_parse_acc_and_softclip_fromfile(&acclength,&flag,&split_output,&hiti,
+ &genomicpos,&initial_softclip,&query_lowp,
+ fp_sam,chromosome_iit,chroffsets,linelen);
+ lineindex++;
+ if (acclength != last_acclength) {
+ readindex++;
+ } else if (strcmp(acc,last_acc)) {
+ readindex++;
+ }
+ FREE(last_acc);
+ last_acc = acc;
+ last_acclength = acclength;
+
+ if (flag & QUERY_UNMAPPED) {
+ n_nomappers++;
+ } else {
+ n_mappers++;
+ }
+
+ /* debug(printf("Read readindex %d, chrnum %d, chrpos %u, linelen %d\n",readindex,chrnum,chrpos,linelen)); */
+ if (flag & NOT_PRIMARY) {
+ /* Don't use secondary hit for accessing reads */
+
+ } else if (multiple_primaries_p == true) {
+#if 0
+ /* Now always parsed */
+ hiti = Samread_parse_aux_fromfile(fp_sam,/*auxfield*/"HI",linelen);
+#endif
+ if (strcmp(hiti,"1")) {
+ /* Don't use second or later primary hit for accessing reads */
+ } else if (flag & FIRST_READ_P) {
+ queryseq5_index[readindex] = k;
+ } else {
+ queryseq3_index[readindex] = k;
+ }
+
+ } else {
+ if (flag & FIRST_READ_P) {
+ queryseq5_index[readindex] = k;
+ } else {
+ queryseq3_index[readindex] = k;
+ }
+ }
+
+ FREE(hiti);
+ Cell_fill(cells[k++],lineindex,readindex,flag,split_output,query_lowp,initial_softclip,genomicpos,filei,fileposition,linelen);
+
+ fileposition += linelen;
+ l = Intlist_next(l);
+ }
+
+ fprintf(stderr,"done\n");
+ }
+ FREE(last_acc);
+
+ nreads = readindex + 1;
+ duplicatep = (bool *) CALLOC(nreads,sizeof(bool));
+
+ /* Sort entries, based on genomicpos_extend_softclip */
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Sorting SAM lines...");
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_extend_softclip_lowhigh_cmp);
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ /* Mark all duplicates within mappers, based on genomicpos_extend_softclip */
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Finding duplicates...");
+
+ i = 0;
+ while (i < n_mappers) {
+ j_low = i + 1;
+ while (j_low < n_mappers && cells[j_low]->genomicpos_extend_softclip == cells[i]->genomicpos_extend_softclip &&
+ cells[j_low]->low_read_p == true) {
+ j_low++;
+ }
+
+ j_high = j_low;
+ while (j_high < n_mappers && cells[j_high]->genomicpos_extend_softclip == cells[i]->genomicpos_extend_softclip) {
+ j_high++;
+ }
+
+ if (j_low > i + 1) {
+ debug(printf("\nFound multiple low hits from %d to (%d - 1) at genomicpos_extend_softclip %u\n",
+ i,j_low,cells[i]->genomicpos_extend_softclip));
+
+ /* Multiple low hits with same chrpos, so opportunity to mark duplicatep */
+ /* Find queryseqs for each */
+ for (k = i; k < j_low; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ debug9(printf("Looking for queryseqs for "));
+ debug9(Cell_print_fromfile(fp_sam,cells[k],headers));
+
+ if (cells[k]->flag & FIRST_READ_P) {
+ debug9(printf("Flag for entry %d is %u, indicating a first read\n",k,cells[k]->flag));
+ moveto(fp_sam,cells[k]->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+ if (cells[k]->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq5\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq5 = read;
+ debug9(printf("queryseq5 is %s\n",read));
+
+ mate_allocated = queryseq3_index[cells[k]->readindex];
+ mate = &(cells_allocated[mate_allocated]);
+
+ debug9(printf("Mate is "));
+ debug9(Cell_print_fromfile(fp_sam,mate,headers));
+ moveto(fp_sam,mate->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+ if (mate->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq3\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq3 = read;
+ debug9(printf("queryseq3 is %s\n",read));
+
+ } else {
+ debug9(printf("Flag for entry %d is %u, indicating a second read\n",k,cells[k]->flag));
+ moveto(fp_sam,cells[k]->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+ if (cells[k]->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq3\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq3 = read;
+ debug9(printf("queryseq3 is %s\n",read));
+
+ mate_allocated = queryseq5_index[cells[k]->readindex];
+ mate = &(cells_allocated[mate_allocated]);
+ debug9(printf("Mate is "));
+ debug9(Cell_print_fromfile(fp_sam,mate,headers));
+ moveto(fp_sam,mate->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+ if (mate->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq5\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq5 = read;
+ debug9(printf("queryseq5 is %s\n",read));
+ }
+
+ Cell_standardize_queryseqs(cells[k]);
+ }
+
+ qsort(&(cells[i]),j_low - i,sizeof(T),Cell_queryseq_cmp);
+
+ for (k = i + 1; k < j_low; k++) {
+ debug(printf("Comparing cell %d with %d => cmp %d\n",k,k-1,Cell_queryseq_cmp(&(cells[k]),&(cells[k-1]))));
+ debug(printf(" %s %s\n",cells[k-1]->queryseq_alpha1,cells[k-1]->queryseq_alpha2));
+ debug(printf(" %s %s\n",cells[k]->queryseq_alpha1,cells[k]->queryseq_alpha2));
+ if (Cell_queryseq_cmp(&(cells[k]),&(cells[k-1])) == 0) {
+ readindex = cells[k]->readindex;
+ duplicatep[readindex] = true;
+ if (mark_first_p == true) {
+ readindex = cells[k-1]->readindex;
+ duplicatep[readindex] = true;
+ }
+ }
+ }
+
+ for (k = i; k < j_low; k++) {
+ FREE(cells[k]->queryseq5);
+ FREE(cells[k]->queryseq3);
+ }
+ }
+
+
+ /* Also analyze high ends, to avoid a false negative when the initial_softclip extension is wrong
+ due to an end indel */
+ if (j_high > j_low + 1) {
+ all_duplicates_p = true;
+ for (k = j_low; k < j_high; k++) {
+ readindex = cells[k]->readindex;
+ if (duplicatep[readindex] == false) {
+ all_duplicates_p = false;
+ }
+ }
+
+ if (all_duplicates_p == false) {
+ debug(printf("\nFound multiple high hits from %d to (%d - 1) at genomicpos_extend_softclip %u\n",
+ j_low,j_high,cells[j_low]->genomicpos_extend_softclip));
+
+ /* Multiple high hits with same chrpos, so opportunity to mark duplicatep */
+ /* Find queryseqs for each */
+ for (k = j_low; k < j_high; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ debug9(printf("Looking for queryseqs for "));
+ debug9(Cell_print_fromfile(fp_sam,cells[k],headers));
+
+ if (cells[k]->flag & FIRST_READ_P) {
+ debug9(printf("Flag for entry %d is %u, indicating a first read\n",k,cells[k]->flag));
+ moveto(fp_sam,cells[k]->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+ if (cells[k]->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq5\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq5 = read;
+ debug9(printf("queryseq5 is %s\n",read));
+
+ mate_allocated = queryseq3_index[cells[k]->readindex];
+ mate = &(cells_allocated[mate_allocated]);
+ debug9(printf("Mate is "));
+ debug9(Cell_print_fromfile(fp_sam,mate,headers));
+ moveto(fp_sam,mate->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+ if (mate->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq3\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq3 = read;
+ debug9(printf("queryseq3 is %s\n",read));
+
+ } else {
+ debug9(printf("Flag for entry %d is %u, indicating a second read\n",k,cells[k]->flag));
+ moveto(fp_sam,cells[k]->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+ if (cells[k]->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq3\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq3 = read;
+ debug9(printf("queryseq3 is %s\n",read));
+
+ mate_allocated = queryseq5_index[cells[k]->readindex];
+ mate = &(cells_allocated[mate_allocated]);
+ debug9(printf("Mate is "));
+ debug9(Cell_print_fromfile(fp_sam,mate,headers));
+ moveto(fp_sam,mate->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+ if (mate->flag & QUERY_MINUSP) {
+ debug(printf("complementing queryseq5\n"));
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq5 = read;
+ debug9(printf("queryseq5 is %s\n",read));
+ }
+
+ Cell_standardize_queryseqs(cells[k]);
+ }
+
+ qsort(&(cells[j_low]),j_high - j_low,sizeof(T),Cell_queryseq_cmp);
+
+ for (k = j_low + 1; k < j_high; k++) {
+ debug(printf("Comparing cell %d with %d => cmp %d\n",k,k-1,Cell_queryseq_cmp(&(cells[k]),&(cells[k-1]))));
+ debug(printf(" %s %s\n",cells[k-1]->queryseq_alpha1,cells[k-1]->queryseq_alpha2));
+ debug(printf(" %s %s\n",cells[k]->queryseq_alpha1,cells[k]->queryseq_alpha2));
+ if (Cell_queryseq_cmp(&(cells[k]),&(cells[k-1])) == 0) {
+ readindex = cells[k]->readindex;
+ duplicatep[readindex] = true;
+ if (mark_first_p == true) {
+ readindex = cells[k-1]->readindex;
+ duplicatep[readindex] = true;
+ }
+ }
+ }
+
+ for (k = j_low; k < j_high; k++) {
+ FREE(cells[k]->queryseq5);
+ FREE(cells[k]->queryseq3);
+ }
+ }
+ }
+
+ i = j_high;
+ }
+
+ /* Mark all duplicates within nomappers, based on queryseq */
+ for (k = n_mappers; k < ncells_total; k++) {
+ if (duplicatep[cells[k]->readindex] == true) {
+ cells[k]->queryseq5 = cells[k]->queryseq3 = NULL; /* Will be sorted to end of list */
+
+ } else if (cells[k]->flag & FIRST_READ_P) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ moveto(fp_sam,cells[k]->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+ if (cells[k]->flag & QUERY_MINUSP) {
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq5 = read;
+
+ mate_allocated = queryseq3_index[cells[k]->readindex];
+ mate = &(cells_allocated[mate_allocated]);
+ moveto(fp_sam,mate->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+ if (mate->flag & QUERY_MINUSP) {
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq3 = read;
+
+ Cell_standardize_queryseqs(cells[k]);
+
+ } else {
+ fp_sam = sam_inputs[cells[k]->filei];
+ moveto(fp_sam,cells[k]->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+ if (cells[k]->flag & QUERY_MINUSP) {
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq3 = read;
+
+ mate_allocated = queryseq5_index[cells[k]->readindex];
+ mate = &(cells_allocated[mate_allocated]);
+ moveto(fp_sam,mate->linestart);
+ Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+ if (mate->flag & QUERY_MINUSP) {
+ make_complement_inplace(read,readlength);
+ }
+ cells[k]->queryseq5 = read;
+
+ Cell_standardize_queryseqs(cells[k]);
+ }
+ }
+
+ FREE(queryseq3_index);
+ FREE(queryseq5_index);
+
+
+ /* Sort non-mapping entries based on queryseqs */
+ qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_queryseq_cmp);
+
+ for (k = n_mappers + 1; k < ncells_total; k++) {
+ debug(printf("Comparing cell %d with %d => cmp %d\n",k,k-1,Cell_queryseq_cmp(&(cells[k]),&(cells[k-1]))));
+ if (Cell_queryseq_cmp(&(cells[k]),&(cells[k-1])) == 0) {
+ readindex = cells[k]->readindex;
+ duplicatep[readindex] = true;
+ if (mark_first_p == true) {
+ readindex = cells[k-1]->readindex;
+ duplicatep[readindex] = true;
+ }
+ }
+ }
+
+ for (k = n_mappers; k < ncells_total; k++) {
+ FREE(cells[k]->queryseq5);
+ FREE(cells[k]->queryseq3);
+ }
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+
+ /* Re-sort entries, based on genomicpos (not extended by initial_softclip), and secondary criterion */
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Re-sorting entries...");
+ if (restore_original_order_p == true) {
+ qsort(cells,ncells_total,sizeof(T),Cell_lineindex_cmp);
+
+ } else if (secondary_sort_method == NO_SECONDARY_SORT) {
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+
+ } else if (secondary_sort_method == ORIG_SECONDARY_SORT) {
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_linestart_cmp);
+
+ } else if (secondary_sort_method == ACC_SECONDARY_SORT) {
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+
+ i = 0;
+ while (i < ncells_total) {
+ j = i + 1;
+ while (j < ncells_total && cells[j]->genomicpos == cells[i]->genomicpos) {
+ j++;
+ }
+
+ if (j > i + 1) {
+ for (k = i; k < j; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ moveto(fp_sam,cells[k]->linestart);
+ cells[k]->acc = Samread_get_acc_fromfile(&acclength,fp_sam,cells[k]->linelen);
+ }
+ qsort(&(cells[i]),j - i,sizeof(T),Cell_accession_cmp);
+ for (k = i; k < j; k++) {
+ FREE(cells[k]->acc);
+ }
+ }
+
+ i = j;
+ }
+
+ } else if (secondary_sort_method == MATEFWD_SECONDARY_SORT ||
+ secondary_sort_method == MATEREV_SECONDARY_SORT) {
+ qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+
+ i = 0;
+ while (i < ncells_total) {
+ j = i + 1;
+ while (j < ncells_total && cells[j]->genomicpos == cells[i]->genomicpos) {
+ j++;
+ }
+
+ if (j > i + 1) {
+ for (k = i; k < j; k++) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ moveto(fp_sam,cells[k]->linestart);
+ cells[k]->mate_genomicpos = Samread_parse_mate_genomicpos_fromfile(fp_sam,chromosome_iit,chroffsets,cells[k]->linelen);
+ }
+ if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
+ qsort(&(cells[i]),j - i,sizeof(T),Cell_matefwd_cmp);
+ } else {
+ qsort(&(cells[i]),j - i,sizeof(T),Cell_materev_cmp);
+ }
+ }
+
+ i = j;
+ }
+ }
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ /* Print results */
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Printing results...");
+
+ for (k = 0; k < ncells_total; k++) {
+ if (duplicatep[cells[k]->readindex] == true) {
+ if (print_duplicates_p == true) {
+ fp_sam = sam_inputs[cells[k]->filei];
+ moveto(fp_sam,cells[k]->linestart);
+ Samread_print_as_duplicate_fromfile(fp_sam,cells[k]->linelen);
+ }
+ nmarked++;
+ } else {
+ if (print_unique_p == true) {
+ /* Non-duplicate */
+ fp_sam = sam_inputs[cells[k]->filei];
+ Cell_print_fromfile(fp_sam,cells[k],headers);
+ }
+ }
+ }
+ fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+ FREE(duplicatep);
+
+ FREE(cells);
+ FREE(cells_allocated);
+
+ return nmarked;
+}
+
+
+
+#define BUFFERLEN 1024
+
+int
+main (int argc, char *argv[]) {
+ FILE **sam_inputs, *fp_sam;
+ int ninputs, filei;
+ int nchromosomes, i;
+ Univcoord_T *chroffsets;
+ Chrpos_T *chrlengths;
+ size_t fileposition;
+ int lastchar;
+
+ char buffer[BUFFERLEN], *lastp, *p;
+ Intlist_T linelengths;
+ int *headerlengths, linelen;
+ Filestring_T headers = NULL;
+#ifdef DEBUG14
+ Intlist_T linelengths_goldstd;
+ int linelen_goldstd;
+#endif
+
+ char *fileroot = NULL, *iitfile;
+ Univ_IIT_T chromosome_iit;
+ int *ncells, ncells_total, nmarked;
+
+ int opt;
+ extern int optind;
+ extern char *optarg;
+ int long_option_index = 0;
+ const char *long_name;
+
+ while ((opt = getopt_long(argc,argv,"D:d:^?",
+ long_options,&long_option_index)) != -1) {
+ switch (opt) {
+ case 0:
+ long_name = long_options[long_option_index].name;
+ if (!strcmp(long_name,"version")) {
+ print_program_version();
+ exit(0);
+ } else if (!strcmp(long_name,"help")) {
+ print_program_usage();
+ exit(0);
+
+ } else if (!strcmp(long_name,"split-output")) {
+ split_output_root = optarg;
+ } else if (!strcmp(long_name,"append-output")) {
+ appendp = true;
+
+ } else if (!strcmp(long_name,"sort2")) {
+ if (!strcmp(optarg,"none")) {
+ secondary_sort_method = NO_SECONDARY_SORT;
+ } else if (!strcmp(optarg,"orig")) {
+ secondary_sort_method = ORIG_SECONDARY_SORT;
+ } else if (!strcmp(optarg,"accession")) {
+ secondary_sort_method = ACC_SECONDARY_SORT;
+ } else if (!strcmp(optarg,"mate-fwd")) {
+ secondary_sort_method = MATEFWD_SECONDARY_SORT;
+ } else if (!strcmp(optarg,"mate-rev")) {
+ secondary_sort_method = MATEREV_SECONDARY_SORT;
+ } else {
+ fprintf(stderr,"--sort2 must be none, orig, accession, mate-fwd, or mate-rev\n");
+ exit(9);
+ }
+
+ } else if (!strcmp(long_name,"mark-dups")) {
+ mark_duplicates_p = true;
+ print_unique_p = true;
+ print_duplicates_p = true;
+
+ } else if (!strcmp(long_name,"mark-first")) {
+ mark_first_p = true;
+
+ } else if (!strcmp(long_name,"dups-only")) {
+ mark_duplicates_p = true;
+ print_unique_p = false;
+ print_duplicates_p = true;
+
+ } else if (!strcmp(long_name,"uniq-only")) {
+ mark_duplicates_p = true;
+ print_unique_p = true;
+ print_duplicates_p = false;
+
+ } else if (!strcmp(long_name,"restore-orig-order")) {
+ restore_original_order_p = true;
+
+ } else if (!strcmp(long_name,"multiple-primaries")) {
+ multiple_primaries_p = true;
+
+ } else if (!strcmp(long_name,"no-sam-headers")) {
+ sam_headers_p = false;
+
+ } else {
+ /* Shouldn't reach here */
+ fprintf(stderr,"Don't recognize option %s. For usage, run 'get-genome --help'",long_name);
+ exit(9);
+ }
+ break;
+
+ case 'D': user_genomedir = optarg; break;
+ case 'd':
+ dbroot = (char *) CALLOC(strlen(optarg)+1,sizeof(char));
+ strcpy(dbroot,optarg);
+ break;
+
+ case '^': print_program_version(); exit(0);
+ case '?': print_program_usage(); exit(0);
+ default: exit(9);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (dbroot == NULL) {
+ print_program_usage();
+ exit(9);
+ } else if (!strcmp(dbroot,"?")) {
+ Datadir_avail_gmap_databases(stdout,user_genomedir);
+ exit(0);
+ } else {
+ genomesubdir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_genomedir,dbroot);
+ iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
+ strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
+ sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
+ chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false);
+ FREE(iitfile);
+
+ FREE(dbversion);
+ FREE(genomesubdir);
+ FREE(fileroot);
+ FREE(dbroot);
+
+ nchromosomes = Univ_IIT_total_nintervals(chromosome_iit);
+ chrlengths = Univ_IIT_chrlengths(chromosome_iit);
+ chroffsets = MALLOC(nchromosomes * sizeof(Univcoord_T));
+ chroffsets[0] = 0;
+ for (i = 1; i < nchromosomes; i++) {
+ chroffsets[i] = chroffsets[i-1] + chrlengths[i-1];
+ }
+ FREE(chrlengths);
+ }
+
+ /* Open all outputs, even if --split-output is not used */
+ outputs = (FILE **) CALLOC((1+N_SPLIT_OUTPUTS),sizeof(FILE *));
+
+
+ /* Inputs */
+ ninputs = argc;
+ sam_inputs = (FILE **) CALLOC(ninputs,sizeof(FILE *));
+ headerlengths = (int *) CALLOC(ninputs,sizeof(int));
+ ncells = (int *) CALLOC(ninputs,sizeof(int));
+ for (filei = 0; filei < ninputs; filei++) {
+ if ((sam_inputs[filei] = fopen(argv[filei],"r")) == NULL) {
+ fprintf(stderr,"Cannot open SAM file %s\n",argv[filei]);
+ exit(9);
+ }
+ }
+
+ stopwatch = Stopwatch_new();
+ Stopwatch_start(stopwatch);
+ fprintf(stderr,"Analyzing %d SAM files...\n",ninputs);
+
+ linelengths = (Intlist_T) NULL;
+ ncells_total = 0;
+ for (filei = 0; filei < ninputs; filei++) {
+ fp_sam = sam_inputs[filei];
+ fileposition = headerlengths[filei] = SAM_header_length(&lastchar,fp_sam); /* Ignore lastchar */
+
+ /* Take care of char read by SAM_header_length */
+#ifdef HAVE_FSEEKO
+ fseeko(fp_sam,-1,SEEK_CUR);
+#else
+ fseek(fp_sam,-1,SEEK_CUR);
+#endif
+
+ linelen = 0;
+ ncells[filei] = 0;
+ while (fgets(buffer,BUFFERLEN,fp_sam) != NULL) {
+ /* printf("Read %s\n",buffer); */
+ lastp = buffer;
+ while ((p = index(lastp,'\n')) != NULL) {
+ linelen += (p - lastp)/sizeof(char) + 1;
+
+ linelengths = Intlist_push(linelengths,linelen);
+ fileposition += linelen;
+ ncells[filei] += 1;
+
+ linelen = 0;
+ lastp = p + 1;
+ }
+ linelen += strlen(lastp);
+ /* printf("Adding %d to get linelen %d\n",strlen(buffer),linelen); */
+ }
+
+ ncells_total += ncells[filei];
+
+ if (fileposition != Access_filesize(argv[filei])) {
+ fprintf(stderr,"Something is wrong with parsing of SAM file %s\n",argv[filei]);
+ fprintf(stderr,"Final file position using sortinfo: %llu\n",(unsigned long long) fileposition);
+ fprintf(stderr,"File size of SAM output file: %llu\n",(unsigned long long) Access_filesize(argv[0]));
+ exit(9);
+ } else {
+ fprintf(stderr," File %d has %d SAM lines.\n",filei+1,ncells[filei]);
+ }
+
+ }
+
+ fprintf(stderr,"Done with analysis (%.1f seconds). Found %d SAM lines total.\n",
+ Stopwatch_stop(stopwatch),ncells_total);
+ Stopwatch_free(&stopwatch);
+
+ if (ncells_total == 0) {
+ /* Exit without printing header */
+
+ } else if (sam_headers_p == false) {
+ /* Don't print SAM headers */
+
+ } else {
+ moveto(sam_inputs[0],0);
+ headers = SAM_header_change_HD_tosorted(sam_inputs[0],headerlengths[0]);
+ }
+
+ linelengths = Intlist_reverse(linelengths);
+
+ if (mark_duplicates_p == false) {
+ process_without_dups(sam_inputs,headerlengths,ncells,ninputs,linelengths,ncells_total,
+ chromosome_iit,chroffsets,headers);
+ } else {
+ nmarked = process_with_dups(sam_inputs,headerlengths,ncells,ninputs,linelengths,ncells_total,
+ chromosome_iit,chroffsets,headers);
+ fprintf(stderr,"Marked %d out of %d SAM lines as duplicates (%.1f%%)\n",
+ nmarked,ncells_total,100.0*(double) nmarked/(double) (ncells_total));
+ }
+
+ for (filei = 0; filei < ninputs; filei++) {
+ fclose(sam_inputs[filei]);
+ }
+ FREE(sam_inputs);
+ FREE(headerlengths);
+ FREE(ncells);
+
+ if (headers != NULL) {
+ Filestring_free(&headers);
+ }
+
+ /* SAM_header_touch(outputs,split_output_root,appendp); -- Don't want to destroy other SAM files */
+ for (i = 1; i <= N_SPLIT_OUTPUTS; i++) {
+ if (outputs[i] != NULL) {
+ fclose(outputs[i]);
+ }
+ }
+ FREE(outputs);
+
+
+ Intlist_free(&linelengths);
+
+ FREE(chroffsets);
+ Univ_IIT_free(&chromosome_iit);
+
+ return 0;
+}
=====================================
src/samread.c
=====================================
@@ -0,0 +1,1890 @@
+static char rcsid[] = "$Id: samread.c 215476 2018-05-25 23:23:37Z twu $";
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* For strcpy */
+#include <strings.h> /* For rindex */
+#include <ctype.h>
+
+#include "except.h"
+#include "mem.h"
+#include "assert.h"
+#include "bool.h"
+
+#include "samread.h"
+#include "samflags.h"
+#include "chrnum.h"
+
+
+#ifdef DEBUG
+#define debug(x) x
+#else
+#define debug(x)
+#endif
+
+/* Prints location of parsing error for Unexpected output type */
+#ifdef DEBUG1
+#define debug1(x) x
+#else
+#define debug1(x)
+#endif
+
+
+static int
+cigar_string_readlength (int *hardclip_low, int *hardclip_high, char *cigar) {
+ int readlength;
+ unsigned int npos;
+ char *p, type;
+ bool firstp = true;
+
+ *hardclip_low = *hardclip_high = 0;
+ if (cigar[0] == '*') {
+ return 0;
+ } else {
+ readlength = 0;
+
+ p = cigar;
+ while (*p != '\0') {
+ if (sscanf(p,"%u",&npos) != 1) {
+ fprintf(stderr,"Unable to parse cigar %s. No number in %s\n",cigar,p);
+ exit(9);
+ }
+
+ while (*p != '\0' && isdigit(*p)) {
+ p++;
+ }
+ if (*p == '\0') {
+ fprintf(stderr,"Unable to parse cigar %s. No letter after number %u\n",cigar,npos);
+ exit(9);
+ } else {
+ type = *p++;
+ }
+
+ if (type == 'S' || type == 'M' || type == 'I') {
+ readlength += (int) npos;
+
+ } else if (type == 'H') {
+ if (firstp == true) {
+ *hardclip_low = npos;
+ } else {
+ *hardclip_high = npos;
+ }
+ readlength += (int) npos;
+
+ } else if (type == 'D' || type == 'N') {
+ /* Ignore */
+ } else if (type == 'P') {
+ /* Ignore */
+ } else {
+ fprintf(stderr,"samread.c cannot parse type %c\n",type);
+ exit(9);
+ }
+
+ firstp = false;
+ }
+ }
+
+ return readlength;
+}
+
+
+static int
+cigar_string_initial_softclip (char *cigar) {
+ unsigned int npos;
+ char *p, type;
+
+ if (cigar[0] == '*') {
+ return 0;
+ } else {
+ p = cigar;
+ while (*p != '\0') {
+ if (sscanf(p,"%u",&npos) != 1) {
+ fprintf(stderr,"Unable to parse cigar %s. No number in %s\n",cigar,p);
+ exit(9);
+ }
+
+ while (*p != '\0' && isdigit(*p)) {
+ p++;
+ }
+ if (*p == '\0') {
+ fprintf(stderr,"Unable to parse cigar %s. No letter after number %u\n",cigar,npos);
+ exit(9);
+ } else {
+ type = *p++;
+ }
+
+ if (type != 'S') {
+ return 0;
+ } else {
+ return (int) npos;
+ }
+ }
+
+ return 0;
+ }
+}
+
+
+#if 0
+char *
+Samread_get_acc (unsigned int *flag, char *line) {
+ char *acc, *p;
+ int length;
+
+ p = line;
+ while (*p != '\0' && *p != '\t') p++;
+ length = (p - line)/sizeof(char);
+ acc = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(acc,line,length);
+ acc[length] = '\0';
+
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse flag part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+ *flag = strtoul(p,NULL,10);
+
+ return acc;
+}
+#endif
+
+
+/* Takes advantage of the fact that sam_sort knows the linelength */
+char *
+Samread_get_acc_fromfile (int *acclength, FILE *fp, int linelength) {
+ char *acc, *p, c;
+
+ p = acc = MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0'; /* terminating char */
+ *acclength = (p - acc)/sizeof(char);
+
+ return acc;
+}
+
+
+/* Called just after we read in '\t', so should start at a field */
+static SAM_split_output_type
+parse_XO_fromfile (FILE *fp) {
+ char c = 1, c0, c1;
+ char abbrev0, abbrev1;
+
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == 'X' && c1 == 'O') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ abbrev0 = fgetc(fp);
+ abbrev1 = fgetc(fp);
+ switch (abbrev0) {
+ case 'N':
+ if (abbrev1 == 'M') {
+ return OUTPUT_NM;
+ } else {
+ debug1(fprintf(stderr,"parse_XO_from_file 1: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ return OUTPUT_NONE;
+ }
+ break;
+ case 'C':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_CU;
+ case 'C': return OUTPUT_CC;
+ case 'T': return OUTPUT_CT;
+ case 'M': return OUTPUT_CM;
+ case 'X': return OUTPUT_CX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 2: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'H':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_HU;
+ case 'C': return OUTPUT_HC;
+ case 'T': return OUTPUT_HT;
+ case 'M': return OUTPUT_HM;
+ case 'X': return OUTPUT_HX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 3: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'U':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_UU;
+ case 'C': return OUTPUT_UC;
+ case 'T': return OUTPUT_UT;
+ case 'M': return OUTPUT_UM;
+ case 'X': return OUTPUT_UX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 4: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'P':
+ switch (abbrev1) {
+ case 'C': return OUTPUT_PC;
+ case 'I': return OUTPUT_PI;
+ case 'S': return OUTPUT_PS;
+ case 'L': return OUTPUT_PL;
+ case 'M': return OUTPUT_PM;
+ case 'X': return OUTPUT_PX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 5: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 6: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ }
+
+ while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ if (c == EOF || c == '\0') {
+ return OUTPUT_NONE;
+ } else {
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == 'X' && c1 == 'O') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ abbrev0 = fgetc(fp);
+ abbrev1 = fgetc(fp);
+ switch (abbrev0) {
+ case 'N':
+ if (abbrev1 == 'M') {
+ return OUTPUT_NM;
+ } else {
+ debug1(fprintf(stderr,"parse_XO_from_file 7: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ return OUTPUT_NONE;
+ }
+ break;
+ case 'C':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_CU;
+ case 'C': return OUTPUT_CC;
+ case 'T': return OUTPUT_CT;
+ case 'M': return OUTPUT_CM;
+ case 'X': return OUTPUT_CX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 8: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'H':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_HU;
+ case 'C': return OUTPUT_HC;
+ case 'T': return OUTPUT_HT;
+ case 'M': return OUTPUT_HM;
+ case 'X': return OUTPUT_HX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 9: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'U':
+ switch (abbrev1) {
+ case 'U': return OUTPUT_UU;
+ case 'C': return OUTPUT_UC;
+ case 'T': return OUTPUT_UT;
+ case 'M': return OUTPUT_UM;
+ case 'X': return OUTPUT_UX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 10: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ case 'P':
+ switch (abbrev1) {
+ case 'C': return OUTPUT_PC;
+ case 'I': return OUTPUT_PI;
+ case 'S': return OUTPUT_PS;
+ case 'L': return OUTPUT_PL;
+ case 'M': return OUTPUT_PM;
+ case 'X': return OUTPUT_PX;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 11: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_from_file 12: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+ }
+ }
+ }
+
+ return OUTPUT_NONE;
+}
+
+
+#define HITI_MAXDIGITS 10
+
+/* Called just after we read in '\t', so should start at a field */
+static SAM_split_output_type
+parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
+ SAM_split_output_type split_output = OUTPUT_NONE;
+ char *p, c = 1, c0, c1;
+ char abbrev0, abbrev1;
+
+ *hiti = MALLOC((HITI_MAXDIGITS + 1) * sizeof(char));
+
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == 'H' && c1 == 'I') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+
+ p = *hiti;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ *--p = '\0'; /* terminating char */
+
+ } else if (c0 == 'X' && c1 == 'O') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ abbrev0 = fgetc(fp);
+ abbrev1 = fgetc(fp);
+ switch (abbrev0) {
+ case 'N':
+ if (abbrev1 == 'M') {
+ split_output = OUTPUT_NM;
+ } else {
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 1: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'C':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_CU; break;
+ case 'C': split_output = OUTPUT_CC; break;
+ case 'T': split_output = OUTPUT_CT; break;
+ case 'M': split_output = OUTPUT_CM; break;
+ case 'X': split_output = OUTPUT_CX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 2: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'H':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_HU; break;
+ case 'C': split_output = OUTPUT_HC; break;
+ case 'T': split_output = OUTPUT_HT; break;
+ case 'M': split_output = OUTPUT_HM; break;
+ case 'X': split_output = OUTPUT_HX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 3: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'U':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_UU; break;
+ case 'C': split_output = OUTPUT_UC; break;
+ case 'T': split_output = OUTPUT_UT; break;
+ case 'M': split_output = OUTPUT_UM; break;
+ case 'X': split_output = OUTPUT_UX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 4: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'P':
+ switch (abbrev1) {
+ case 'C': split_output = OUTPUT_PC; break;
+ case 'I': split_output = OUTPUT_PI; break;
+ case 'S': split_output = OUTPUT_PS; break;
+ case 'L': split_output = OUTPUT_PL; break;
+ case 'M': split_output = OUTPUT_PM; break;
+ case 'X': split_output = OUTPUT_PX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 5: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 6: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ }
+
+ while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ if (c == EOF || c == '\0') {
+ return split_output;
+ } else {
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == 'H' && c1 == 'I') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+
+ p = *hiti;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ *--p = '\0'; /* terminating char */
+
+ } else if (c0 == 'X' && c1 == 'O') {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ abbrev0 = fgetc(fp);
+ abbrev1 = fgetc(fp);
+ switch (abbrev0) {
+ case 'N':
+ if (abbrev1 == 'M') {
+ split_output = OUTPUT_NM;
+ } else {
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 7: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'C':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_CU; break;
+ case 'C': split_output = OUTPUT_CC; break;
+ case 'T': split_output = OUTPUT_CT; break;
+ case 'M': split_output = OUTPUT_CM; break;
+ case 'X': split_output = OUTPUT_CX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 8: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'H':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_HU; break;
+ case 'C': split_output = OUTPUT_HC; break;
+ case 'T': split_output = OUTPUT_HT; break;
+ case 'M': split_output = OUTPUT_HM; break;
+ case 'X': split_output = OUTPUT_HX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 9: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'U':
+ switch (abbrev1) {
+ case 'U': split_output = OUTPUT_UU; break;
+ case 'C': split_output = OUTPUT_UC; break;
+ case 'T': split_output = OUTPUT_UT; break;
+ case 'M': split_output = OUTPUT_UM; break;
+ case 'X': split_output = OUTPUT_UX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 10: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ case 'P':
+ switch (abbrev1) {
+ case 'C': split_output = OUTPUT_PC; break;
+ case 'I': split_output = OUTPUT_PI; break;
+ case 'S': split_output = OUTPUT_PS; break;
+ case 'L': split_output = OUTPUT_PL; break;
+ case 'M': split_output = OUTPUT_PM; break;
+ case 'X': split_output = OUTPUT_PX; break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 11: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ break;
+ default:
+ debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 12: "));
+ fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+ split_output = OUTPUT_NONE;
+ }
+ }
+ }
+
+ return split_output;
+}
+
+
+
+/* Main parser for processing with dups */
+char *
+Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM_split_output_type *split_output,
+ char **hiti, Univcoord_T *genomicpos, int *initial_softclip, bool *query_lowp,
+ FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) {
+ char *acc, *p, c;
+ char *substring;
+ Chrnum_T chrnum, mate_chrnum;
+ Chrpos_T chrpos, mate_chrpos;
+ Univcoord_T mate_genomicpos;
+
+ substring = MALLOCA((linelength + 1) * sizeof(char));
+
+ /* 1. QNAME */
+ p = acc = MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0'; /* terminating char */
+ *acclength = (p - acc)/sizeof(char);
+
+ /* 2. FLAG. Skip */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&(*flag)) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" flag = %u\n",*flag));
+ }
+
+ /* 3. RNAME: chr. Skip */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (!strcmp(substring,"*")) {
+ *genomicpos = 0;
+ } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+ fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
+ exit(9);
+ } else {
+ *genomicpos = chroffsets[chrnum - 1];
+ }
+
+ /* 4. POS: chrpos */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&chrpos) != 1) {
+ fprintf(stderr,"Unable to find chrpos in %s\n",substring);
+ abort();
+ } else {
+ *genomicpos += chrpos;
+ }
+
+ /* 5. MAPQ: Mapping quality. Skip */
+ while ((c = fgetc(fp)) != EOF && c != '\t') ;
+
+ /* 6. CIGAR. Parse for initial_softclip */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ *initial_softclip = cigar_string_initial_softclip(substring);
+
+
+ /* 7. MRNM: Mate chr */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (!strcmp(substring,"*")) {
+ mate_genomicpos = 0;
+ } else if (!strcmp(substring,"=")) {
+ mate_chrnum = chrnum;
+ mate_genomicpos = chroffsets[mate_chrnum - 1];
+ } else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+ fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
+ exit(9);
+ } else {
+ mate_genomicpos = chroffsets[mate_chrnum - 1];
+ }
+
+ /* 8. MPOS: Mate chrpos */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%d",&mate_chrpos) != 1) {
+ fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+ abort();
+ } else {
+ mate_genomicpos += mate_chrpos;
+ }
+
+
+ /* Determine if the query is low */
+ if (*genomicpos == 0) {
+ if (*flag & PAIRED_READ) {
+ if (mate_genomicpos != 0) {
+ /* Mate will be mapped, so this is the high end */
+ *query_lowp = false;
+ } else if (*flag & FIRST_READ_P) {
+ *query_lowp = true;
+ } else {
+ *query_lowp = false;
+ }
+ } else {
+ /* Single-end, so this is the low end */
+ *query_lowp = true;
+ }
+
+ } else if (mate_chrpos == 0) {
+ *query_lowp = true;
+
+ } else if (*genomicpos < mate_genomicpos) {
+ *query_lowp = true;
+
+ } else {
+ *query_lowp = false;
+
+ }
+
+ FREEA(substring);
+
+ /* 9. ISIZE: Insert size. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 10. SEQ: queryseq. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 11. QUAL: quality scores. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ *split_output = parse_XO_and_HI_fromfile(&(*hiti),fp);
+
+ return acc;
+}
+
+
+
+/* ILLUMINA-A1CCE9_0004:1:1:1103:6310#0 0 20 33639850 255 55M21S * 0 0 AAAAATTGTATACCGCAGATTCAGGCATGGATTCCGTGAAGGAACAACACCTAAANCCAAAGNTCGGAAGANCGGN CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCBCCBDBDCBCCDCDC at CCC&AAAA################ NM:i:2 */
+
+#if 0
+char *
+Samread_parse_line (char **acc, unsigned int *flag, int *mapq, char **chr, Chrpos_T *chrpos, char **cigar,
+ char **mate_chr, Chrpos_T *mate_chrpos_low,
+ int *readlength, char **read, char **quality_string, char *line) {
+ char *p, *q;
+ int length, i;
+
+ debug(printf("Entering Samread_parse_line with %s\n",line));
+
+ p = line;
+ while (!isspace(*p)) p++;
+ length = (p - line)/sizeof(char);
+ *acc = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(*acc,line,length);
+ (*acc)[length] = '\0';
+
+ if (*p != '\0') { /* Skip over tab */
+ p++;
+ }
+
+ if (sscanf(p,"%u",&(*flag)) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",p);
+ abort();
+ } else {
+ debug(printf(" flag = %u\n",*flag));
+ }
+
+ while (!isspace(*p)) p++; /* Skip over flag */
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse chr part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+ q = p;
+ while (!isspace(*q)) q++;
+ length = (q - p)/sizeof(char);
+ *chr = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(*chr,p,length);
+ (*chr)[length] = '\0';
+
+ debug(printf(" chr = %s\n",*chr));
+ if (*q != '\0') {
+ q++;
+ }
+
+
+ p = q;
+ if (sscanf(p,"%u",&(*chrpos)) != 1) {
+ fprintf(stderr,"Unable to find chrpos in %s\n",p);
+ abort();
+ } else {
+ debug(printf(" chrpos = %u\n",*chrpos));
+ }
+
+
+ while (!isspace(*p)) p++; /* Skip over chrpos */
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse chrpos part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+
+ /* Read mapping quality */
+ if (sscanf(p,"%d",&(*mapq)) != 1) {
+ fprintf(stderr,"Unable to find mapq in %s\n",p);
+ abort();
+ } else {
+ debug(printf(" mapq = %d\n",*mapq));
+ }
+
+ /* Skip past mapping quality */
+ while (!isspace(*p)) p++;
+
+
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse cigar part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+ q = p;
+ while (!isspace(*q)) q++;
+ length = (q - p)/sizeof(char);
+ *cigar = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(*cigar,p,length);
+ (*cigar)[length] = '\0';
+
+ debug(printf(" cigar = %s\n",*cigar));
+
+
+ /* mate chr */
+ p = q;
+ if (*p != '\0') {
+ p++; /* Should be a tab */
+ }
+ q = p;
+ while (!isspace(*q)) q++;
+ length = (q - p)/sizeof(char);
+ *mate_chr = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(*mate_chr,p,length);
+ (*mate_chr)[length] = '\0';
+
+ debug(printf(" mate_chr = %s\n",*mate_chr));
+ if (*q == '\0') {
+ fprintf(stderr,"Can't parse mate chr part of %s\n",line);
+ abort();
+ } else {
+ q++;
+ }
+
+ /* mate chrpos low */
+ p = q;
+ if (sscanf(p,"%u",&(*mate_chrpos_low)) != 1) {
+ fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",p);
+ abort();
+ } else {
+ debug(printf(" mate_chrpos_low = %u\n",*mate_chrpos_low));
+ }
+
+ while (!isspace(*p)) p++; /* Skip over mate_chrpos */
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse mate chrpos part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+
+
+ /* Skip over insert size */
+ while (!isspace(*p)) p++;
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse mate chrpos part of %s\n",line);
+ abort();
+ } else {
+ p++;
+ }
+
+
+ q = p;
+ while (!isspace(*q)) q++;
+ *readlength = (q - p)/sizeof(char);
+ if (*q == '\t') q++;
+ debug(printf(" readlength = %d\n",*readlength));
+
+ *read = (char *) MALLOC(((*readlength)+1)*sizeof(char));
+ strncpy(*read,p,*readlength);
+ (*read)[*readlength] = '\0';
+
+ debug(printf(" read = %s\n",*read));
+
+ p = q;
+ while (!isspace(*q)) q++;
+ length = (q - p)/sizeof(char);
+ *quality_string = (char *) MALLOC(((*readlength)+1)*sizeof(char));
+ if (length == *readlength) {
+ strncpy(*quality_string,p,length);
+ (*quality_string)[*readlength] = '\0';
+
+ } else {
+ for (i = 0; i < *readlength; i++) {
+ (*quality_string)[i] = ' ';
+ }
+ (*quality_string)[*readlength] = '\0';
+ }
+
+ if (*q == '\t') q++;
+
+ return q;
+}
+#endif
+
+#if 0
+/* Leaves fp at start of auxinfo */
+void
+Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq, char **chr, Chrpos_T *chrpos, char **cigar,
+ char **mate_chr, Chrpos_T *mate_chrpos_low,
+ int *readlength, char **read, char **quality_string, int linelength) {
+ char *p, c;
+ int length, i;
+ char *substring;
+
+ debug(printf("Entering Samread_parse_line_fromfile\n"));
+
+ substring = MALLOCA((linelength + 1) * sizeof(char));
+
+ /* 1. QNAME */
+ p = *acc = (char *) MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ /* 2. FLAG */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&(*flag)) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" flag = %u\n",*flag));
+ }
+
+ /* 3. RNAME: chr */
+ p = *chr = (char *) MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ /* 4. POS: chrpos */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&(*chrpos)) != 1) {
+ fprintf(stderr,"Unable to find chrpos in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" chrpos = %u\n",*chrpos));
+ }
+
+ /* 5. MAPQ: Mapping quality */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%d",&(*mapq)) != 1) {
+ fprintf(stderr,"Unable to find mapq in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" mapq = %d\n",*mapq));
+ }
+
+ /* 5. CIGAR */
+ p = *cigar = (char *) MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ /* 7. MRNM: Mate chr */
+ p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ /* 8. MPOS: Mate chrpos */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%d",&(*mate_chrpos_low)) != 1) {
+ fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" mate_chrpos_low = %u\n",*mate_chrpos_low));
+ }
+
+ /* 9. ISIZE: Insert size. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 10. SEQ: queryseq */
+ p = *read = (char *) MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+ *readlength = (p - *read)/sizeof(char);
+
+ /* 11. QUAL: quality scores */
+ p = *quality_string = (char *) MALLOC((*readlength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+ length = (p - *quality_string)/sizeof(char);
+
+ if (length != *readlength) {
+ for (i = 0; i < *readlength; i++) {
+ (*quality_string)[i] = ' ';
+ }
+ *quality_string[i] = '\0';
+ }
+
+ FREEA(substring);
+
+ return;
+}
+#endif
+
+
+int
+Samread_parse_linelen_fromfile (FILE *fp) {
+ int linelen = 0;
+
+ /* 1. QNAME: Skip */
+ while (!feof(fp) && fgetc(fp) != '\t') {
+ linelen++;
+ }
+ linelen++;
+
+ if (feof(fp)) {
+ return -1;
+ }
+
+ while (!feof(fp) && fgetc(fp) != '\n') {
+ linelen++;
+ }
+ linelen++;
+
+ return linelen;
+}
+
+
+/* Main parser for processing without dups */
+Univcoord_T
+Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_output_type *split_output,
+ Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) {
+ Univcoord_T genomicpos;
+ Chrnum_T chrnum;
+ Chrpos_T chrpos;
+ char *substring, *p, c;
+
+ substring = MALLOCA((linelength + 1) * sizeof(char));
+
+ /* 1. QNAME: Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 2. FLAG */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&(*flag)) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" flag = %u\n",*flag));
+ }
+
+ /* 3. RNAME: chr */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (!strcmp(substring,"*")) {
+ genomicpos = 0;
+ } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+ fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
+ exit(9);
+ } else {
+ genomicpos = chroffsets[chrnum - 1];
+ }
+
+ /* 4. POS: chrpos */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&chrpos) != 1) {
+ fprintf(stderr,"Unable to find chrpos in %s\n",substring);
+ abort();
+ } else {
+ genomicpos += chrpos;
+ }
+
+ FREEA(substring);
+
+ /* 5. MAPQ: Mapping quality. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 6. CIGAR. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 7. MRNM: Mate chr. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 8. MPOS: Mate chrpos. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 9. ISIZE: Insert size. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 10. SEQ: queryseq. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 11. QUAL: quality scores. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ *split_output = parse_XO_fromfile(fp);
+
+ return genomicpos;
+}
+
+
+char *
+Samread_parse_aux_fromfile (FILE *fp, char *auxfield, int linelength) {
+ char *value, *p, c = 1, c0, c1;
+
+ while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ if (c == '\0') {
+ return (char *) NULL;
+ } else {
+ c0 = fgetc(fp);
+ c1 = fgetc(fp);
+ if (c0 == auxfield[0] && c1 == auxfield[1]) {
+ fgetc(fp); /* : */
+ fgetc(fp); /* type */
+ fgetc(fp); /* : */
+ p = value = MALLOC((linelength+1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+ *--p = '\0'; /* terminating char */
+ return value;
+ }
+ }
+
+ return (char *) NULL;
+}
+
+
+
+#if 0
+/* Returns only fields needed by sam_sort */
+void
+Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **mate_chr, Chrpos_T *mate_chrpos,
+ int *readlength, char **read, int linelength) {
+ char *p, *q, c;
+ char *substring, *clipped;
+ int hardclip_low, hardclip_high, subseq_length;
+
+ substring = MALLOCA((linelength + 1) * sizeof(char));
+
+ /* 1. QNAME. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 2. FLAG */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&(*flag)) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" flag = %u\n",*flag));
+ }
+
+ /* 3. RNAME: chr. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 4. POS: chrpos. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 5. MAPQ: Mapping quality. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 6. CIGAR. Parse for cigar_readlength. */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ /* For a nomapper, this readlength is incorrect */
+ *readlength = cigar_string_readlength(&hardclip_low,&hardclip_high,substring);
+
+
+ /* 7. MRNM: Mate chr */
+ p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char));
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ /* 8. MPOS: Mate chrpos */
+w p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%d",&(*mate_chrpos)) != 1) {
+ fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" mate_chrpos_low = %u\n",*mate_chrpos));
+ }
+
+ /* 9. ISIZE: Insert size. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 10. SEQ: queryseq */
+ *read = (char *) MALLOC((linelength + 1) * sizeof(char));
+
+ p = &((*read)[hardclip_low]);
+ subseq_length = 0;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
+ subseq_length++;
+ }
+
+ if (*readlength == 0) {
+ *readlength = subseq_length;
+ }
+
+ if (subseq_length + hardclip_low + hardclip_high != *readlength) {
+ fprintf(stderr,"Cigar readlength %d is not consistent with subsequence length %d + hardclip_low %d + hardclip_high %d\n",
+ *readlength,subseq_length,hardclip_low,hardclip_high);
+ }
+
+ if (hardclip_low > 0 || hardclip_high > 0) {
+ if ((clipped = Samread_parse_aux_fromfile(fp,/*auxfield*/"XH",linelength)) == NULL) {
+ fprintf(stderr,"Hard-clipped read needs XH field from a recent GSNAP version\n");
+ exit(9);
+
+ } else {
+ if (hardclip_low > 0) {
+ p = &((*read)[0]);
+ q = clipped;
+ while ((c = *q++) != '\0') {
+ *p++ = c;
+ }
+ } else {
+ p = &((*read)[(*readlength) - hardclip_high]);
+ q = clipped;
+ while ((c = *q++) != '\0') {
+ *p++ = c;
+ }
+ }
+ FREE(clipped);
+
+ }
+ }
+ (*read)[*readlength] = '\0';
+
+ FREEA(substring);
+
+ return;
+}
+#endif
+
+
+void
+Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char **read, int linelength) {
+ char *p, *q, c;
+ char *substring, *clipped;
+ int hardclip_low, hardclip_high, subseq_length;
+
+ substring = MALLOCA((linelength + 1) * sizeof(char));
+
+ /* 1. QNAME. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 2. FLAG. Skip */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%u",&(*flag)) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",substring);
+ abort();
+ } else {
+ debug(printf(" flag = %u\n",*flag));
+ }
+
+ /* 3. RNAME: chr. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 4. POS: chrpos. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 5. MAPQ: Mapping quality. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 6. CIGAR. Parse for cigar_readlength. */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ /* For a nomapper, this readlength is incorrect */
+ *readlength = cigar_string_readlength(&hardclip_low,&hardclip_high,substring);
+
+
+ /* 7. MRNM: Mate chr. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 8. MPOS: Mate chrpos. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 9. ISIZE: Insert size. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 10. SEQ: queryseq */
+ *read = (char *) MALLOC((linelength + 1) * sizeof(char));
+
+ p = &((*read)[hardclip_low]);
+ subseq_length = 0;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
+ subseq_length++;
+ }
+
+ if (*readlength == 0) {
+ *readlength = subseq_length;
+ }
+
+ if (subseq_length + hardclip_low + hardclip_high != *readlength) {
+ fprintf(stderr,"Cigar readlength %d is not consistent with subsequence length %d + hardclip_low %d + hardclip_high %d\n",
+ *readlength,subseq_length,hardclip_low,hardclip_high);
+ }
+
+ if (hardclip_low > 0 || hardclip_high > 0) {
+ if ((clipped = Samread_parse_aux_fromfile(fp,/*auxfield*/"XH",linelength)) == NULL) {
+ fprintf(stderr,"Hard-clipped read needs XH field from a recent GSNAP version\n");
+ exit(9);
+
+ } else {
+ if (hardclip_low > 0) {
+ p = &((*read)[0]);
+ q = clipped;
+ while ((c = *q++) != '\0') {
+ *p++ = c;
+ }
+ } else {
+ p = &((*read)[(*readlength) - hardclip_high]);
+ q = clipped;
+ while ((c = *q++) != '\0') {
+ *p++ = c;
+ }
+ }
+ FREE(clipped);
+
+ }
+ }
+
+ (*read)[*readlength] = '\0';
+
+ FREEA(substring);
+
+ return;
+}
+
+
+Univcoord_T
+Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) {
+ Univcoord_T mate_genomicpos;
+ Chrpos_T mate_chrpos;
+ Chrnum_T chrnum, mate_chrnum;
+ char *p, c;
+ char *substring;
+
+ substring = MALLOCA((linelength + 1) * sizeof(char));
+
+ /* 1. QNAME. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 2. FLAG. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 3. RNAME: chr */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (!strcmp(substring,"*")) {
+ chrnum = -1;
+ } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+ fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ }
+
+ /* 4. POS: chrpos. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 5. MAPQ: Mapping quality. Skip */
+ while (fgetc(fp) != '\t') ;
+
+ /* 6. CIGAR. Parse for cigar_readlength. Skip. */
+ while (fgetc(fp) != '\t') ;
+
+ /* 7. MRNM: Mate chr */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (!strcmp(substring,"*")) {
+ mate_genomicpos = 0;
+ } else if (!strcmp(substring,"=")) {
+ mate_chrnum = chrnum;
+ mate_genomicpos = chroffsets[mate_chrnum - 1];
+ } else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+ fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+ fprintf(stderr,"Available chromosomes in the genome: ");
+ Univ_IIT_dump_labels(stderr,chromosome_iit);
+ exit(9);
+ } else {
+ mate_genomicpos = chroffsets[mate_chrnum - 1];
+ }
+
+ /* 8. MPOS: Mate chrpos */
+ p = substring;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+ *--p = '\0';
+
+ if (sscanf(substring,"%d",&mate_chrpos) != 1) {
+ fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+ abort();
+ } else {
+ mate_genomicpos += mate_chrpos;
+ }
+
+ FREEA(substring);
+
+ return mate_genomicpos;
+}
+
+
+#if 0
+char *
+Samread_chrinfo (Chrpos_T *chrpos, char **cigar, char *line) {
+ char *chr;
+ unsigned int flag;
+ int mapq;
+
+ char *p, *q;
+ int length;
+
+ debug(printf("Entering Samread_chrinfo with %s\n",line));
+
+ p = line;
+ while (!isspace(*p)) p++;
+ length = (p - line)/sizeof(char);
+#if 0
+ *acc = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(*acc,line,length);
+ (*acc)[length] = '\0';
+#endif
+
+ if (*p != '\0') { /* Skip over tab */
+ p++;
+ }
+
+ if (sscanf(p,"%u",&flag) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",p);
+ abort();
+ } else {
+ debug(printf(" flag = %u\n",*flag));
+ }
+
+ while (!isspace(*p)) p++; /* Skip over flag */
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse chr part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+ q = p;
+ while (!isspace(*q)) q++;
+ length = (q - p)/sizeof(char);
+ chr = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(chr,p,length);
+ chr[length] = '\0';
+
+ debug(printf(" chr = %s\n",chr));
+ if (*q != '\0') {
+ q++;
+ }
+
+
+ p = q;
+ if (sscanf(p,"%u",&(*chrpos)) != 1) {
+ fprintf(stderr,"Unable to find chrpos in %s\n",p);
+ abort();
+ } else {
+ debug(printf(" chrpos = %u\n",*chrpos));
+ }
+
+ while (!isspace(*p)) p++; /* Skip over chrpos */
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse chrpos part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+
+ /* Read mapping quality */
+ if (sscanf(p,"%d",&mapq) != 1) {
+ fprintf(stderr,"Unable to find mapq in %s\n",p);
+ abort();
+ } else {
+ debug(printf(" mapq = %d\n",mapq));
+ }
+
+ /* Skip past mapping quality */
+ while (!isspace(*p)) p++;
+
+
+ if (*p == '\0') {
+ fprintf(stderr,"Can't parse cigar part of %s\n",line);
+ abort();
+ } else {
+ p++; /* Skip over tab */
+ }
+ q = p;
+ while (!isspace(*q)) q++;
+ length = (q - p)/sizeof(char);
+ *cigar = (char *) MALLOC((length+1)*sizeof(char));
+ strncpy(*cigar,p,length);
+ (*cigar)[length] = '\0';
+
+ debug(printf(" cigar = %s\n",*cigar));
+
+ return chr;
+}
+#endif
+
+
+
+#define CHUNK 1024
+
+void
+Samread_print_as_duplicate_fromfile (FILE *fp, int linelength) {
+ int nread = 0;
+ char buffer[CHUNK], *p, c;
+ unsigned int flag;
+
+ /* 1. QNAME */
+ while ((c = fgetc(fp)) != EOF && c != '\t') {
+ putchar(c);
+ nread++;
+ }
+ putchar('\t');
+ nread++;
+
+ /* 2. FLAG */
+ p = buffer;
+ while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
+ nread++;
+ }
+ nread++;
+ *--p = '\0';
+
+ if (sscanf(buffer,"%u",&flag) != 1) {
+ fprintf(stderr,"Unable to find flag in %s\n",buffer);
+ abort();
+ } else {
+ printf("%u\t",flag | DUPLICATE_READ);
+ }
+
+ /* 3... Rest */
+ linelength -= nread;
+
+ while (linelength > CHUNK) {
+ fread(buffer,sizeof(char),CHUNK,fp);
+ fwrite(buffer,sizeof(char),CHUNK,stdout);
+ linelength -= CHUNK;
+ }
+ if (linelength > 0) {
+ fread(buffer,sizeof(char),linelength,fp);
+ fwrite(buffer,sizeof(char),linelength,stdout);
+ }
+
+ return;
+}
+
+
+#if 0
+char
+Samread_splice_strand (char *auxinfo) {
+ char *p;
+ char tag1, tag2;
+
+ debug(printf("Entering Samread_splice_strand with %s\n",auxinfo));
+
+ p = auxinfo;
+ while (*p != '\0' && *p != '\n') {
+ tag1 = p[0];
+ tag2 = p[1];
+
+ if (tag1 == 'X' && tag2 == 'S') {
+ debug(printf("Found tag XS\n"));
+ /* XS:A: */
+ p += 5;
+
+ if (*p == '+') {
+ return '+';
+ } else if (*p == '-') {
+ return '-';
+ } else if (*p == '?') {
+ return '?';
+ } else {
+ fprintf(stderr,"Cannot parse strand %c after XS tag\n",*p);
+ return ' ';
+ }
+ } else {
+ while (*p != '\0' && *p != '\t') {
+ p++;
+ }
+ if (*p == '\t') {
+ p++;
+ }
+ }
+ }
+
+ return ' ';
+}
+#endif
+
+
+#if 0
+Intlist_T
+Samread_parse_cigar (Uintlist_T *npositions, int *readlength, char *cigar) {
+ Intlist_T types = NULL;
+ unsigned int npos;
+ char *p, type;
+
+ *npositions = (Uintlist_T) NULL;
+ *readlength = 0;
+
+ if (cigar[0] == '*') {
+ return (Intlist_T) NULL;
+ }
+
+ p = cigar;
+ while (*p != '\0') {
+ if (sscanf(p,"%u",&npos) != 1) {
+ fprintf(stderr,"Unable to parse cigar %s. No number in %s\n",cigar,p);
+ abort();
+ } else {
+ *npositions = Uintlist_push(*npositions,npos);
+ }
+
+ while (*p != '\0' && isdigit(*p)) {
+ p++;
+ }
+ if (*p == '\0') {
+ fprintf(stderr,"Unable to parse cigar %s. No letter after number %u\n",cigar,npos);
+ exit(9);
+ } else {
+ type = *p++;
+ types = Intlist_push(types,(int) type);
+ }
+
+ if (type == 'S' || type == 'M' || type == 'I') {
+ *readlength += npos;
+ } else if (type == 'H') {
+ *readlength += npos;
+ } else if (type == 'D' || type == 'N') {
+ /* Ignore */
+ } else {
+ fprintf(stderr,"Unable to parse cigar %s. Do not recognize letter %c\n",cigar,type);
+ exit(9);
+ }
+ }
+
+ *npositions = Uintlist_reverse(*npositions);
+ return Intlist_reverse(types);
+}
+#endif
+
+
+#if 0
+void
+Samread_print_cigar (Intlist_T types, Uintlist_T npositions) {
+ Intlist_T p;
+ Uintlist_T q;
+
+ for (p = types, q = npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
+ printf("%u%c",Uintlist_head(q),Intlist_head(p));
+ }
+ return;
+}
+#endif
+
+
+#if 0
+Chrpos_T
+Samread_chrpos_high (Intlist_T types, Uintlist_T npositions, Chrpos_T chrpos_low) {
+ Intlist_T p;
+ Uintlist_T q;
+ Chrpos_T chrpos_high;
+ int type;
+
+ chrpos_high = chrpos_low;
+ for (p = types, q = npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
+ if ((type = Intlist_head(p)) == 'S') {
+ /* Ignore */
+
+ } else if (type == 'H') {
+ /* Ignore */
+
+ } else if (type == 'M') {
+ chrpos_high += Uintlist_head(q);
+
+ } else if (type == 'N') {
+ chrpos_high += Uintlist_head(q);
+
+ } else if (type == 'I') {
+ /* Do nothing */
+
+ } else if (type == 'D') {
+ /* CHECK */
+ chrpos_high += Uintlist_head(q);
+
+ } else {
+ fprintf(stderr,"Cannot parse type %c\n",type);
+ exit(9);
+ }
+ debug(printf(" type = %c, chrpos = %u\n",type,chrpos_high));
+ }
+
+ return chrpos_high - 1U;
+}
+#endif
+
+
+#if 0
+int
+Samread_get_query_coordinates (int *query5, int *query3, Intlist_T types, Uintlist_T npositions,
+ int readlength, char *cigar) {
+ int validlength;
+ Intlist_T p;
+ Uintlist_T q;
+ int type;
+
+ *query5 = 1; /* 1-based */
+ *query3 = readlength;
+ validlength = 0;
+
+ p = types;
+ q = npositions;
+ while (p != NULL) {
+ if ((type = Intlist_head(p)) == 'S') {
+ if (p == types) {
+ *query5 = Uintlist_head(q) + 1; /* 1-based */
+ } else if (Intlist_next(p) == NULL) {
+ *query3 = readlength - Uintlist_head(q);
+ } else {
+ fprintf(stderr,"Cannot parse cigar %s. Type S occurs in middle\n",cigar);
+ exit(9);
+ }
+ } else if (type == 'H') {
+ /* Do nothing */
+ } else if (type == 'M') {
+ validlength += Uintlist_head(q);
+ } else if (type == 'N') {
+ /* Do nothing */
+ } else if (type == 'I') {
+ validlength += Uintlist_head(q);
+ } else if (type == 'D') {
+ /* Do nothing */
+ }
+ p = Intlist_next(p);
+ q = Uintlist_next(q);
+ }
+
+ debug(printf("Got query %d to %d, with length %d\n",*query5,*query3,validlength));
+ if (validlength != (*query3) - (*query5) + 1) {
+ fprintf(stderr,"Validlength %d from cigar != %d - %d + 1\n",validlength,*query3,*query5);
+ abort();
+ }
+
+ return validlength;
+}
+#endif
+
+
+
+#if 0
+int
+get_substrings (int *querylength, int **query_starts, Chrpos_T **genomic_starts, Chrpos_T **genomic_ends,
+ char *cigar, Chrpos_T chrpos_low) {
+ int nsubstrings = 0;
+ unsigned int npos;
+ char *p, type;
+
+ int querypos = 0;
+ Chrpos_T genomicpos = chrpos_low;
+ Intlist_T query_starts_list = NULL;
+ Uintlist_T genomic_starts_list = NULL, genomic_ends_list = NULL;
+
+ if (cigar[0] == '*') {
+ *querylength = 0;
+ *query_starts = (int *) NULL;
+ *genomic_starts = (Chrpos_T *) NULL;
+ *genomic_ends = (Chrpos_T *) NULL;
+ return 0;
+ }
+
+ query_starts_list = Intlist_push(NULL,querypos);
+ genomic_starts_list = Uintlist_push(NULL,genomicpos);
+
+ p = cigar;
+ while (*p != '\0') {
+ if (sscanf(p,"%u",&npos) != 1) {
+ fprintf(stderr,"Unable to parse cigar %s in get_substrings. No number in %s\n",cigar,p);
+ abort();
+ }
+
+ while (*p != '\0' && isdigit(*p)) {
+ p++;
+ }
+ if (*p == '\0') {
+ fprintf(stderr,"Unable to parse cigar %s. No letter after number %u\n",cigar,npos);
+ exit(9);
+ } else {
+ type = *p++;
+ }
+
+ if (type == 'S') {
+ querypos += npos;
+
+ } else if (type == 'M') {
+ querypos += npos;
+ genomicpos += npos;
+
+ } else if (type == 'I') {
+ querypos += npos;
+
+ } else if (type == 'H') {
+ /* ? querypos += npos; */
+
+ } else if (type == 'D') {
+ genomicpos += npos;
+
+ } else if (type == 'N') {
+ genomic_ends_list = Uintlist_push(genomic_ends_list,genomicpos);
+ /* nsubstrings++; */
+
+ genomicpos += npos;
+
+ query_starts_list = Intlist_push(query_starts_list,querypos);
+ genomic_starts_list = Uintlist_push(genomic_starts_list,genomicpos);
+
+ } else {
+ fprintf(stderr,"Unable to parse cigar %s. Do not recognize letter %c\n",cigar,type);
+ exit(9);
+ }
+ }
+
+ *querylength = querypos;
+ genomic_ends_list = Uintlist_push(genomic_ends_list,genomicpos);
+ /* nsubstrings++; */
+
+
+ /* Convert lists to arrays */
+ query_starts_list = Intlist_reverse(query_starts_list);
+ *query_starts = Intlist_to_array(&nsubstrings,query_starts_list);
+ Intlist_free(&query_starts_list);
+
+ genomic_starts_list = Uintlist_reverse(genomic_starts_list);
+ *genomic_starts = Uintlist_to_array(&nsubstrings,genomic_starts_list);
+ Uintlist_free(&genomic_starts_list);
+
+ genomic_ends_list = Uintlist_reverse(genomic_ends_list);
+ *genomic_ends = Uintlist_to_array(&nsubstrings,genomic_ends_list);
+ Uintlist_free(&genomic_ends_list);
+
+ return nsubstrings;
+}
+#endif
+
+
+
+#if 0
+int
+Samread_compute_insert_length (int *querylength5, int *querylength3,
+ char *cigar5, Chrpos_T chrpos_low_5, char *cigar3, Chrpos_T chrpos_low_3) {
+ int insert_length;
+ int nsubstrings5, nsubstrings3, i, j;
+ int *query_starts_5, *query_starts_3;
+ Chrpos_T *genomic_starts_5, *genomic_ends_5, *genomic_starts_3, *genomic_ends_3;
+ Chrpos_T pos5, pos3;
+
+ if (cigar5[0] == '*' || cigar3[0] == '*') {
+ return 0;
+ }
+
+ nsubstrings5 = get_substrings(&(*querylength5),&query_starts_5,&genomic_starts_5,&genomic_ends_5,cigar5,chrpos_low_5);
+ nsubstrings3 = get_substrings(&(*querylength3),&query_starts_3,&genomic_starts_3,&genomic_ends_3,cigar3,chrpos_low_3);
+
+ for (i = 0; i < nsubstrings5; i++) {
+ for (j = 0; j < nsubstrings3; j++) {
+ if (genomic_ends_5[i] < genomic_starts_3[j]) {
+ /* No overlap */
+ } else if (genomic_starts_5[i] > genomic_ends_3[j]) {
+ /* No overlap */
+ } else {
+ pos5 = genomic_starts_5[i] - query_starts_5[i];
+ pos3 = genomic_starts_3[j] - query_starts_3[j];
+
+ FREE(query_starts_5);
+ FREE(genomic_starts_5);
+ FREE(genomic_ends_5);
+ FREE(query_starts_3);
+ FREE(genomic_starts_3);
+ FREE(genomic_ends_3);
+
+ if (pos5 > pos3) {
+ return (int) (pos5 - pos3);
+ } else {
+ return (int) (pos3 - pos5);
+ }
+ }
+ }
+ }
+
+ if (genomic_ends_5[nsubstrings5-1] < genomic_starts_3[0]) {
+ insert_length = genomic_starts_3[0] - genomic_ends_5[nsubstrings5-1] + (*querylength5) + (*querylength3);
+ } else if (genomic_ends_3[nsubstrings3-1] < genomic_starts_5[0]) {
+ insert_length = genomic_starts_5[0] - genomic_ends_3[nsubstrings3-1] + (*querylength5) + (*querylength3);
+ } else {
+ insert_length = 0;
+ }
+
+ FREE(query_starts_5);
+ FREE(genomic_starts_5);
+ FREE(genomic_ends_5);
+
+ FREE(query_starts_3);
+ FREE(genomic_starts_3);
+ FREE(genomic_ends_3);
+
+ return insert_length;
+}
+#endif
+
+
=====================================
src/samread.h
=====================================
@@ -0,0 +1,40 @@
+/* $Id: samread.h 154089 2014-11-25 21:03:16Z twu $ */
+#ifndef SAMREAD_INCLUDED
+#define SAMREAD_INCLUDED
+#include <stdio.h>
+#include "samflags.h"
+#include "genomicpos.h"
+#include "intlist.h"
+#include "uintlist.h"
+#include "iit-read-univ.h"
+
+
+extern char *
+Samread_get_acc_fromfile (int *acclength, FILE *fp, int linelength);
+
+extern char *
+Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM_split_output_type *split_output,
+ char **hiti, Univcoord_T *genomicpos, int *initial_softclip, bool *query_lowp,
+ FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength);
+
+extern int
+Samread_parse_linelen_fromfile (FILE *fp);
+
+extern Univcoord_T
+Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_output_type *split_output,
+ Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength);
+
+extern void
+Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char **read, int linelength);
+
+extern Univcoord_T
+Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength);
+
+extern char *
+Samread_parse_aux_fromfile (FILE *fp, char *auxfield, int linelength);
+
+extern void
+Samread_print_as_duplicate_fromfile (FILE *fp, int linelength);
+
+#endif
+
=====================================
src/stage3hr.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3hr.c 222929 2020-06-29 16:09:28Z twu $";
+static char rcsid[] = "$Id: stage3hr.c 223081 2020-09-13 14:21:03Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -5010,7 +5010,7 @@ Stage3end_new_substitution (int *found_score_overall, int *found_score_within_tr
mismatch_positions_alloc,query_compress,chroffset,sensedir);
splice_queryend_p = Substring_qend_trim(&qend,&splicetype_queryend,&ambig_prob_queryend,
univdiagonal,pos5,querylength,plusp,genestrand,
- mismatch_positions_alloc,query_compress,chroffset,sensedir);
+ mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
debug0(printf("Trimming querystart yields splicep %d, qstart %d, prob %f\n",splice_querystart_p,qstart,ambig_prob_querystart));
debug0(printf("Trimming queryend yields splicep %d, qend %d, prob %f\n",splice_queryend_p,qend,ambig_prob_queryend));
@@ -5046,7 +5046,7 @@ Stage3end_new_substitution (int *found_score_overall, int *found_score_within_tr
/* trim_querystart and trim_queryend Genome_count_mismatches_substring are flipped, but not for Substring_new */
splice_querystart_p = Substring_qend_trim(&qend,&splicetype_querystart,&ambig_prob_querystart,
univdiagonal,pos5,querylength,plusp,genestrand,
- mismatch_positions_alloc,query_compress,chroffset,sensedir);
+ mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
splice_queryend_p = Substring_qstart_trim(&qstart,&splicetype_queryend,&ambig_prob_queryend,
univdiagonal,pos3,querylength,plusp,genestrand,
mismatch_positions_alloc,query_compress,chroffset,sensedir);
=====================================
src/substring.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: substring.c 222930 2020-06-29 16:10:11Z twu $";
+static char rcsid[] = "$Id: substring.c 223080 2020-09-13 14:20:25Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -1157,12 +1157,13 @@ embellish_genomic_sam (char *result, char *genomic_diff, char *query, int querys
/* Still takes left as a parameter */
int
Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
- Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
- bool plusp, int genestrand) {
+ Compress_T query_compress, Univcoord_T left, Univcoord_T chroffset,
+ int pos5, int pos3, int querylength, bool plusp, int genestrand) {
int score = 0;
int trimpos = pos5, pos, prevpos, i;
int *mismatch_positions = mismatch_positions_alloc;
int alignlength = pos3 - pos5;
+ Univcoord_T univdiagonal;
debug8(printf("Entered Substring_trim_qstart_nosplice with pos5 %d, pos3 %d, mismatch_scores %d/%d, match_score %d\n",
@@ -1222,8 +1223,9 @@ Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
/* score = 0; */
}
- if (novelsplicingp == false && trimpos == 0) {
- /* For DNA-seq, take the initial mismatch at the beginning of the read */
+ if (novelsplicingp == false && trimpos == 0 &&
+ left + (Univcoord_T) querylength >= chroffset + (Univcoord_T) querylength) {
+ /* For DNA-seq, if still within chromosome, take the initial mismatch at the beginning of the read */
debug8(printf("Backing up 1 bp to start => trimpos %d\n",0));
return 0;
} else {
@@ -1236,8 +1238,8 @@ Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
/* Still takes left as a parameter */
int
Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
- Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
- int querylength, bool plusp, int genestrand) {
+ Compress_T query_compress, Univcoord_T left, Univcoord_T chrhigh,
+ int pos5, int pos3, int querylength, bool plusp, int genestrand) {
int score = 0;
int trimpos = pos3, pos, prevpos, i;
int *mismatch_positions = mismatch_positions_alloc;
@@ -1301,8 +1303,9 @@ Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
/* score = 0; */
}
- if (novelsplicingp == false && trimpos == querylength - 1) {
- /* For DNA-seq, take the final mismatch at end of read */
+ if (novelsplicingp == false && trimpos == querylength - 1 &&
+ left + (Univcoord_T) querylength <= chrhigh) {
+ /* For DNA-seq, if still within the chromosome, take the final mismatch at end of read */
debug8(printf("Advancing 1 bp to end => trimpos %d\n",pos3));
return querylength;
} else {
@@ -1325,14 +1328,15 @@ Substring_trimmed_qstarts (int *result, Splicetype_T *splicetype, int **ambig_qs
int nspliceends;
int nmismatches;
+
debug8(printf("\n***Entered Substring_trimmed_qstarts with univdiagonal %u, qend %d..%d, plusp %d, sensedir %d\n",
univdiagonal-chroffset,0,qend,plusp,sensedir));
left = univdiagonal - (Univcoord_T) querylength;
pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
- trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,left,
- pos5,/*pos3*/qend,plusp,genestrand);
+ trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
+ left,chroffset,pos5,/*pos3*/qend,querylength,plusp,genestrand);
debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qstart*/0));
if (trimpos >= qend) {
@@ -1401,7 +1405,7 @@ Substring_qstart_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_pro
pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
*trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
- left,pos5,pos3,plusp,genestrand);
+ left,chroffset,pos5,pos3,querylength,plusp,genestrand);
debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos5));
if (*trimpos >= pos3) {
@@ -1470,7 +1474,7 @@ bool
Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qends, double **ambig_probs_3,
Univcoord_T univdiagonal, int qstart, int querylength,
bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
- Univcoord_T chroffset, int sensedir) {
+ Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
Univcoord_T left;
int trimpos, pos3;
int nspliceends;
@@ -1482,8 +1486,8 @@ Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qend
left = univdiagonal - (Univcoord_T) querylength;
pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
- trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,left,
- /*pos5*/qstart,pos3,querylength,plusp,genestrand);
+ trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
+ left,chrhigh,/*pos5*/qstart,pos3,querylength,plusp,genestrand);
debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qend*/querylength));
if (trimpos <= qstart) {
@@ -1536,7 +1540,7 @@ bool
Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qend,
Univcoord_T univdiagonal, int pos5, int querylength, bool plusp, int genestrand,
int *mismatch_positions_alloc, Compress_T query_compress,
- Univcoord_T chroffset, int sensedir) {
+ Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
Univcoord_T left;
int pos3;
int nspliceends;
@@ -1551,7 +1555,7 @@ Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_
pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
*trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
- left,pos5,pos3,querylength,plusp,genestrand);
+ left,chrhigh,pos5,pos3,querylength,plusp,genestrand);
debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos3));
if (*trimpos <= pos5) {
@@ -2206,7 +2210,7 @@ Substring_extend_anchor_querystart (T anchor, T substring, int *mismatch_positio
/*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
anchor->querylength,/*plusp*/false,anchor->genestrand,
mismatch_positions_alloc,query_compress,
- anchor->chroffset,anchor->sensedir);
+ anchor->chroffset,anchor->chrhigh,anchor->sensedir);
/* printf("Substring_extend_anchor_querystart, minus, yields splicep %d, qend %d, prob %f\n",
splice_querystart_p,qend,ambig_prob_querystart); */
@@ -2270,7 +2274,7 @@ Substring_extend_anchor_queryend (T anchor, T substring, int *mismatch_positions
/*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
anchor->querylength,/*plusp*/true,anchor->genestrand,
mismatch_positions_alloc,query_compress,
- anchor->chroffset,anchor->sensedir);
+ anchor->chroffset,anchor->chrhigh,anchor->sensedir);
/* printf("Substring_extend_anchor_queryend, plus, yields splicep %d, qend %d, prob %f\n",
splice_queryend_p,qend,ambig_prob_queryend); */
=====================================
src/substring.h
=====================================
@@ -1,4 +1,4 @@
-/* $Id: substring.h 222930 2020-06-29 16:10:11Z twu $ */
+/* $Id: substring.h 223080 2020-09-13 14:20:25Z twu $ */
#ifndef SUBSTRING_INCLUDED
#define SUBSTRING_INCLUDED
@@ -60,12 +60,12 @@ Substring_overlap_segment_trimmed (T substring1, T substring2);
extern int
Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
- Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
- bool plusp,int genestrand);
+ Compress_T query_compress, Univcoord_T left, Univcoord_T chroffset,
+ int pos5, int pos3, int querylength, bool plusp, int genestrand);
extern int
Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
- Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
- int querylength, bool plusp, int genestrand);
+ Compress_T query_compress, Univcoord_T left, Univcoord_T chrhigh,
+ int pos5, int pos3, int querylength, bool plusp, int genestrand);
extern bool
Substring_trimmed_qstarts (int *result, Splicetype_T *splicetype, int **ambig_qstarts, double **ambig_probs_5,
@@ -82,13 +82,13 @@ extern bool
Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qends, double **ambig_probs_3,
Univcoord_T univdiagonal, int qstart, int querylength,
bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
- Univcoord_T chroffset, int sensedir);
+ Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir);
extern bool
Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qend,
Univcoord_T univdiagonal, int pos5, int querylength, bool plusp, int genestrand,
int *mismatch_positions_alloc, Compress_T query_compress,
- Univcoord_T chroffset, int sensedir);
+ Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir);
extern int
Substring_compute_nmatches (int *ref_nmatches_plus_spliced_trims, Univcoord_T left,
=====================================
src/terminal.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: terminal.c 222933 2020-06-29 16:12:14Z twu $";
+static char rcsid[] = "$Id: terminal.c 223081 2020-09-13 14:21:03Z twu $";
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -139,9 +139,11 @@ find_terminals (List_T *sense_terminals, List_T *antisense_terminals, List_T elt
/* TODO: Consider trimming separately for SENSE_FORWARD and SENSE_ANTI */
pos5_trimmed = Substring_trim_qstart_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
- query_compress,segment_left,pos5,pos3,plusp,genestrand);
+ query_compress,segment_left,chroffset,
+ pos5,pos3,querylength,plusp,genestrand);
pos3_trimmed = Substring_trim_qend_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
- query_compress,segment_left,pos5,pos3,querylength,plusp,genestrand);
+ query_compress,segment_left,chrhigh,
+ pos5,pos3,querylength,plusp,genestrand);
if (plusp == true) {
querystart = pos5_trimmed;
queryend = pos3_trimmed;
@@ -206,9 +208,11 @@ find_terminals (List_T *sense_terminals, List_T *antisense_terminals, List_T elt
/* TODO: Consider trimming separately for SENSE_FORWARD and SENSE_ANTI */
pos5_trimmed = Substring_trim_qstart_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
- query_compress,segment_left,pos5,pos3,plusp,genestrand);
+ query_compress,segment_left,chroffset,
+ pos5,pos3,querylength,plusp,genestrand);
pos3_trimmed = Substring_trim_qend_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
- query_compress,segment_left,pos5,pos3,querylength,plusp,genestrand);
+ query_compress,segment_left,chrhigh,
+ pos5,pos3,querylength,plusp,genestrand);
if (plusp == true) {
querystart = pos5_trimmed;
queryend = pos3_trimmed;
=====================================
util/gmap_build.pl.in
=====================================
@@ -1,5 +1,5 @@
#! @PERL@
-# $Id: gmap_build.pl.in 222761 2020-06-01 00:24:12Z twu $
+# $Id: gmap_build.pl.in 223082 2020-09-13 14:21:33Z twu $
use warnings;
@@ -531,7 +531,7 @@ sub print_usage {
gmap_build: Builds a gmap database for a genome to be used by GMAP or GSNAP.
Part of GMAP package, version $package_version.
-Usage: gmap_build [options...] -d <genome> [-c <transcriptome> -t <transcript_fasta>] <genome_fasta_files>
+Usage: gmap_build [options...] -d <genome> [-c <transcriptome> -T <transcript_fasta>] <genome_fasta_files>
You are free to name <genome> and <transcriptome> as you wish. You
will use the same names when performing alignments subsequently using
View it on GitLab: https://salsa.debian.org/med-team/gmap/-/commit/56daea0c64f8224fa212b8030f22ac30b7d175e2
--
View it on GitLab: https://salsa.debian.org/med-team/gmap/-/commit/56daea0c64f8224fa212b8030f22ac30b7d175e2
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200918/2426a327/attachment-0001.html>
More information about the debian-med-commit
mailing list