[med-svn] [Git][med-team/gmap][upstream] New upstream version 2020-09-12+ds

Alexandre Mestiashvili gitlab at salsa.debian.org
Fri Sep 18 12:42:05 BST 2020



Alexandre Mestiashvili pushed to branch upstream at Debian Med / gmap


Commits:
56daea0c by Alexandre Mestiashvili at 2020-09-18T12:00:58+02:00
New upstream version 2020-09-12+ds
- - - - -


13 changed files:

- ChangeLog
- VERSION
- src/Makefile.am
- src/distant-rna.c
- src/path-solve.c
- + src/sam_sort.c
- + src/samread.c
- + src/samread.h
- src/stage3hr.c
- src/substring.c
- src/substring.h
- src/terminal.c
- util/gmap_build.pl.in


Changes:

=====================================
ChangeLog
=====================================
@@ -1,3 +1,19 @@
+2020-09-17  twu
+
+    * Makefile.gsnaptoo.am: Including sam_sort again
+
+2020-09-13  twu
+
+    * VERSION, index.html: Updated version number
+
+    * path-solve.c, distant-rna.c, stage3hr.c, terminal.c: Using new interfaces
+      to Substring trim functions
+
+    * gmap_build.pl.in: Fixed flag in help output
+
+    * substring.c, substring.h: For DNA-seq, not allowing extension of last
+      mismatch at end if it extends beyond chromosomal bounds
+
 2020-07-10  twu
 
     * pair.c: Added semicolon between Dir and coverage in gff3 output


=====================================
VERSION
=====================================
@@ -1 +1 @@
-2020-06-30
\ No newline at end of file
+2020-09-12
\ No newline at end of file


=====================================
src/Makefile.am
=====================================
@@ -42,10 +42,10 @@ EXTRA_DIST = mpidebug.c mpidebug.h master.c master.h
 #endif
 
 
-# sam_sort
 bin_PROGRAMS = cpuid gmap gmapl get-genome gmapindex indexdb_cat \
                iit_store iit_get iit_dump \
-               gsnap gsnapl snpindex cmetindex atoiindex trindex
+               gsnap gsnapl snpindex cmetindex atoiindex trindex \
+               sam_sort
 
 bin_PROGRAMS += gmap.nosimd
 bin_PROGRAMS += gmapl.nosimd


=====================================
src/distant-rna.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: distant-rna.c 222804 2020-06-03 21:58:26Z twu $";
+static char rcsid[] = "$Id: distant-rna.c 223081 2020-09-13 14:21:03Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -435,8 +435,8 @@ find_spliceends_rna_plus (List_T **distant_donors, List_T **distant_antidonors,
 	/* SENSE FORWARD */
 	splice_queryend_p = Substring_qend_trim(&splice_pos_qend,&splicetype_queryend,&ambig_prob_queryend,
 						segment_univdiagonal,/*pos5*/elt->qstart,querylength,
-						/*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,chroffset,
-						/*sensedir*/SENSE_FORWARD);
+						/*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,
+						chroffset,chrhigh,/*sensedir*/SENSE_FORWARD);
 	debug4e(printf("sense: splice_pos_qend %d with splice_queryend_p %d\n",splice_pos_qend,splice_queryend_p));
 	if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_queryend_p == true)) {
 	  /* genomic right anchor or middle anchor */
@@ -538,8 +538,8 @@ find_spliceends_rna_plus (List_T **distant_donors, List_T **distant_antidonors,
 	/* SENSE ANTI */
 	splice_queryend_p = Substring_qend_trim(&splice_pos_qend,&splicetype_queryend,&ambig_prob_queryend,
 						segment_univdiagonal,/*pos5*/elt->qstart,querylength,
-						/*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,chroffset,
-						/*sensedir*/SENSE_ANTI);
+						/*plusp*/true,genestrand,mismatch_positions_alloc,query_compress,
+						chroffset,chrhigh,/*sensedir*/SENSE_ANTI);
 	debug4e(printf("antisense: splice_pos_qend %d with splice_queryend_p %d\n",splice_pos_qend,splice_queryend_p));
 	if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_queryend_p == true)) {
 	  /* genomic right anchor or middle anchor */
@@ -953,8 +953,8 @@ find_spliceends_rna_minus (List_T **distant_donors, List_T **distant_antidonors,
 	/* SENSE ANTI */
 	splice_querystart_p = Substring_qend_trim(&splice_pos_qend,&splicetype_querystart,&ambig_prob_querystart,
 						  segment_univdiagonal,/*pos5*/elt->qstart,querylength,
-						  /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,chroffset,
-						  /*sensedir*/SENSE_ANTI);
+						  /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,
+						  chroffset,chrhigh,/*sensedir*/SENSE_ANTI);
 	debug4e(printf("antisense: splice_pos_qend %d with splice_querystart_p %d\n",splice_pos_qend,splice_querystart_p));
 	if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_querystart_p == true)) {
 	  /* genomic right anchor or middle anchor */
@@ -1057,8 +1057,8 @@ find_spliceends_rna_minus (List_T **distant_donors, List_T **distant_antidonors,
 	/* SENSE FORWARD */
 	splice_querystart_p = Substring_qend_trim(&splice_pos_qend,&splicetype_querystart,&ambig_prob_querystart,
 						  segment_univdiagonal,/*pos5*/elt->qstart,querylength,
-						  /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,chroffset,
-						  /*sensedir*/SENSE_FORWARD);
+						  /*plusp*/false,genestrand,mismatch_positions_alloc,query_compress,
+						  chroffset,chrhigh,/*sensedir*/SENSE_FORWARD);
 	debug4e(printf("sense: splice_pos_qend %d with splice_querystart_p %d\n",splice_pos_qend,splice_querystart_p));
 	if (splice_pos_qend >= 0 && (splice_pos_qend > querylength - 6 || splice_querystart_p == true)) {
 	  /* genomic right anchor or middle anchor */


=====================================
src/path-solve.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: path-solve.c 222797 2020-06-03 21:05:08Z twu $";
+static char rcsid[] = "$Id: path-solve.c 223083 2020-09-13 15:26:10Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -1455,7 +1455,7 @@ Path_new_for_qend_extension (Univcoord_T univdiagonal, int qstart, int qend, boo
    be attached.  All Path_T objects are copies of the original path */
 static List_T
 attach_qend_diagonal (T path, Univcoord_T high_univdiagonal, int high_qstart, int high_qend,
-		      Chrnum_T chrnum, Univcoord_T chroffset, int querylength,
+		      Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, int querylength,
 		      Spliceinfo_T spliceinfo, int *mismatch_positions_alloc, Compress_T query_compress,
 		      bool plusp, int genestrand, int max_mismatches_allowed, int max_insertionlen, int max_deletionlen,
 		      Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
@@ -1515,7 +1515,7 @@ attach_qend_diagonal (T path, Univcoord_T high_univdiagonal, int high_qstart, in
 				     /*univdiagonal*/high_univdiagonal,/*qstart*/high_qstart,
 				     /*qend:querylength,*/querylength,plusp,genestrand,
 				     mismatch_positions_alloc,query_compress,
-				     chroffset,sensedir);
+				     chroffset,chrhigh,sensedir);
 
   if (result < 0) {
     nspliceends = 0;
@@ -2139,7 +2139,7 @@ compute_qend_paths (int depth, bool *terminalp, T path, List_T diagonals,
       } else if ((child_paths = attach_qend_diagonal(path,/*high_univdiagonal*/diagonal->univdiagonal,
 						     /*high_qstart*/diagonal->qstart,
 						     /*high_qend*/diagonal->qend,
-						     chrnum,chroffset,querylength,
+						     chrnum,chroffset,chrhigh,querylength,
 						     spliceinfo,mismatch_positions_alloc,query_compress,
 						     plusp,genestrand,max_mismatches_allowed,
 						     max_insertionlen,max_deletionlen,intlistpool,
@@ -2323,7 +2323,7 @@ add_qend_local (bool *addedp, T path, Univcoord_T *local_diagonals, int ndiagona
 
     if ((paths = attach_qend_diagonal(path,/*high_univdiagonal*/local_diagonals[i],
 				      /*high_qstart*/pos5,high_qend,
-				      chrnum,chroffset,querylength,
+				      chrnum,chroffset,chrhigh,querylength,
 				      spliceinfo,mismatch_positions_alloc,query_compress,
 				      plusp,genestrand,max_mismatches_allowed,
 				      max_insertionlen,max_deletionlen,intlistpool,
@@ -3803,7 +3803,7 @@ Path_solve_from_diagonals (bool *foundp, int *found_score_overall, int *found_sc
 				     /*qend:middle_diagonal_qend,*/
 				     querylength,plusp,genestrand,
 				     mismatch_positions_alloc,query_compress,
-				     chroffset,/*sensedir*/SENSE_FORWARD);
+				     chroffset,chrhigh,/*sensedir*/SENSE_FORWARD);
   debug13(printf("splice3p %d, result %d\n",splice3p,result));
   
   if (result < 0) {
@@ -3891,7 +3891,7 @@ Path_solve_from_diagonals (bool *foundp, int *found_score_overall, int *found_sc
 				       /*qend:middle_diagonal_qend,*/
 				       querylength,plusp,genestrand,
 				       mismatch_positions_alloc,query_compress,
-				       chroffset,/*sensedir*/SENSE_ANTI);
+				       chroffset,chrhigh,/*sensedir*/SENSE_ANTI);
     debug13(printf("splice3p %d, result %d\n",splice3p,result));
 
     if (result < 0) {


=====================================
src/sam_sort.c
=====================================
@@ -0,0 +1,1588 @@
+static char rcsid[] = "$Id: sam_sort.c 200239 2016-11-08 01:00:30Z twu $";
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>		/* For off_t */
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bool.h"
+#include "mem.h"
+#include "access.h"
+#include "complement.h"
+#include "littleendian.h"
+#include "genomicpos.h"
+#include "chrnum.h"
+#include "samheader.h"
+#include "samread.h"
+#include "samflags.h"
+#include "stopwatch.h"
+#include "datadir.h"
+#include "filestring.h"
+#include "getopt.h"
+
+
+#define MONITOR_INTERVAL 100000
+
+
+/************************************************************************
+ *
+ *  Check for correctness:
+ *
+ *    Run ./sam_sort -d <genome> --mark-dups --mark-first --dups-only --no-sam-headers on a SAM file
+ *    Do "cut -f 1 | sort | uniq" to find all duplicate accessions
+ *    
+ *    Run the following Perl program on the original FASTA file
+ *    Do "grep '^>' | sed -e 's/>//' | sort | uniq" to use as a gold standard
+ * 
+ * use IO::File;
+ * $fastafile = $ARGV[0];
+ * 
+ * $FP = new IO::File($fastafile);
+ * while (defined($header = <$FP>) && defined($queryseq5 = <$FP>) && defined($queryseq3 = <$FP>)) {
+ *     chop $queryseq5; chop $queryseq3;
+ * 
+ *     if ($queryseq5 lt $queryseq3) {
+ *       $nseen{$queryseq5 . $queryseq3} += 1;
+ *     } else {
+ *       $nseen{$queryseq3 . $queryseq5} += 1;
+ *     }
+ * }
+ * close($FP);
+ * 
+ * $FP = new IO::File($fastafile);
+ * while (defined($header = <$FP>) && defined($queryseq5 = <$FP>) && defined($queryseq3 = <$FP>)) {
+ *     chop $queryseq5; chop $queryseq3;
+ * 
+ *     if ($queryseq5 lt $queryseq3) {
+ * 	$queryseq = $queryseq5 . $queryseq3;
+ *     } else {
+ * 	$queryseq = $queryseq3 . $queryseq5;
+ *     }
+ * 
+ *     if (($n = $nseen{$queryseq}) > 1) {
+ *  	 print $header; print $queryseq5 . "\n"; print $queryseq3 . "\n";
+ *     }
+ * }
+ * close($FP);
+ * 
+ * exit;
+ * 
+ ************************************************************************/
+
+
+#define CHUNK 1024
+typedef enum {NO_SECONDARY_SORT, ORIG_SECONDARY_SORT, ACC_SECONDARY_SORT, MATEFWD_SECONDARY_SORT, MATEREV_SECONDARY_SORT} Secondary_sort_T;
+
+#ifdef DEBUG
+#define debug(x) x
+#else
+#define debug(x)
+#endif
+
+/* Details of getting queryseqs */
+#ifdef DEBUG9
+#define debug9(x) x
+#else
+#define debug9(x)
+#endif
+
+/* Cell_binary_search */
+#ifdef DEBUG10
+#define debug10(x) x
+#else
+#define debug10(x)
+#endif
+
+/* Testing of linelen algorithm */
+#ifdef DEBUG14
+#define debug14(x) x
+#else
+#define debug14(x)
+#endif
+
+
+
+#ifdef HAVE_FSEEKO
+#define moveto(fp,offset) fseeko(fp,offset,SEEK_SET)
+#else
+#define moveto(fp,offset) fseek(fp,offset,SEEK_SET)
+#endif
+
+
+
+/* Program Options */
+static char *genomesubdir = NULL;
+static char *dbroot = NULL;
+static char *dbversion = NULL;
+static char *user_genomedir = NULL;
+
+static bool sam_headers_p = true;
+
+static bool mark_duplicates_p = false;
+static bool mark_first_p = false;
+static bool print_unique_p = true;
+static bool print_duplicates_p = true;
+static bool restore_original_order_p = false;
+
+static bool secondary_sort_method = NO_SECONDARY_SORT;
+static bool multiple_primaries_p = false;
+
+static Stopwatch_T stopwatch = NULL;
+
+static char *split_output_root = NULL;
+static bool appendp = false;
+static FILE **outputs = NULL;
+
+
+static struct option long_options[] = {
+  /* Input options */
+  {"dir", required_argument, 0, 'D'}, /* user_genomedir */
+  {"db", required_argument, 0, 'd'}, /* dbroot */
+  {"split-output", required_argument, 0, 0}, /* outputs */
+  {"append-output", no_argument, 0, 0},	     /* appendp */
+
+  {"sort2", required_argument, 0, 0}, /* secondary_sort_method */
+
+  {"mark-dups", no_argument, 0, 0}, /* mark_duplicates_p, print_unique_p, print_duplicates_p */
+  {"mark-first", no_argument, 0, 0}, /* mark_first_p */
+  {"dups-only", no_argument, 0, 0}, /* mark_duplicates_p, print_unique_p, print_duplicates_p */
+  {"uniq-only", no_argument, 0, 0}, /* mark_duplicates_p, print_unique_p, print_duplicates_p */
+  {"restore-orig-order", no_argument, 0, 0}, /* restore_original_order_p */
+  {"multiple-primaries", no_argument, 0, 0}, /* multiple_primaries_p */
+  {"no-sam-headers", no_argument, 0, 0},     /* sam_headers_p */
+
+  /* Help options */
+  {"version", no_argument, 0, '^'}, /* print_program_version */
+  {"help", no_argument, 0, '?'}, /* print_program_usage */
+  {0, 0, 0, 0}
+};
+
+
+static void
+print_program_version () {
+  fprintf(stdout,"\n");
+  fprintf(stdout,"sam_sort: sorting and duplication-marking utility for SAM files\n");
+  fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
+  fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
+  fprintf(stdout,"Contact: twu at gene.com\n");
+  fprintf(stdout,"\n");
+  return;
+}
+
+static void
+print_program_usage () {
+  fprintf(stdout,"\
+Usage: sam_sort [OPTIONS...] -d genome <sam file>\n\
+\n\
+Input options\n\
+  -D, --dir=STRING        Genome directory\n\
+  -d, --db=STRING         Genome database.  If argument is '?' (with\n\
+                            the quotes), this command lists available databases.\n\
+Output file options\n\
+  --split-output=STRING   Basename for multiple-file output, separately for nomapping,\n\
+                            halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult,\n\
+                            paired_uniq, paired_mult, concordant_uniq, and concordant_mult results\n\
+  --append-output         When --split-output is given, this flag will append output\n\
+                            to the existing files.  Otherwise, the default is to create new files.\n\
+\n\
+Other options\n\
+  --sort2=STRING          For positions with the same genomic position, sort secondarily by\n\
+                             none: no guarantee about the secondary sort (default)\n\
+                             orig: original order in the SAM output file (what samtools sort does)\n\
+                             accession: alphabetically by accession name\n\
+                             mate-fwd: by genomic position of the mate, in ascending order\n\
+                             mate-rev: by genomic position of the mate, in descending order\n\
+                          For sorting by mate genomic position, a nomapping mate is treated as genomic position 0\n\
+  --mark-dups             Mark duplicate reads by altering the flag accordingly\n\
+  --mark-first            Also mark the first occurrence of a read that has a subsequent duplicate\n\
+\n\
+  --dups-only             Print only duplicate reads\n\
+  --uniq-only             Print only unique reads\n\
+  --restore-orig-order    Restore original order of SAM file.  Useful when --mark-dups is specified\n\
+                            and sorting is not desired\n\
+  --multiple-primaries    Specify if GSNAP or GMAP was run with the --multiple-primaries flag\n\
+  --no-sam-headers        Do not print SAM header lines\n\
+");
+  return;
+}
+
+
+static char complCode[128] = COMPLEMENT_LC;
+
+static void
+make_complement_inplace (char *sequence, unsigned int length) {
+  char temp;
+  unsigned int i, j;
+
+  for (i = 0, j = length-1; i < length/2; i++, j--) {
+    temp = complCode[(int) sequence[i]];
+    sequence[i] = complCode[(int) sequence[j]];
+    sequence[j] = temp;
+  }
+  if (i == j) {
+    sequence[i] = complCode[(int) sequence[i]];
+  }
+
+  return;
+}
+
+
+#define T Cell_T
+typedef struct T *T;
+struct T {
+  unsigned int flag;
+  SAM_split_output_type split_output;
+  bool low_read_p;
+
+  Chrnum_T chrnum;
+  Univcoord_T genomicpos;
+  Univcoord_T genomicpos_extend_softclip;
+  Univcoord_T mate_genomicpos;
+  int filei;
+  size_t linestart;
+  int linelen;
+
+  char *acc;			/* Needed for ACC_SECONDARY_SORT */
+
+  int lineindex;		/* Original line order */
+  int readindex;		/* inputi or outputi.  Grouped by accession.  Needed for marking duplicates to find the other queryseq */
+  char *queryseq5;
+  char *queryseq3;
+
+  char *queryseq_alpha1;	/* Pointers to queryseq5 and queryseq3.  Needed for handling 5/3 swaps. */
+  char *queryseq_alpha2;
+
+};
+
+
+static void
+Cell_standardize_queryseqs (T this) {
+
+  if (this->queryseq5 == NULL && this->queryseq3 == NULL) {
+    this->queryseq_alpha1 = this->queryseq_alpha2 = NULL;
+  } else if (this->queryseq5 == NULL) {
+    this->queryseq_alpha1 = this->queryseq3;
+    this->queryseq_alpha2 = NULL;
+  } else if (this->queryseq3 == NULL) {
+    this->queryseq_alpha1 = this->queryseq5;
+    this->queryseq_alpha2 = NULL;
+  } else if (strcmp(this->queryseq5,this->queryseq3) < 0) {
+    this->queryseq_alpha1 = this->queryseq5;
+    this->queryseq_alpha2 = this->queryseq3;
+  } else {
+    this->queryseq_alpha1 = this->queryseq3;
+    this->queryseq_alpha2 = this->queryseq5;
+  }
+
+  return;
+}
+    
+
+/* initial_softclip needs to be determined only if we are marking duplicates */
+static void
+Cell_fill (struct T *this, int lineindex, int readindex, unsigned int flag, SAM_split_output_type split_output,
+	   bool query_lowp, int initial_softclip, Univcoord_T genomicpos,
+	   int filei, size_t fileposition, int linelen) {
+
+  this->lineindex = lineindex;
+  this->readindex = readindex;
+
+  this->flag = flag;
+  this->split_output = split_output;
+  this->low_read_p = query_lowp;
+
+  this->genomicpos = genomicpos;
+  this->genomicpos_extend_softclip = this->genomicpos - initial_softclip;
+
+  this->filei = filei;
+  this->linestart = fileposition;
+  this->linelen = linelen;
+
+  this->queryseq5 = (char *) NULL;
+  this->queryseq3 = (char *) NULL;
+
+  return;
+}
+
+/* initial_softclip needs to be determined only if we are marking duplicates */
+static void
+Cell_fill_nodups (struct T *this, unsigned int flag, SAM_split_output_type split_output,
+		  Univcoord_T genomicpos, int filei, size_t fileposition, int linelen) {
+
+  this->lineindex = 0;
+  this->readindex = 0;
+
+  this->flag = flag;
+  this->split_output = split_output;
+  this->low_read_p = true;
+
+  this->genomicpos = genomicpos;
+  this->genomicpos_extend_softclip = genomicpos;
+
+  this->filei = filei;
+  this->linestart = fileposition;
+  this->linelen = linelen;
+
+  this->queryseq5 = (char *) NULL;
+  this->queryseq3 = (char *) NULL;
+
+  return;
+}
+
+#if 0
+static void
+print_fromfile (FILE *fp, size_t fileposition, int linelength) {
+  char buffer[CHUNK];
+
+  moveto(fp,fileposition);
+
+  while (linelength > CHUNK) {
+    fread(buffer,sizeof(char),CHUNK,fp);
+    fwrite(buffer,sizeof(char),CHUNK,stdout);
+    linelength -= CHUNK;
+  }
+  if (linelength > 0) {
+    fread(buffer,sizeof(char),linelength,fp);
+    fwrite(buffer,sizeof(char),linelength,stdout);
+  }
+
+  return;
+}
+#endif
+
+
+
+static void
+Cell_print_fromfile (FILE *fp_input, T this, Filestring_T headers) {
+  char buffer[CHUNK];
+  int linelength = this->linelen;
+  FILE *fp_output;
+
+#if 0
+  if (nofailsp == true && this->split_output == OUTPUT_NM) {
+    /* Skip */
+    return;
+
+  } else if (failsonlyp == true && this->split_output != OUTPUT_NM &&
+	     this->split_output != OUTPUT_HX && this->split_output != OUTPUT_UX &&
+	     this->split_output != OUTPUT_PX && this->split_output != OUTPUT_CX) {
+    return;
+  }
+
+  if (failedinput_root != NULL && primaryp(this->flag) == true) {
+    /* Convert SAM line to FASTA or FASTQ and write to a failedinput file */
+  }
+#endif
+
+  if (split_output_root == NULL) {
+    if ((fp_output = outputs[0]) == NULL) {
+      fp_output = outputs[0] = stdout;
+      Filestring_print(fp_output,headers);
+    }
+
+  } else {
+    if ((fp_output = outputs[this->split_output]) == NULL) {
+      fp_output = outputs[this->split_output] = SAM_header_open_file(this->split_output,split_output_root,/*appendp*/false);
+      Filestring_print(fp_output,headers);
+    }
+  }
+
+  moveto(fp_input,this->linestart);
+
+#ifdef DEBUG
+  printf("readindex %d: ",this->readindex);
+#endif
+
+  while (linelength > CHUNK) {
+    fread(buffer,sizeof(char),CHUNK,fp_input);
+    fwrite(buffer,sizeof(char),CHUNK,fp_output);
+    linelength -= CHUNK;
+  }
+  if (linelength > 0) {
+    fread(buffer,sizeof(char),linelength,fp_input);
+    fwrite(buffer,sizeof(char),linelength,fp_output);
+  }
+
+  return;
+}
+
+
+
+static int
+Cell_lineindex_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+
+  if (x->lineindex < y->lineindex) {
+    return -1;
+  } else if (y->lineindex < x->lineindex) {
+    return +1;
+  } else {
+    return 0;
+  }
+}
+
+static int
+Cell_genomicpos_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+
+  if (x->genomicpos != 0 && y->genomicpos == 0) {
+    return -1;
+  } else if (y->genomicpos != 0 && x->genomicpos == 0) {
+    return +1;
+  } else if (x->genomicpos < y->genomicpos) {
+    return -1;
+  } else if (y->genomicpos < x->genomicpos) {
+    return +1;
+  } else {
+    return 0;
+  }
+}
+
+static int
+Cell_genomicpos_linestart_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+
+  if (x->genomicpos != 0 && y->genomicpos == 0) {
+    return -1;
+  } else if (y->genomicpos != 0 && x->genomicpos == 0) {
+    return +1;
+  } else if (x->genomicpos < y->genomicpos) {
+    return -1;
+  } else if (y->genomicpos < x->genomicpos) {
+    return +1;
+  } else if (x->linestart < y->linestart) {
+    return -1;
+  } else if (y->linestart < x->linestart) {
+    return +1;
+  } else {
+    return 0;
+  }
+}
+
+static int
+Cell_genomicpos_extend_softclip_lowhigh_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+
+  if (x->genomicpos_extend_softclip != 0 && y->genomicpos_extend_softclip == 0) {
+    return -1;
+  } else if (y->genomicpos_extend_softclip != 0 && x->genomicpos_extend_softclip == 0) {
+    return +1;
+  } else if (x->genomicpos_extend_softclip < y->genomicpos_extend_softclip) {
+    return -1;
+  } else if (y->genomicpos_extend_softclip < x->genomicpos_extend_softclip) {
+    return +1;
+
+  } else if (x->low_read_p == true && y->low_read_p == false) {
+    return -1;
+  } else if (y->low_read_p == true && x->low_read_p == false) {
+    return +1;
+  } else {
+    return 0;
+  }
+}
+
+static int
+Cell_accession_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+
+  return strcmp(x->acc,y->acc);
+}
+
+static int
+Cell_matefwd_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+
+  if (x->mate_genomicpos < y->mate_genomicpos) {
+    return -1;
+  } else if (y->mate_genomicpos < x->mate_genomicpos) {
+    return +1;
+  } else {
+    return 0;
+  }
+}
+
+static int
+Cell_materev_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+
+  if (x->mate_genomicpos > y->mate_genomicpos) {
+    return -1;
+  } else if (y->mate_genomicpos > x->mate_genomicpos) {
+    return +1;
+  } else {
+    return 0;
+  }
+}
+
+
+static int
+Cell_queryseq_cmp (const void *a, const void *b) {
+  T x = * (T *) a;
+  T y = * (T *) b;
+  int cmp;
+
+  if (x->queryseq_alpha1 != NULL && y->queryseq_alpha1 == NULL) {
+    return -1;
+  } else if (y->queryseq_alpha1 == NULL && x->queryseq_alpha1 != NULL) {
+    return +1;
+  } else if (x->queryseq_alpha1 == NULL && y->queryseq_alpha1 == NULL) {
+    return 0;
+  } else if ((cmp = strcmp(x->queryseq_alpha1,y->queryseq_alpha1)) != 0) {
+    return cmp;
+  } else if (x->queryseq_alpha2 != NULL && y->queryseq_alpha2 == NULL) {
+    return -1;
+  } else if (y->queryseq_alpha2 == NULL && x->queryseq_alpha2 != NULL) {
+    return +1;
+  } else if (x->queryseq_alpha2 == NULL && y->queryseq_alpha2 == NULL) {
+    return 0;
+  } else {
+    return strcmp(x->queryseq_alpha2,y->queryseq_alpha2);
+  }
+}
+
+
+#if 0
+static int
+Cell_binary_search (int lowi, int highi, T *cells, Univcoord_T goal) {
+  int middlei;
+
+  debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,goal));
+
+  while (lowi < highi) {
+    middlei = lowi + ((highi - lowi) / 2);
+    debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		   lowi,cells[lowi]->genomicpos,middlei,cells[middlei]->genomicpos,
+		   highi,cells[highi]->genomicpos,goal));
+    if (goal < cells[middlei]->genomicpos) {
+      highi = middlei;
+    } else if (goal > cells[middlei]->genomicpos) {
+      lowi = middlei + 1;
+    } else {
+      debug10(printf("binary search returns %d\n",middlei));
+      /* Rewind to first cell having the goal */
+      while (middlei - 1 >= lowi && cells[middlei - 1]->genomicpos == goal) {
+	middlei--;
+      }
+      return middlei;
+    }
+  }
+
+  debug10(printf("binary search returns %d\n",highi));
+  return highi;
+}
+#endif
+
+#if 0
+static int
+Cell_find (int lowi, int highi, T *cells, Univcoord_T goal, int readindex) {
+  int i;
+
+  i = Cell_binary_search(lowi,highi,cells,goal);
+  while (i < highi && cells[i]->genomicpos == goal) {
+    if (cells[i]->readindex == readindex) {
+      return i;
+    } else {
+      i++;
+    }
+  }
+
+  fprintf(stderr,"Cannot find cell in %d..%d with genomicpos %u and readindex %d\n",
+	  lowi,highi,goal,readindex);
+  return -1;
+}
+#endif
+
+
+static void
+process_without_dups (FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
+		      Intlist_T linelengths, int ncells_total, Univ_IIT_T chromosome_iit,
+		      Univcoord_T *chroffsets, Filestring_T headers) {
+  T *cells;
+  FILE *fp_sam;
+  int filei, linei;
+  int n_mappers = 0, n_nomappers = 0;
+  Intlist_T l;
+  struct T *cells_allocated, *ptr;
+  int b, i, j, k;
+
+  size_t fileposition;
+  int linelen;
+  unsigned int flag;
+  SAM_split_output_type split_output;
+  Univcoord_T genomicpos;
+
+  int acclength;
+
+  ptr = cells_allocated = (struct T *) MALLOC(ncells_total * sizeof(struct T));
+  cells = (T *) MALLOC(ncells_total * sizeof(T));
+  for (i = 0; i < ncells_total; i++) {
+    cells[i] = &(ptr[i]);
+  }
+
+  fprintf(stderr,"Reading SAM files...\n");
+
+  k = 0;
+  l = linelengths;
+  for (filei = 0; filei < ninputs; filei++) {
+    fprintf(stderr,"  Reading file %d...",filei+1);
+    fp_sam = sam_inputs[filei];
+    fileposition = headerlengths[filei];
+    for (linei = 0; linei < ncells[filei]; linei++) {
+      linelen = Intlist_head(l);
+      moveto(fp_sam,fileposition);
+      genomicpos = Samread_parse_genomicpos_fromfile(fp_sam,&flag,&split_output,
+						     chromosome_iit,chroffsets,linelen);
+      Cell_fill_nodups(cells[k++],flag,split_output,genomicpos,filei,fileposition,linelen);
+      if (flag & QUERY_UNMAPPED) {
+	n_nomappers++;
+      } else {
+	n_mappers++;
+      }
+      fileposition += linelen;
+      l = Intlist_next(l);
+    }
+
+    fprintf(stderr,"done\n");
+  }
+
+  /* Sort and print */
+  if (secondary_sort_method == NO_SECONDARY_SORT) {
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Sorting entries by genomicpos...");
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Printing entries...");
+    for (b = 0; b + MONITOR_INTERVAL < ncells_total; b += MONITOR_INTERVAL) {
+      fprintf(stderr,"Printed %d entries\n",b);
+      for (i = 0; i < MONITOR_INTERVAL; i++) {
+	k = b + i;
+	debug(printf("%u\t%u\t%d\n",cells[k]->genomicpos,cells[k]->linestart,cells[k]->linelen));
+	fp_sam = sam_inputs[cells[k]->filei];
+	Cell_print_fromfile(fp_sam,cells[k],headers);
+      }
+    }
+    for (k = b; k < ncells_total; k++) {
+      debug(printf("%u\t%u\t%d\n",cells[k]->genomicpos,cells[k]->linestart,cells[k]->linelen));
+      fp_sam = sam_inputs[cells[k]->filei];
+      Cell_print_fromfile(fp_sam,cells[k],headers);
+    }
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+  } else if (secondary_sort_method == ORIG_SECONDARY_SORT) {
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Sorting entries by genomicpos and original file position...");
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_linestart_cmp);
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Printing entries...");
+    for (b = 0; b + MONITOR_INTERVAL < ncells_total; b += MONITOR_INTERVAL) {
+      fprintf(stderr,"Printed %d entries\n",b);
+      for (i = 0; i < MONITOR_INTERVAL; i++) {
+	k = b + i;
+	fp_sam = sam_inputs[cells[k]->filei];
+	Cell_print_fromfile(fp_sam,cells[k],headers);
+      }
+    }
+    for (k = b; k < ncells_total; k++) {
+      fp_sam = sam_inputs[cells[k]->filei];
+      Cell_print_fromfile(fp_sam,cells[k],headers);
+    }
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+  } else if (secondary_sort_method == ACC_SECONDARY_SORT) {
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Sorting entries by genomicpos...");
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Subsorting by accession and printing entries...");
+    i = 0;
+    while (i < n_mappers) {
+      j = i + 1;
+      while (j < n_mappers && cells[j]->genomicpos == cells[i]->genomicpos) {
+	j++;
+      }
+      
+      if (j > i + 1) {
+	for (k = i; k < j; k++) {
+	  fp_sam = sam_inputs[cells[k]->filei];
+	  moveto(fp_sam,cells[k]->linestart);
+	  cells[k]->acc = Samread_get_acc_fromfile(&acclength,fp_sam,cells[k]->linelen);
+	}
+	
+	qsort(&(cells[i]),j - i,sizeof(T),Cell_accession_cmp);
+	for (k = i; k < j; k++) {
+	  FREE(cells[k]->acc);
+	}
+      }
+      
+      for (k = i; k < j; k++) {
+	fp_sam = sam_inputs[cells[k]->filei];
+	Cell_print_fromfile(fp_sam,cells[k],headers);
+      }
+      
+      i = j;
+    }
+    
+    if (ncells_total > n_mappers + 1) {
+      for (k = n_mappers; k < ncells_total; k++) {
+	fp_sam = sam_inputs[cells[k]->filei];
+	moveto(fp_sam,cells[k]->linestart);
+	cells[k]->acc = Samread_get_acc_fromfile(&acclength,fp_sam,cells[k]->linelen);
+      }
+	
+      qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_accession_cmp);
+      for (k = n_mappers; k < ncells_total; k++) {
+	FREE(cells[k]->acc);
+      }
+    }
+      
+    for (k = n_mappers; k < ncells_total; k++) {
+      fp_sam = sam_inputs[cells[k]->filei];
+      Cell_print_fromfile(fp_sam,cells[k],headers);
+    }
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+  } else if (secondary_sort_method == MATEFWD_SECONDARY_SORT || secondary_sort_method == MATEREV_SECONDARY_SORT) {
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Sorting entries by genomicpos...");
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+    Stopwatch_start(stopwatch);
+    fprintf(stderr,"Subsorting by mate position and printing entries...");
+    i = 0;
+    while (i < n_mappers) {
+      j = i + 1;
+      while (j < n_mappers && cells[j]->genomicpos == cells[i]->genomicpos) {
+	j++;
+      }
+      
+      if (j > i + 1) {
+	for (k = i; k < j; k++) {
+	  moveto(fp_sam,cells[k]->linestart);
+	  cells[k]->mate_genomicpos = Samread_parse_mate_genomicpos_fromfile(fp_sam,chromosome_iit,chroffsets,cells[k]->linelen);
+	}
+	
+	if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
+	  qsort(&(cells[i]),j - i,sizeof(T),Cell_matefwd_cmp);
+	} else {
+	  qsort(&(cells[i]),j - i,sizeof(T),Cell_materev_cmp);
+	}
+      }
+      
+      for (k = i; k < j; k++) {
+	fp_sam = sam_inputs[cells[k]->filei];
+	Cell_print_fromfile(fp_sam,cells[k],headers);
+      }
+
+      i = j;
+    }
+      
+    if (ncells_total > n_mappers + 1) {
+      for (k = n_mappers; k < ncells_total; k++) {
+	moveto(fp_sam,cells[k]->linestart);
+	cells[k]->mate_genomicpos = Samread_parse_mate_genomicpos_fromfile(fp_sam,chromosome_iit,chroffsets,cells[k]->linelen);
+      }
+      
+      if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
+	qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_matefwd_cmp);
+      } else {
+	qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_materev_cmp);
+      }
+    }
+      
+    for (k = n_mappers; k < ncells_total; k++) {
+      fp_sam = sam_inputs[cells[k]->filei];
+      Cell_print_fromfile(fp_sam,cells[k],headers);
+    }
+    fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+  } else {
+    fprintf(stderr,"Secondary sort method not recognized\n");
+    abort();
+  }
+
+  FREE(cells);
+  FREE(cells_allocated);
+
+  return;
+}
+
+
+static int
+process_with_dups (FILE **sam_inputs, int *headerlengths, int *ncells, int ninputs,
+		   Intlist_T linelengths, int ncells_total, Univ_IIT_T chromosome_iit,
+		   Univcoord_T *chroffsets, Filestring_T headers) {
+  FILE *fp_sam;
+  int filei, linei;
+  int nmarked = 0;
+  int n_mappers = 0, n_nomappers = 0;
+  T *cells, mate;
+  struct T *cells_allocated, *ptr;
+  int *queryseq5_index, *queryseq3_index;
+  int i, k;
+  int j, j_low, j_high, mate_allocated;
+
+  size_t fileposition;
+  int linelen;
+  Univcoord_T genomicpos;
+  char *hiti;
+
+  Intlist_T l;
+  unsigned int flag;
+  SAM_split_output_type split_output;
+  int initial_softclip;
+  char *acc, *last_acc, *read;
+  int lineindex, readindex, nreads;
+  int acclength, last_acclength, readlength;
+  bool query_lowp;
+  bool *duplicatep = NULL;
+  bool all_duplicates_p;
+
+
+
+  /* Actually, array lengths should be nreads, but we don't know that yet */
+  queryseq5_index = (int *) CALLOC(ncells_total,sizeof(int));
+  queryseq3_index = (int *) CALLOC(ncells_total,sizeof(int));
+
+  ptr = cells_allocated = (struct T *) MALLOC(ncells_total * sizeof(struct T));
+  cells = (T *) MALLOC(ncells_total * sizeof(T));
+  for (i = 0; i < ncells_total; i++) {
+    cells[i] = &(ptr[i]);
+  }
+    
+
+  last_acc = MALLOC(sizeof(char));
+  last_acc[0] = '\0';
+  last_acclength = 0;
+  lineindex = -1;
+  readindex = -1;		/* readindex is 0-based */
+
+  fprintf(stderr,"Reading SAM files...\n");
+
+  k = 0;
+  l = linelengths;
+  for (filei = 0; filei < ninputs; filei++) {
+    fprintf(stderr,"  Reading file %d...",filei+1);
+    fp_sam = sam_inputs[filei];
+    fileposition = headerlengths[filei];
+    for (linei = 0; linei < ncells[filei]; linei++) {
+      linelen = Intlist_head(l);
+      moveto(fp_sam,fileposition);
+      acc = Samread_parse_acc_and_softclip_fromfile(&acclength,&flag,&split_output,&hiti,
+						    &genomicpos,&initial_softclip,&query_lowp,
+						    fp_sam,chromosome_iit,chroffsets,linelen);
+      lineindex++;
+      if (acclength != last_acclength) {
+	readindex++;
+      } else if (strcmp(acc,last_acc)) {
+	readindex++;
+      }
+      FREE(last_acc);
+      last_acc = acc;
+      last_acclength = acclength;
+
+      if (flag & QUERY_UNMAPPED) {
+	n_nomappers++;
+      } else {
+	n_mappers++;
+      }
+
+      /* debug(printf("Read readindex %d, chrnum %d, chrpos %u, linelen %d\n",readindex,chrnum,chrpos,linelen)); */
+      if (flag & NOT_PRIMARY) {
+      /* Don't use secondary hit for accessing reads */
+
+      } else if (multiple_primaries_p == true) {
+#if 0
+	/* Now always parsed */
+	hiti = Samread_parse_aux_fromfile(fp_sam,/*auxfield*/"HI",linelen);
+#endif
+	if (strcmp(hiti,"1")) {
+	  /* Don't use second or later primary hit for accessing reads */
+	} else if (flag & FIRST_READ_P) {
+	  queryseq5_index[readindex] = k;
+	} else {
+	  queryseq3_index[readindex] = k;
+	}
+	
+      } else {
+	if (flag & FIRST_READ_P) {
+	  queryseq5_index[readindex] = k;
+	} else {
+	  queryseq3_index[readindex] = k;
+	}
+      }
+      
+      FREE(hiti);
+      Cell_fill(cells[k++],lineindex,readindex,flag,split_output,query_lowp,initial_softclip,genomicpos,filei,fileposition,linelen);
+
+      fileposition += linelen;
+      l = Intlist_next(l);
+    }
+
+    fprintf(stderr,"done\n");
+  }
+  FREE(last_acc);
+
+  nreads = readindex + 1;
+  duplicatep = (bool *) CALLOC(nreads,sizeof(bool));
+  
+  /* Sort entries, based on genomicpos_extend_softclip */
+  Stopwatch_start(stopwatch);
+  fprintf(stderr,"Sorting SAM lines...");
+  qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_extend_softclip_lowhigh_cmp);
+  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+  /* Mark all duplicates within mappers, based on genomicpos_extend_softclip */
+  Stopwatch_start(stopwatch);
+  fprintf(stderr,"Finding duplicates...");
+
+  i = 0;
+  while (i < n_mappers) {
+    j_low = i + 1;
+    while (j_low < n_mappers && cells[j_low]->genomicpos_extend_softclip == cells[i]->genomicpos_extend_softclip && 
+	   cells[j_low]->low_read_p == true) {
+      j_low++;
+    }
+
+    j_high = j_low;
+    while (j_high < n_mappers && cells[j_high]->genomicpos_extend_softclip == cells[i]->genomicpos_extend_softclip) {
+      j_high++;
+    }
+
+    if (j_low > i + 1) {
+      debug(printf("\nFound multiple low hits from %d to (%d - 1) at genomicpos_extend_softclip %u\n",
+		   i,j_low,cells[i]->genomicpos_extend_softclip));
+	  
+      /* Multiple low hits with same chrpos, so opportunity to mark duplicatep */
+      /* Find queryseqs for each */
+      for (k = i; k < j_low; k++) {
+	fp_sam = sam_inputs[cells[k]->filei];
+	debug9(printf("Looking for queryseqs for "));
+	debug9(Cell_print_fromfile(fp_sam,cells[k],headers));
+
+	if (cells[k]->flag & FIRST_READ_P) {
+	  debug9(printf("Flag for entry %d is %u, indicating a first read\n",k,cells[k]->flag));
+	  moveto(fp_sam,cells[k]->linestart);
+	  Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+	  if (cells[k]->flag & QUERY_MINUSP) {
+	    debug(printf("complementing queryseq5\n"));
+	    make_complement_inplace(read,readlength);
+	  }
+	  cells[k]->queryseq5 = read;
+	  debug9(printf("queryseq5 is %s\n",read));
+	    
+	  mate_allocated = queryseq3_index[cells[k]->readindex];
+	  mate = &(cells_allocated[mate_allocated]);
+
+	  debug9(printf("Mate is "));
+	  debug9(Cell_print_fromfile(fp_sam,mate,headers));
+	  moveto(fp_sam,mate->linestart);
+	  Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+	  if (mate->flag & QUERY_MINUSP) {
+	    debug(printf("complementing queryseq3\n"));
+	    make_complement_inplace(read,readlength);
+	  }
+	  cells[k]->queryseq3 = read;
+	  debug9(printf("queryseq3 is %s\n",read));
+
+	} else {
+	  debug9(printf("Flag for entry %d is %u, indicating a second read\n",k,cells[k]->flag));
+	  moveto(fp_sam,cells[k]->linestart);
+	  Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+	  if (cells[k]->flag & QUERY_MINUSP) {
+	    debug(printf("complementing queryseq3\n"));
+	    make_complement_inplace(read,readlength);
+	  }
+	  cells[k]->queryseq3 = read;
+	  debug9(printf("queryseq3 is %s\n",read));
+
+	  mate_allocated = queryseq5_index[cells[k]->readindex];
+	  mate = &(cells_allocated[mate_allocated]);
+	  debug9(printf("Mate is "));
+	  debug9(Cell_print_fromfile(fp_sam,mate,headers));
+	  moveto(fp_sam,mate->linestart);
+	  Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+	  if (mate->flag & QUERY_MINUSP) {
+	    debug(printf("complementing queryseq5\n"));
+	    make_complement_inplace(read,readlength);
+	  }
+	  cells[k]->queryseq5 = read;
+	  debug9(printf("queryseq5 is %s\n",read));
+	}
+	  
+	Cell_standardize_queryseqs(cells[k]);
+      }
+
+      qsort(&(cells[i]),j_low - i,sizeof(T),Cell_queryseq_cmp);
+
+      for (k = i + 1; k < j_low; k++) {
+	debug(printf("Comparing cell %d with %d => cmp %d\n",k,k-1,Cell_queryseq_cmp(&(cells[k]),&(cells[k-1]))));
+	debug(printf("  %s %s\n",cells[k-1]->queryseq_alpha1,cells[k-1]->queryseq_alpha2));
+	debug(printf("  %s %s\n",cells[k]->queryseq_alpha1,cells[k]->queryseq_alpha2));
+	if (Cell_queryseq_cmp(&(cells[k]),&(cells[k-1])) == 0) {
+	  readindex = cells[k]->readindex;
+	  duplicatep[readindex] = true;
+	  if (mark_first_p == true) {
+	    readindex = cells[k-1]->readindex;
+	    duplicatep[readindex] = true;
+	  }
+	}
+      }
+
+      for (k = i; k < j_low; k++) {
+	FREE(cells[k]->queryseq5);
+	FREE(cells[k]->queryseq3);
+      }
+    }
+
+
+    /* Also analyze high ends, to avoid a false negative when the initial_softclip extension is wrong
+       due to an end indel */
+    if (j_high > j_low + 1) {
+      all_duplicates_p = true;
+      for (k = j_low; k < j_high; k++) {
+	readindex = cells[k]->readindex;
+	if (duplicatep[readindex] == false) {
+	  all_duplicates_p = false;
+	}
+      }
+
+      if (all_duplicates_p == false) {
+	debug(printf("\nFound multiple high hits from %d to (%d - 1) at genomicpos_extend_softclip %u\n",
+		     j_low,j_high,cells[j_low]->genomicpos_extend_softclip));
+	
+	/* Multiple high hits with same chrpos, so opportunity to mark duplicatep */
+        /* Find queryseqs for each */
+	for (k = j_low; k < j_high; k++) {
+	  fp_sam = sam_inputs[cells[k]->filei];
+	  debug9(printf("Looking for queryseqs for "));
+	  debug9(Cell_print_fromfile(fp_sam,cells[k],headers));
+
+	  if (cells[k]->flag & FIRST_READ_P) {
+	    debug9(printf("Flag for entry %d is %u, indicating a first read\n",k,cells[k]->flag));
+	    moveto(fp_sam,cells[k]->linestart);
+	    Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+	    if (cells[k]->flag & QUERY_MINUSP) {
+	      debug(printf("complementing queryseq5\n"));
+	      make_complement_inplace(read,readlength);
+	    }
+	    cells[k]->queryseq5 = read;
+	    debug9(printf("queryseq5 is %s\n",read));
+	    
+	    mate_allocated = queryseq3_index[cells[k]->readindex];
+	    mate = &(cells_allocated[mate_allocated]);
+	    debug9(printf("Mate is "));
+	    debug9(Cell_print_fromfile(fp_sam,mate,headers));
+	    moveto(fp_sam,mate->linestart);
+	    Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+	    if (mate->flag & QUERY_MINUSP) {
+	      debug(printf("complementing queryseq3\n"));
+	      make_complement_inplace(read,readlength);
+	    }
+	    cells[k]->queryseq3 = read;
+	    debug9(printf("queryseq3 is %s\n",read));
+
+	  } else {
+	    debug9(printf("Flag for entry %d is %u, indicating a second read\n",k,cells[k]->flag));
+	    moveto(fp_sam,cells[k]->linestart);
+	    Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+	    if (cells[k]->flag & QUERY_MINUSP) {
+	      debug(printf("complementing queryseq3\n"));
+	      make_complement_inplace(read,readlength);
+	    }
+	    cells[k]->queryseq3 = read;
+	    debug9(printf("queryseq3 is %s\n",read));
+
+	    mate_allocated = queryseq5_index[cells[k]->readindex];
+	    mate = &(cells_allocated[mate_allocated]);
+	    debug9(printf("Mate is "));
+	    debug9(Cell_print_fromfile(fp_sam,mate,headers));
+	    moveto(fp_sam,mate->linestart);
+	    Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+	    if (mate->flag & QUERY_MINUSP) {
+	      debug(printf("complementing queryseq5\n"));
+	      make_complement_inplace(read,readlength);
+	    }
+	    cells[k]->queryseq5 = read;
+	    debug9(printf("queryseq5 is %s\n",read));
+	  }
+	  
+	  Cell_standardize_queryseqs(cells[k]);
+	}
+
+	qsort(&(cells[j_low]),j_high - j_low,sizeof(T),Cell_queryseq_cmp);
+
+	for (k = j_low + 1; k < j_high; k++) {
+	  debug(printf("Comparing cell %d with %d => cmp %d\n",k,k-1,Cell_queryseq_cmp(&(cells[k]),&(cells[k-1]))));
+	  debug(printf("  %s %s\n",cells[k-1]->queryseq_alpha1,cells[k-1]->queryseq_alpha2));
+	  debug(printf("  %s %s\n",cells[k]->queryseq_alpha1,cells[k]->queryseq_alpha2));
+	  if (Cell_queryseq_cmp(&(cells[k]),&(cells[k-1])) == 0) {
+	    readindex = cells[k]->readindex;
+	    duplicatep[readindex] = true;
+	    if (mark_first_p == true) {
+	      readindex = cells[k-1]->readindex;
+	      duplicatep[readindex] = true;
+	    }
+	  }
+	}
+
+	for (k = j_low; k < j_high; k++) {
+	  FREE(cells[k]->queryseq5);
+	  FREE(cells[k]->queryseq3);
+	}
+      }
+    }
+
+    i = j_high;
+  }
+
+  /* Mark all duplicates within nomappers, based on queryseq */
+  for (k = n_mappers; k < ncells_total; k++) {
+    if (duplicatep[cells[k]->readindex] == true) {
+      cells[k]->queryseq5 = cells[k]->queryseq3 = NULL; /* Will be sorted to end of list */
+
+    } else if (cells[k]->flag & FIRST_READ_P) {
+      fp_sam = sam_inputs[cells[k]->filei];
+      moveto(fp_sam,cells[k]->linestart);
+      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+      if (cells[k]->flag & QUERY_MINUSP) {
+	make_complement_inplace(read,readlength);
+      }
+      cells[k]->queryseq5 = read;
+	    
+      mate_allocated = queryseq3_index[cells[k]->readindex];
+      mate = &(cells_allocated[mate_allocated]);
+      moveto(fp_sam,mate->linestart);
+      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+      if (mate->flag & QUERY_MINUSP) {
+	make_complement_inplace(read,readlength);
+      }
+      cells[k]->queryseq3 = read;
+
+      Cell_standardize_queryseqs(cells[k]);
+
+    } else {
+      fp_sam = sam_inputs[cells[k]->filei];
+      moveto(fp_sam,cells[k]->linestart);
+      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,cells[k]->linelen);
+      if (cells[k]->flag & QUERY_MINUSP) {
+	make_complement_inplace(read,readlength);
+      }
+      cells[k]->queryseq3 = read;
+      
+      mate_allocated = queryseq5_index[cells[k]->readindex];
+      mate = &(cells_allocated[mate_allocated]);
+      moveto(fp_sam,mate->linestart);
+      Samread_parse_read_fromfile(fp_sam,&flag,&readlength,&read,mate->linelen);
+      if (mate->flag & QUERY_MINUSP) {
+	make_complement_inplace(read,readlength);
+      }
+      cells[k]->queryseq5 = read;
+	  
+      Cell_standardize_queryseqs(cells[k]);
+    }
+  }
+
+  FREE(queryseq3_index);
+  FREE(queryseq5_index);
+
+
+  /* Sort non-mapping entries based on queryseqs */
+  qsort(&(cells[n_mappers]),n_nomappers,sizeof(T),Cell_queryseq_cmp);
+
+  for (k = n_mappers + 1; k < ncells_total; k++) {
+    debug(printf("Comparing cell %d with %d => cmp %d\n",k,k-1,Cell_queryseq_cmp(&(cells[k]),&(cells[k-1]))));
+    if (Cell_queryseq_cmp(&(cells[k]),&(cells[k-1])) == 0) {
+      readindex = cells[k]->readindex;
+      duplicatep[readindex] = true;
+      if (mark_first_p == true) {
+	readindex = cells[k-1]->readindex;
+	duplicatep[readindex] = true;
+      }
+    }
+  }
+
+  for (k = n_mappers; k < ncells_total; k++) {
+    FREE(cells[k]->queryseq5);
+    FREE(cells[k]->queryseq3);
+  }
+  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+
+  /* Re-sort entries, based on genomicpos (not extended by initial_softclip), and secondary criterion */
+  Stopwatch_start(stopwatch);
+  fprintf(stderr,"Re-sorting entries...");
+  if (restore_original_order_p == true) {
+    qsort(cells,ncells_total,sizeof(T),Cell_lineindex_cmp);
+
+  } else if (secondary_sort_method == NO_SECONDARY_SORT) {
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+
+  } else if (secondary_sort_method == ORIG_SECONDARY_SORT) {
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_linestart_cmp);
+
+  } else if (secondary_sort_method == ACC_SECONDARY_SORT) {
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+
+    i = 0;
+    while (i < ncells_total) {
+      j = i + 1;
+      while (j < ncells_total && cells[j]->genomicpos == cells[i]->genomicpos) {
+	j++;
+      }
+
+      if (j > i + 1) {
+	for (k = i; k < j; k++) {
+	  fp_sam = sam_inputs[cells[k]->filei];
+	  moveto(fp_sam,cells[k]->linestart);
+	  cells[k]->acc = Samread_get_acc_fromfile(&acclength,fp_sam,cells[k]->linelen);
+	}
+	qsort(&(cells[i]),j - i,sizeof(T),Cell_accession_cmp);
+	for (k = i; k < j; k++) {
+	  FREE(cells[k]->acc);
+	}
+      }
+
+      i = j;
+    }
+
+  } else if (secondary_sort_method == MATEFWD_SECONDARY_SORT ||
+	     secondary_sort_method == MATEREV_SECONDARY_SORT) {
+    qsort(cells,ncells_total,sizeof(T),Cell_genomicpos_cmp);
+
+    i = 0;
+    while (i < ncells_total) {
+      j = i + 1;
+      while (j < ncells_total && cells[j]->genomicpos == cells[i]->genomicpos) {
+	j++;
+      }
+
+      if (j > i + 1) {
+	for (k = i; k < j; k++) {
+	  fp_sam = sam_inputs[cells[k]->filei];
+	  moveto(fp_sam,cells[k]->linestart);
+	  cells[k]->mate_genomicpos = Samread_parse_mate_genomicpos_fromfile(fp_sam,chromosome_iit,chroffsets,cells[k]->linelen);
+	}
+	if (secondary_sort_method == MATEFWD_SECONDARY_SORT) {
+	  qsort(&(cells[i]),j - i,sizeof(T),Cell_matefwd_cmp);
+	} else {
+	  qsort(&(cells[i]),j - i,sizeof(T),Cell_materev_cmp);
+	}
+      }
+
+      i = j;
+    }
+  }
+  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+  /* Print results */
+  Stopwatch_start(stopwatch);
+  fprintf(stderr,"Printing results...");
+
+  for (k = 0; k < ncells_total; k++) {
+    if (duplicatep[cells[k]->readindex] == true) {
+      if (print_duplicates_p == true) {
+	fp_sam = sam_inputs[cells[k]->filei];
+	moveto(fp_sam,cells[k]->linestart);
+	Samread_print_as_duplicate_fromfile(fp_sam,cells[k]->linelen);
+      }
+      nmarked++;
+    } else {
+      if (print_unique_p == true) {
+	/* Non-duplicate */
+	fp_sam = sam_inputs[cells[k]->filei];
+	Cell_print_fromfile(fp_sam,cells[k],headers);
+      }
+    }
+  }
+  fprintf(stderr,"done (%.1f seconds)\n",Stopwatch_stop(stopwatch));
+
+  FREE(duplicatep);
+
+  FREE(cells);
+  FREE(cells_allocated);
+
+  return nmarked;
+}
+
+
+
+#define BUFFERLEN 1024
+
+int
+main (int argc, char *argv[]) {
+  FILE **sam_inputs, *fp_sam;
+  int ninputs, filei;
+  int nchromosomes, i;
+  Univcoord_T *chroffsets;
+  Chrpos_T *chrlengths;
+  size_t fileposition;
+  int lastchar;
+
+  char buffer[BUFFERLEN], *lastp, *p;
+  Intlist_T linelengths;
+  int *headerlengths, linelen;
+  Filestring_T headers = NULL;
+#ifdef DEBUG14
+  Intlist_T linelengths_goldstd;
+  int linelen_goldstd;
+#endif
+
+  char *fileroot = NULL, *iitfile;
+  Univ_IIT_T chromosome_iit;
+  int *ncells, ncells_total, nmarked;
+
+  int opt;
+  extern int optind;
+  extern char *optarg;
+  int long_option_index = 0;
+  const char *long_name;
+
+  while ((opt = getopt_long(argc,argv,"D:d:^?",
+			    long_options,&long_option_index)) != -1) {
+    switch (opt) {
+    case 0:
+      long_name = long_options[long_option_index].name;
+      if (!strcmp(long_name,"version")) {
+	print_program_version();
+	exit(0);
+      } else if (!strcmp(long_name,"help")) {
+	print_program_usage();
+	exit(0);
+
+      } else if (!strcmp(long_name,"split-output")) {
+	split_output_root = optarg;
+      } else if (!strcmp(long_name,"append-output")) {
+	appendp = true;
+
+      } else if (!strcmp(long_name,"sort2")) {
+	if (!strcmp(optarg,"none")) {
+	  secondary_sort_method = NO_SECONDARY_SORT;
+	} else if (!strcmp(optarg,"orig")) {
+	  secondary_sort_method = ORIG_SECONDARY_SORT;
+	} else if (!strcmp(optarg,"accession")) {
+	  secondary_sort_method = ACC_SECONDARY_SORT;
+	} else if (!strcmp(optarg,"mate-fwd")) {
+	  secondary_sort_method = MATEFWD_SECONDARY_SORT;
+	} else if (!strcmp(optarg,"mate-rev")) {
+	  secondary_sort_method = MATEREV_SECONDARY_SORT;
+	} else {
+	  fprintf(stderr,"--sort2 must be none, orig, accession, mate-fwd, or mate-rev\n");
+	  exit(9);
+	}
+
+      } else if (!strcmp(long_name,"mark-dups")) {
+	mark_duplicates_p = true;
+	print_unique_p = true;
+	print_duplicates_p = true;
+
+      } else if (!strcmp(long_name,"mark-first")) {
+	mark_first_p = true;
+
+      } else if (!strcmp(long_name,"dups-only")) {
+	mark_duplicates_p = true;
+	print_unique_p = false;
+	print_duplicates_p = true;
+
+      } else if (!strcmp(long_name,"uniq-only")) {
+	mark_duplicates_p = true;
+	print_unique_p = true;
+	print_duplicates_p = false;
+
+      } else if (!strcmp(long_name,"restore-orig-order")) {
+	restore_original_order_p = true;
+
+      } else if (!strcmp(long_name,"multiple-primaries")) {
+	multiple_primaries_p = true;
+
+      } else if (!strcmp(long_name,"no-sam-headers")) {
+	sam_headers_p = false;
+
+      } else {
+	/* Shouldn't reach here */
+	fprintf(stderr,"Don't recognize option %s.  For usage, run 'get-genome --help'",long_name);
+	exit(9);
+      }
+      break;
+
+    case 'D': user_genomedir = optarg; break;
+    case 'd': 
+      dbroot = (char *) CALLOC(strlen(optarg)+1,sizeof(char));
+      strcpy(dbroot,optarg);
+      break;
+
+    case '^': print_program_version(); exit(0);
+    case '?': print_program_usage(); exit(0);
+    default: exit(9);
+    }
+  }
+  argc -= optind;
+  argv += optind;
+
+  if (dbroot == NULL) {
+    print_program_usage();
+    exit(9);
+  } else if (!strcmp(dbroot,"?")) {
+    Datadir_avail_gmap_databases(stdout,user_genomedir);
+    exit(0);
+  } else {
+    genomesubdir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_genomedir,dbroot);
+    iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
+			      strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
+    sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
+    chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false);
+    FREE(iitfile);
+
+    FREE(dbversion);
+    FREE(genomesubdir);
+    FREE(fileroot);
+    FREE(dbroot);
+
+    nchromosomes = Univ_IIT_total_nintervals(chromosome_iit);
+    chrlengths = Univ_IIT_chrlengths(chromosome_iit);
+    chroffsets = MALLOC(nchromosomes * sizeof(Univcoord_T));
+    chroffsets[0] = 0;
+    for (i = 1; i < nchromosomes; i++) {
+      chroffsets[i] = chroffsets[i-1] + chrlengths[i-1];
+    }
+    FREE(chrlengths);
+  }
+    
+  /* Open all outputs, even if --split-output is not used */
+  outputs = (FILE **) CALLOC((1+N_SPLIT_OUTPUTS),sizeof(FILE *));
+
+
+  /* Inputs */
+  ninputs = argc;
+  sam_inputs = (FILE **) CALLOC(ninputs,sizeof(FILE *));
+  headerlengths = (int *) CALLOC(ninputs,sizeof(int));
+  ncells = (int *) CALLOC(ninputs,sizeof(int));
+  for (filei = 0; filei < ninputs; filei++) {
+    if ((sam_inputs[filei] = fopen(argv[filei],"r")) == NULL) {
+      fprintf(stderr,"Cannot open SAM file %s\n",argv[filei]);
+      exit(9);
+    }
+  }
+
+  stopwatch = Stopwatch_new();
+  Stopwatch_start(stopwatch);
+  fprintf(stderr,"Analyzing %d SAM files...\n",ninputs);
+
+  linelengths = (Intlist_T) NULL;
+  ncells_total = 0;
+  for (filei = 0; filei < ninputs; filei++) {
+    fp_sam = sam_inputs[filei];
+    fileposition = headerlengths[filei] = SAM_header_length(&lastchar,fp_sam); /* Ignore lastchar */
+
+    /* Take care of char read by SAM_header_length */
+#ifdef HAVE_FSEEKO
+    fseeko(fp_sam,-1,SEEK_CUR);
+#else
+    fseek(fp_sam,-1,SEEK_CUR);
+#endif
+
+    linelen = 0;
+    ncells[filei] = 0;
+    while (fgets(buffer,BUFFERLEN,fp_sam) != NULL) {
+      /* printf("Read %s\n",buffer); */
+      lastp = buffer;
+      while ((p = index(lastp,'\n')) != NULL) {
+	linelen += (p - lastp)/sizeof(char) + 1;
+
+	linelengths = Intlist_push(linelengths,linelen);
+	fileposition += linelen;
+	ncells[filei] += 1;
+
+	linelen = 0;
+	lastp = p + 1;
+      }
+      linelen += strlen(lastp);
+      /* printf("Adding %d to get linelen %d\n",strlen(buffer),linelen); */
+    }
+
+    ncells_total += ncells[filei];
+
+    if (fileposition != Access_filesize(argv[filei])) {
+      fprintf(stderr,"Something is wrong with parsing of SAM file %s\n",argv[filei]);
+      fprintf(stderr,"Final file position using sortinfo: %llu\n",(unsigned long long) fileposition);
+      fprintf(stderr,"File size of SAM output file:       %llu\n",(unsigned long long) Access_filesize(argv[0]));
+      exit(9);
+    } else {
+      fprintf(stderr,"  File %d has %d SAM lines.\n",filei+1,ncells[filei]);
+    }
+
+  }
+
+  fprintf(stderr,"Done with analysis (%.1f seconds).  Found %d SAM lines total.\n",
+	  Stopwatch_stop(stopwatch),ncells_total);
+  Stopwatch_free(&stopwatch);
+
+  if (ncells_total == 0) {
+    /* Exit without printing header */
+
+  } else if (sam_headers_p == false) {
+    /* Don't print SAM headers */
+
+  } else {
+    moveto(sam_inputs[0],0);
+    headers = SAM_header_change_HD_tosorted(sam_inputs[0],headerlengths[0]);
+  }
+
+  linelengths = Intlist_reverse(linelengths);
+
+  if (mark_duplicates_p == false) {
+    process_without_dups(sam_inputs,headerlengths,ncells,ninputs,linelengths,ncells_total,
+			 chromosome_iit,chroffsets,headers);
+  } else {
+    nmarked = process_with_dups(sam_inputs,headerlengths,ncells,ninputs,linelengths,ncells_total,
+				chromosome_iit,chroffsets,headers);
+    fprintf(stderr,"Marked %d out of %d SAM lines as duplicates (%.1f%%)\n",
+	    nmarked,ncells_total,100.0*(double) nmarked/(double) (ncells_total));
+  }
+
+  for (filei = 0; filei < ninputs; filei++) {
+    fclose(sam_inputs[filei]);
+  }
+  FREE(sam_inputs);
+  FREE(headerlengths);
+  FREE(ncells);
+
+  if (headers != NULL) {
+    Filestring_free(&headers);
+  }
+
+  /* SAM_header_touch(outputs,split_output_root,appendp); -- Don't want to destroy other SAM files */
+  for (i = 1; i <= N_SPLIT_OUTPUTS; i++) {
+    if (outputs[i] != NULL) {
+      fclose(outputs[i]);
+    }
+  }
+  FREE(outputs);
+
+
+  Intlist_free(&linelengths);
+
+  FREE(chroffsets);
+  Univ_IIT_free(&chromosome_iit);
+
+  return 0;
+}


=====================================
src/samread.c
=====================================
@@ -0,0 +1,1890 @@
+static char rcsid[] = "$Id: samread.c 215476 2018-05-25 23:23:37Z twu $";
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>		/* For strcpy */
+#include <strings.h>		/* For rindex */
+#include <ctype.h>
+
+#include "except.h"
+#include "mem.h"
+#include "assert.h"
+#include "bool.h"
+
+#include "samread.h"
+#include "samflags.h"
+#include "chrnum.h"
+
+
+#ifdef DEBUG
+#define debug(x) x
+#else
+#define debug(x)
+#endif
+
+/* Prints location of parsing error for Unexpected output type */
+#ifdef DEBUG1
+#define debug1(x) x
+#else
+#define debug1(x)
+#endif
+
+
+static int
+cigar_string_readlength (int *hardclip_low, int *hardclip_high, char *cigar) {
+  int readlength;
+  unsigned int npos;
+  char *p, type;
+  bool firstp = true;
+
+  *hardclip_low = *hardclip_high = 0;
+  if (cigar[0] == '*') {
+    return 0;
+  } else {
+    readlength = 0;
+
+    p = cigar;
+    while (*p != '\0') {
+      if (sscanf(p,"%u",&npos) != 1) {
+	fprintf(stderr,"Unable to parse cigar %s.  No number in %s\n",cigar,p);
+	exit(9);
+      }
+
+      while (*p != '\0' && isdigit(*p)) {
+	p++;
+      }
+      if (*p == '\0') {
+	fprintf(stderr,"Unable to parse cigar %s.  No letter after number %u\n",cigar,npos);
+	exit(9);
+      } else {
+	type = *p++;
+      }
+
+      if (type == 'S' || type == 'M' || type == 'I') {
+	readlength += (int) npos;
+
+      } else if (type == 'H') {
+	if (firstp == true) {
+	  *hardclip_low = npos;
+	} else {
+	  *hardclip_high = npos;
+	}
+	readlength += (int) npos;
+
+      } else if (type == 'D' || type == 'N') {
+	/* Ignore */
+      } else if (type == 'P') {
+	/* Ignore */
+      } else {
+	fprintf(stderr,"samread.c cannot parse type %c\n",type);
+	exit(9);
+      }
+
+      firstp = false;
+    }
+  }
+
+  return readlength;
+}
+
+
+static int
+cigar_string_initial_softclip (char *cigar) {
+  unsigned int npos;
+  char *p, type;
+
+  if (cigar[0] == '*') {
+    return 0;
+  } else {
+    p = cigar;
+    while (*p != '\0') {
+      if (sscanf(p,"%u",&npos) != 1) {
+	fprintf(stderr,"Unable to parse cigar %s.  No number in %s\n",cigar,p);
+	exit(9);
+      }
+
+      while (*p != '\0' && isdigit(*p)) {
+	p++;
+      }
+      if (*p == '\0') {
+	fprintf(stderr,"Unable to parse cigar %s.  No letter after number %u\n",cigar,npos);
+	exit(9);
+      } else {
+	type = *p++;
+      }
+
+      if (type != 'S') {
+	return 0;
+      } else {
+	return (int) npos;
+      }
+    }
+
+    return 0;
+  }
+}
+
+
+#if 0
+char *
+Samread_get_acc (unsigned int *flag, char *line) {
+  char *acc, *p;
+  int length;
+
+  p = line;
+  while (*p != '\0' && *p != '\t') p++;
+  length = (p - line)/sizeof(char);
+  acc = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(acc,line,length);
+  acc[length] = '\0';
+
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse flag part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+  *flag = strtoul(p,NULL,10);
+
+  return acc;
+}
+#endif
+
+
+/* Takes advantage of the fact that sam_sort knows the linelength */
+char *
+Samread_get_acc_fromfile (int *acclength, FILE *fp, int linelength) {
+  char *acc, *p, c;
+
+  p = acc = MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';			/* terminating char */
+  *acclength = (p - acc)/sizeof(char);
+
+  return acc;
+}
+
+
+/* Called just after we read in '\t', so should start at a field */
+static SAM_split_output_type
+parse_XO_fromfile (FILE *fp) {
+  char c = 1, c0, c1;
+  char abbrev0, abbrev1;
+
+  c0 = fgetc(fp);
+  c1 = fgetc(fp);
+  if (c0 == 'X' && c1 == 'O') {
+    fgetc(fp);		/* : */
+    fgetc(fp);		/* type */
+    fgetc(fp);		/* : */
+    abbrev0 = fgetc(fp);
+    abbrev1 = fgetc(fp);
+    switch (abbrev0) {
+    case 'N':
+      if (abbrev1 == 'M') {
+	return OUTPUT_NM;
+      } else {
+	debug1(fprintf(stderr,"parse_XO_from_file 1: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	return OUTPUT_NONE;
+      }
+      break;
+    case 'C':
+      switch (abbrev1) {
+      case 'U': return OUTPUT_CU;
+      case 'C': return OUTPUT_CC;
+      case 'T': return OUTPUT_CT;
+      case 'M': return OUTPUT_CM;
+      case 'X': return OUTPUT_CX;
+      default:
+	debug1(fprintf(stderr,"parse_XO_from_file 2: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+      }
+      break;
+    case 'H':
+      switch (abbrev1) {
+      case 'U': return OUTPUT_HU;
+      case 'C': return OUTPUT_HC;
+      case 'T': return OUTPUT_HT;
+      case 'M': return OUTPUT_HM;
+      case 'X': return OUTPUT_HX;
+      default:
+	debug1(fprintf(stderr,"parse_XO_from_file 3: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+      }
+      break;
+    case 'U':
+      switch (abbrev1) {
+      case 'U': return OUTPUT_UU;
+      case 'C': return OUTPUT_UC;
+      case 'T': return OUTPUT_UT;
+      case 'M': return OUTPUT_UM;
+      case 'X': return OUTPUT_UX;
+      default:
+	debug1(fprintf(stderr,"parse_XO_from_file 4: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+      }
+      break;
+    case 'P':
+      switch (abbrev1) {
+      case 'C': return OUTPUT_PC;
+      case 'I': return OUTPUT_PI;
+      case 'S': return OUTPUT_PS;
+      case 'L': return OUTPUT_PL;
+      case 'M': return OUTPUT_PM;
+      case 'X': return OUTPUT_PX;
+      default:
+	debug1(fprintf(stderr,"parse_XO_from_file 5: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+      }
+      break;
+    default:
+      debug1(fprintf(stderr,"parse_XO_from_file 6: "));
+      fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+    }
+  }
+
+  while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+  if (c == EOF || c == '\0') {
+    return OUTPUT_NONE;
+  } else {
+    c0 = fgetc(fp);
+    c1 = fgetc(fp);
+    if (c0 == 'X' && c1 == 'O') {
+      fgetc(fp);		/* : */
+      fgetc(fp);		/* type */
+      fgetc(fp);		/* : */
+      abbrev0 = fgetc(fp);
+      abbrev1 = fgetc(fp);
+      switch (abbrev0) {
+      case 'N':
+	if (abbrev1 == 'M') {
+	  return OUTPUT_NM;
+	} else {
+	  debug1(fprintf(stderr,"parse_XO_from_file 7: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	  return OUTPUT_NONE;
+	}
+	break;
+      case 'C':
+	switch (abbrev1) {
+	case 'U': return OUTPUT_CU;
+	case 'C': return OUTPUT_CC;
+	case 'T': return OUTPUT_CT;
+	case 'M': return OUTPUT_CM;
+	case 'X': return OUTPUT_CX;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_from_file 8: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+	}
+	break;
+      case 'H':
+	switch (abbrev1) {
+	case 'U': return OUTPUT_HU;
+	case 'C': return OUTPUT_HC;
+	case 'T': return OUTPUT_HT;
+	case 'M': return OUTPUT_HM;
+	case 'X': return OUTPUT_HX;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_from_file 9: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+	}
+	break;
+      case 'U':
+	switch (abbrev1) {
+	case 'U': return OUTPUT_UU;
+	case 'C': return OUTPUT_UC;
+	case 'T': return OUTPUT_UT;
+	case 'M': return OUTPUT_UM;
+	case 'X': return OUTPUT_UX;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_from_file 10: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+	}
+	break;
+      case 'P':
+	switch (abbrev1) {
+	case 'C': return OUTPUT_PC;
+	case 'I': return OUTPUT_PI;
+	case 'S': return OUTPUT_PS;
+	case 'L': return OUTPUT_PL;
+	case 'M': return OUTPUT_PM;
+	case 'X': return OUTPUT_PX;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_from_file 11: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+	}
+	break;
+      default:
+	debug1(fprintf(stderr,"parse_XO_from_file 12: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE;
+      }
+    }
+  }
+
+  return OUTPUT_NONE;
+}
+
+
+#define HITI_MAXDIGITS 10
+
+/* Called just after we read in '\t', so should start at a field */
+static SAM_split_output_type
+parse_XO_and_HI_fromfile (char **hiti, FILE *fp) {
+  SAM_split_output_type split_output = OUTPUT_NONE;
+  char *p, c = 1, c0, c1;
+  char abbrev0, abbrev1;
+
+  *hiti = MALLOC((HITI_MAXDIGITS + 1) * sizeof(char));
+
+  c0 = fgetc(fp);
+  c1 = fgetc(fp);
+  if (c0 == 'H' && c1 == 'I') {
+    fgetc(fp);		/* : */
+    fgetc(fp);		/* type */
+    fgetc(fp);		/* : */
+
+    p = *hiti;
+    while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+    *--p = '\0';			/* terminating char */
+
+  } else if (c0 == 'X' && c1 == 'O') {
+    fgetc(fp);		/* : */
+    fgetc(fp);		/* type */
+    fgetc(fp);		/* : */
+    abbrev0 = fgetc(fp);
+    abbrev1 = fgetc(fp);
+    switch (abbrev0) {
+    case 'N':
+      if (abbrev1 == 'M') {
+	split_output = OUTPUT_NM;
+      } else {
+	debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 1: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'C':
+      switch (abbrev1) {
+      case 'U': split_output = OUTPUT_CU; break;
+      case 'C': split_output = OUTPUT_CC; break;
+      case 'T': split_output = OUTPUT_CT; break;
+      case 'M': split_output = OUTPUT_CM; break;
+      case 'X': split_output = OUTPUT_CX; break;
+      default:
+	debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 2: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'H':
+      switch (abbrev1) {
+      case 'U': split_output = OUTPUT_HU; break;
+      case 'C': split_output = OUTPUT_HC; break;
+      case 'T': split_output = OUTPUT_HT; break;
+      case 'M': split_output = OUTPUT_HM; break;
+      case 'X': split_output = OUTPUT_HX; break;
+      default:
+	debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 3: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'U':
+      switch (abbrev1) {
+      case 'U': split_output = OUTPUT_UU; break;
+      case 'C': split_output = OUTPUT_UC; break;
+      case 'T': split_output = OUTPUT_UT; break;
+      case 'M': split_output = OUTPUT_UM; break;
+      case 'X': split_output = OUTPUT_UX; break;
+      default:
+	debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 4: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	split_output = OUTPUT_NONE;
+      }
+      break;
+    case 'P':
+      switch (abbrev1) {
+      case 'C': split_output = OUTPUT_PC; break;
+      case 'I': split_output = OUTPUT_PI; break;
+      case 'S': split_output = OUTPUT_PS; break;
+      case 'L': split_output = OUTPUT_PL; break;
+      case 'M': split_output = OUTPUT_PM; break;
+      case 'X': split_output = OUTPUT_PX; break;
+      default:
+	debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 5: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	split_output = OUTPUT_NONE;
+      }
+      break;
+    default:
+      debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 6: "));
+      fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+      split_output = OUTPUT_NONE;
+    }
+  }
+
+  while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+  if (c == EOF || c == '\0') {
+    return split_output;
+  } else {
+    c0 = fgetc(fp);
+    c1 = fgetc(fp);
+    if (c0 == 'H' && c1 == 'I') {
+      fgetc(fp);		/* : */
+      fgetc(fp);		/* type */
+      fgetc(fp);		/* : */
+
+      p = *hiti;
+      while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+      *--p = '\0';			/* terminating char */
+
+    } else if (c0 == 'X' && c1 == 'O') {
+      fgetc(fp);		/* : */
+      fgetc(fp);		/* type */
+      fgetc(fp);		/* : */
+      abbrev0 = fgetc(fp);
+      abbrev1 = fgetc(fp);
+      switch (abbrev0) {
+      case 'N':
+	if (abbrev1 == 'M') {
+	  split_output = OUTPUT_NM;
+	} else {
+	  debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 7: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	  split_output = OUTPUT_NONE;
+	}
+	break;
+      case 'C':
+	switch (abbrev1) {
+	case 'U': split_output = OUTPUT_CU; break;
+	case 'C': split_output = OUTPUT_CC; break;
+	case 'T': split_output = OUTPUT_CT; break;
+	case 'M': split_output = OUTPUT_CM; break;
+	case 'X': split_output = OUTPUT_CX; break;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 8: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	  split_output = OUTPUT_NONE;
+	}
+	break;
+      case 'H':
+	switch (abbrev1) {
+	case 'U': split_output = OUTPUT_HU; break;
+	case 'C': split_output = OUTPUT_HC; break;
+	case 'T': split_output = OUTPUT_HT; break;
+	case 'M': split_output = OUTPUT_HM; break;
+	case 'X': split_output = OUTPUT_HX; break;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 9: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	  split_output = OUTPUT_NONE;
+	}
+	break;
+      case 'U':
+	switch (abbrev1) {
+	case 'U': split_output = OUTPUT_UU; break;
+	case 'C': split_output = OUTPUT_UC; break;
+	case 'T': split_output = OUTPUT_UT; break;
+	case 'M': split_output = OUTPUT_UM; break;
+	case 'X': split_output = OUTPUT_UX; break;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 10: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	  split_output = OUTPUT_NONE;
+	}
+	break;
+      case 'P':
+	switch (abbrev1) {
+	case 'C': split_output = OUTPUT_PC; break;
+	case 'I': split_output = OUTPUT_PI; break;
+	case 'S': split_output = OUTPUT_PS; break;
+	case 'L': split_output = OUTPUT_PL; break;
+	case 'M': split_output = OUTPUT_PM; break;
+	case 'X': split_output = OUTPUT_PX; break;
+	default:
+	  debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 11: "));
+	  fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	  split_output = OUTPUT_NONE;
+	}
+	break;
+      default:
+	debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 12: "));
+	fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
+	split_output = OUTPUT_NONE;
+      }
+    }
+  }
+
+  return split_output;
+}
+
+
+
+/* Main parser for processing with dups */
+char *
+Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM_split_output_type *split_output,
+					 char **hiti, Univcoord_T *genomicpos, int *initial_softclip, bool *query_lowp,
+					 FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) {
+  char *acc, *p, c;
+  char *substring;
+  Chrnum_T chrnum, mate_chrnum;
+  Chrpos_T chrpos, mate_chrpos;
+  Univcoord_T mate_genomicpos;
+
+  substring = MALLOCA((linelength + 1) * sizeof(char));
+
+  /* 1. QNAME */
+  p = acc = MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';			/* terminating char */
+  *acclength = (p - acc)/sizeof(char);
+
+  /* 2. FLAG.  Skip */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&(*flag)) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  flag = %u\n",*flag));
+  }
+
+  /* 3. RNAME: chr.  Skip */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (!strcmp(substring,"*")) {
+    *genomicpos = 0;
+  } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+    fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+    fprintf(stderr,"Available chromosomes in the genome: ");
+    Univ_IIT_dump_labels(stderr,chromosome_iit);
+    exit(9);
+  } else {
+    *genomicpos = chroffsets[chrnum - 1];
+  }
+
+  /* 4. POS: chrpos */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&chrpos) != 1) {
+    fprintf(stderr,"Unable to find chrpos in %s\n",substring);
+    abort();
+  } else {
+    *genomicpos += chrpos;
+  }
+
+  /* 5. MAPQ: Mapping quality.  Skip */
+  while ((c = fgetc(fp)) != EOF && c != '\t') ;
+
+  /* 6. CIGAR.  Parse for initial_softclip */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  *initial_softclip = cigar_string_initial_softclip(substring);
+
+
+  /* 7. MRNM: Mate chr */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+  
+  if (!strcmp(substring,"*")) {
+    mate_genomicpos = 0;
+  } else if (!strcmp(substring,"=")) {
+    mate_chrnum = chrnum;
+    mate_genomicpos = chroffsets[mate_chrnum - 1];
+  } else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+    fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+    fprintf(stderr,"Available chromosomes in the genome: ");
+    Univ_IIT_dump_labels(stderr,chromosome_iit);
+    exit(9);
+  } else {
+    mate_genomicpos = chroffsets[mate_chrnum - 1];
+  }
+
+  /* 8. MPOS: Mate chrpos */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%d",&mate_chrpos) != 1) {
+    fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+    abort();
+  } else {
+    mate_genomicpos += mate_chrpos;
+  }
+
+
+  /* Determine if the query is low */
+  if (*genomicpos == 0) {
+    if (*flag & PAIRED_READ) {
+      if (mate_genomicpos != 0) {
+	/* Mate will be mapped, so this is the high end */
+	*query_lowp = false;
+      } else if (*flag & FIRST_READ_P) {
+	*query_lowp = true;
+      } else {
+	*query_lowp = false;
+      }
+    } else {
+      /* Single-end, so this is the low end */
+      *query_lowp = true;
+    }
+
+  } else if (mate_chrpos == 0) {
+    *query_lowp = true;
+
+  } else if (*genomicpos < mate_genomicpos) {
+    *query_lowp = true;
+
+  } else {
+    *query_lowp = false;
+
+  }
+    
+  FREEA(substring);
+
+  /* 9. ISIZE: Insert size.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 10. SEQ: queryseq.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 11. QUAL: quality scores.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  *split_output = parse_XO_and_HI_fromfile(&(*hiti),fp);
+
+  return acc;
+}
+
+
+
+/* ILLUMINA-A1CCE9_0004:1:1:1103:6310#0	0	20	33639850	255	55M21S	*	0	0	AAAAATTGTATACCGCAGATTCAGGCATGGATTCCGTGAAGGAACAACACCTAAANCCAAAGNTCGGAAGANCGGN	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCBCCBDBDCBCCDCDC at CCC&AAAA################	NM:i:2 */
+
+#if 0
+char *
+Samread_parse_line (char **acc, unsigned int *flag, int *mapq, char **chr, Chrpos_T *chrpos, char **cigar,
+		    char **mate_chr, Chrpos_T *mate_chrpos_low,
+		    int *readlength, char **read, char **quality_string, char *line) {
+  char *p, *q;
+  int length, i;
+
+  debug(printf("Entering Samread_parse_line with %s\n",line));
+
+  p = line;
+  while (!isspace(*p)) p++;
+  length = (p - line)/sizeof(char);
+  *acc = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(*acc,line,length);
+  (*acc)[length] = '\0';
+
+  if (*p != '\0') {		/* Skip over tab */
+    p++;
+  }
+
+  if (sscanf(p,"%u",&(*flag)) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",p);
+    abort();
+  } else {
+    debug(printf("  flag = %u\n",*flag));
+  }
+
+  while (!isspace(*p)) p++;	/* Skip over flag */
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse chr part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+  q = p;
+  while (!isspace(*q)) q++;
+  length = (q - p)/sizeof(char);
+  *chr = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(*chr,p,length);
+  (*chr)[length] = '\0';
+
+  debug(printf("  chr = %s\n",*chr));
+  if (*q != '\0') {
+    q++;
+  }
+
+
+  p = q;
+  if (sscanf(p,"%u",&(*chrpos)) != 1) {
+    fprintf(stderr,"Unable to find chrpos in %s\n",p);
+    abort();
+  } else {
+    debug(printf("  chrpos = %u\n",*chrpos));
+  }
+
+
+  while (!isspace(*p)) p++;	/* Skip over chrpos */
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse chrpos part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+
+  /* Read mapping quality */
+  if (sscanf(p,"%d",&(*mapq)) != 1) {
+    fprintf(stderr,"Unable to find mapq in %s\n",p);
+    abort();
+  } else {
+    debug(printf("  mapq = %d\n",*mapq));
+  }
+
+  /* Skip past mapping quality */
+  while (!isspace(*p)) p++;
+
+
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse cigar part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+  q = p;
+  while (!isspace(*q)) q++;
+  length = (q - p)/sizeof(char);
+  *cigar = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(*cigar,p,length);
+  (*cigar)[length] = '\0';
+
+  debug(printf("  cigar = %s\n",*cigar));
+  
+
+  /* mate chr */
+  p = q;
+  if (*p != '\0') {
+    p++;			/* Should be a tab */
+  }
+  q = p;
+  while (!isspace(*q)) q++;
+  length = (q - p)/sizeof(char);
+  *mate_chr = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(*mate_chr,p,length);
+  (*mate_chr)[length] = '\0';
+
+  debug(printf("  mate_chr = %s\n",*mate_chr));
+  if (*q == '\0') {
+    fprintf(stderr,"Can't parse mate chr part of %s\n",line);
+    abort();
+  } else {
+    q++;
+  }
+
+  /* mate chrpos low */
+  p = q;
+  if (sscanf(p,"%u",&(*mate_chrpos_low)) != 1) {
+    fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",p);
+    abort();
+  } else {
+    debug(printf("  mate_chrpos_low = %u\n",*mate_chrpos_low));
+  }
+
+  while (!isspace(*p)) p++;	/* Skip over mate_chrpos */
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse mate chrpos part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+
+
+  /* Skip over insert size */
+  while (!isspace(*p)) p++;
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse mate chrpos part of %s\n",line);
+    abort();
+  } else {
+    p++;
+  }
+
+
+  q = p;
+  while (!isspace(*q)) q++;
+  *readlength = (q - p)/sizeof(char);
+  if (*q == '\t') q++;
+  debug(printf("  readlength = %d\n",*readlength));
+
+  *read = (char *) MALLOC(((*readlength)+1)*sizeof(char));
+  strncpy(*read,p,*readlength);
+  (*read)[*readlength] = '\0';
+
+  debug(printf("  read = %s\n",*read));
+
+  p = q;
+  while (!isspace(*q)) q++;
+  length = (q - p)/sizeof(char);
+  *quality_string = (char *) MALLOC(((*readlength)+1)*sizeof(char));
+  if (length == *readlength) {
+    strncpy(*quality_string,p,length);
+    (*quality_string)[*readlength] = '\0';
+
+  } else {
+    for (i = 0; i < *readlength; i++) {
+      (*quality_string)[i] = ' ';
+    }
+    (*quality_string)[*readlength] = '\0';
+  }
+
+  if (*q == '\t') q++;
+
+  return q;
+}
+#endif
+
+#if 0
+/* Leaves fp at start of auxinfo */
+void
+Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq, char **chr, Chrpos_T *chrpos, char **cigar,
+			     char **mate_chr, Chrpos_T *mate_chrpos_low,
+			     int *readlength, char **read, char **quality_string, int linelength) {
+  char *p, c;
+  int length, i;
+  char *substring;
+
+  debug(printf("Entering Samread_parse_line_fromfile\n"));
+
+  substring = MALLOCA((linelength + 1) * sizeof(char));
+
+  /* 1. QNAME */
+  p = *acc = (char *) MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  /* 2. FLAG */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&(*flag)) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  flag = %u\n",*flag));
+  }
+
+  /* 3. RNAME: chr */
+  p = *chr = (char *) MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  /* 4. POS: chrpos */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&(*chrpos)) != 1) {
+    fprintf(stderr,"Unable to find chrpos in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  chrpos = %u\n",*chrpos));
+  }
+
+  /* 5. MAPQ: Mapping quality */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%d",&(*mapq)) != 1) {
+    fprintf(stderr,"Unable to find mapq in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  mapq = %d\n",*mapq));
+  }
+
+  /* 5. CIGAR */
+  p = *cigar = (char *) MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  /* 7. MRNM: Mate chr */
+  p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  /* 8. MPOS: Mate chrpos */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%d",&(*mate_chrpos_low)) != 1) {
+    fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  mate_chrpos_low = %u\n",*mate_chrpos_low));
+  }
+
+  /* 9. ISIZE: Insert size.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 10. SEQ: queryseq */
+  p = *read = (char *) MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+  *readlength = (p - *read)/sizeof(char);
+
+  /* 11. QUAL: quality scores */
+  p = *quality_string = (char *) MALLOC((*readlength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+  length = (p - *quality_string)/sizeof(char);
+
+  if (length != *readlength) {
+    for (i = 0; i < *readlength; i++) {
+      (*quality_string)[i] = ' ';
+    }
+    *quality_string[i] = '\0';
+  }
+
+  FREEA(substring);
+
+  return;
+}
+#endif
+
+
+int
+Samread_parse_linelen_fromfile (FILE *fp) {
+  int linelen = 0;
+  
+  /* 1. QNAME: Skip */
+  while (!feof(fp) && fgetc(fp) != '\t') {
+    linelen++;
+  }
+  linelen++;
+
+  if (feof(fp)) {
+    return -1;
+  }
+
+  while (!feof(fp) && fgetc(fp) != '\n') {
+    linelen++;
+  }
+  linelen++;
+
+  return linelen;
+}
+
+
+/* Main parser for processing without dups */
+Univcoord_T
+Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_output_type *split_output,
+				   Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) {
+  Univcoord_T genomicpos;
+  Chrnum_T chrnum;
+  Chrpos_T chrpos;
+  char *substring, *p, c;
+  
+  substring = MALLOCA((linelength + 1) * sizeof(char));
+
+  /* 1. QNAME: Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 2. FLAG */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&(*flag)) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  flag = %u\n",*flag));
+  }
+
+  /* 3. RNAME: chr */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (!strcmp(substring,"*")) {
+    genomicpos = 0;
+  } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+    fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+    fprintf(stderr,"Available chromosomes in the genome: ");
+    Univ_IIT_dump_labels(stderr,chromosome_iit);
+    exit(9);
+  } else {
+    genomicpos = chroffsets[chrnum - 1];
+  }
+
+  /* 4. POS: chrpos */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&chrpos) != 1) {
+    fprintf(stderr,"Unable to find chrpos in %s\n",substring);
+    abort();
+  } else {
+    genomicpos += chrpos;
+  }
+  
+  FREEA(substring);
+
+  /* 5. MAPQ: Mapping quality.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 6. CIGAR.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 7. MRNM: Mate chr.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 8. MPOS: Mate chrpos.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 9. ISIZE: Insert size.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 10. SEQ: queryseq.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 11. QUAL: quality scores.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  *split_output = parse_XO_fromfile(fp);
+
+  return genomicpos;
+}
+
+
+char *
+Samread_parse_aux_fromfile (FILE *fp, char *auxfield, int linelength) {
+  char *value, *p, c = 1, c0, c1;
+
+  while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+  if (c == '\0') {
+    return (char *) NULL;
+  } else {
+    c0 = fgetc(fp);
+    c1 = fgetc(fp);
+    if (c0 == auxfield[0] && c1 == auxfield[1]) {
+      fgetc(fp);		/* : */
+      fgetc(fp);		/* type */
+      fgetc(fp);		/* : */
+      p = value = MALLOC((linelength+1) * sizeof(char));
+      while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ;
+      *--p = '\0';			/* terminating char */
+      return value;
+    }
+  }
+
+  return (char *) NULL;
+}
+
+
+
+#if 0
+/* Returns only fields needed by sam_sort */
+void
+Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **mate_chr, Chrpos_T *mate_chrpos,
+					  int *readlength, char **read, int linelength) {
+  char *p, *q, c;
+  char *substring, *clipped;
+  int hardclip_low, hardclip_high, subseq_length;
+
+  substring = MALLOCA((linelength + 1) * sizeof(char));
+
+  /* 1. QNAME.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 2. FLAG */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&(*flag)) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  flag = %u\n",*flag));
+  }
+
+  /* 3. RNAME: chr.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 4. POS: chrpos.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 5. MAPQ: Mapping quality.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 6. CIGAR.  Parse for cigar_readlength. */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  /* For a nomapper, this readlength is incorrect */
+  *readlength = cigar_string_readlength(&hardclip_low,&hardclip_high,substring);
+
+
+  /* 7. MRNM: Mate chr */
+  p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char));
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  /* 8. MPOS: Mate chrpos */
+w  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%d",&(*mate_chrpos)) != 1) {
+    fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  mate_chrpos_low = %u\n",*mate_chrpos));
+  }
+
+  /* 9. ISIZE: Insert size.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 10. SEQ: queryseq */
+  *read = (char *) MALLOC((linelength + 1) * sizeof(char));
+
+  p = &((*read)[hardclip_low]);
+  subseq_length = 0;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
+    subseq_length++;
+  }
+
+  if (*readlength == 0) {
+    *readlength = subseq_length;
+  }
+
+  if (subseq_length + hardclip_low + hardclip_high != *readlength) {
+    fprintf(stderr,"Cigar readlength %d is not consistent with subsequence length %d + hardclip_low %d + hardclip_high %d\n",
+	    *readlength,subseq_length,hardclip_low,hardclip_high);
+  }
+
+  if (hardclip_low > 0 || hardclip_high > 0) {
+    if ((clipped = Samread_parse_aux_fromfile(fp,/*auxfield*/"XH",linelength)) == NULL) {
+      fprintf(stderr,"Hard-clipped read needs XH field from a recent GSNAP version\n");
+      exit(9);
+
+    } else {
+      if (hardclip_low > 0) {
+	p = &((*read)[0]);
+	q = clipped;
+	while ((c = *q++) != '\0') {
+	  *p++ = c;
+	}
+      } else {
+	p = &((*read)[(*readlength) - hardclip_high]);
+	q = clipped;
+	while ((c = *q++) != '\0') {
+	  *p++ = c;
+	}
+      }
+      FREE(clipped);
+
+    }
+  }
+  (*read)[*readlength] = '\0';
+
+  FREEA(substring);
+
+  return;
+}
+#endif
+
+
+void
+Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char **read, int linelength) {
+  char *p, *q, c;
+  char *substring, *clipped;
+  int hardclip_low, hardclip_high, subseq_length;
+
+  substring = MALLOCA((linelength + 1) * sizeof(char));
+
+  /* 1. QNAME.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 2. FLAG.  Skip */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%u",&(*flag)) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",substring);
+    abort();
+  } else {
+    debug(printf("  flag = %u\n",*flag));
+  }
+
+  /* 3. RNAME: chr.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 4. POS: chrpos.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 5. MAPQ: Mapping quality.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 6. CIGAR.  Parse for cigar_readlength. */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  /* For a nomapper, this readlength is incorrect */
+  *readlength = cigar_string_readlength(&hardclip_low,&hardclip_high,substring);
+
+
+  /* 7. MRNM: Mate chr.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 8. MPOS: Mate chrpos.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 9. ISIZE: Insert size.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 10. SEQ: queryseq */
+  *read = (char *) MALLOC((linelength + 1) * sizeof(char));
+
+  p = &((*read)[hardclip_low]);
+  subseq_length = 0;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
+    subseq_length++;
+  }
+
+  if (*readlength == 0) {
+    *readlength = subseq_length;
+  }
+
+  if (subseq_length + hardclip_low + hardclip_high != *readlength) {
+    fprintf(stderr,"Cigar readlength %d is not consistent with subsequence length %d + hardclip_low %d + hardclip_high %d\n",
+	    *readlength,subseq_length,hardclip_low,hardclip_high);
+  }
+
+  if (hardclip_low > 0 || hardclip_high > 0) {
+    if ((clipped = Samread_parse_aux_fromfile(fp,/*auxfield*/"XH",linelength)) == NULL) {
+      fprintf(stderr,"Hard-clipped read needs XH field from a recent GSNAP version\n");
+      exit(9);
+
+    } else {
+      if (hardclip_low > 0) {
+	p = &((*read)[0]);
+	q = clipped;
+	while ((c = *q++) != '\0') {
+	  *p++ = c;
+	}
+      } else {
+	p = &((*read)[(*readlength) - hardclip_high]);
+	q = clipped;
+	while ((c = *q++) != '\0') {
+	  *p++ = c;
+	}
+      }
+      FREE(clipped);
+
+    }
+  }
+
+  (*read)[*readlength] = '\0';
+
+  FREEA(substring);
+
+  return;
+}
+
+
+Univcoord_T
+Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) {
+  Univcoord_T mate_genomicpos;
+  Chrpos_T mate_chrpos;
+  Chrnum_T chrnum, mate_chrnum;
+  char *p, c;
+  char *substring;
+
+  substring = MALLOCA((linelength + 1) * sizeof(char));
+
+  /* 1. QNAME.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 2. FLAG.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 3. RNAME: chr */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (!strcmp(substring,"*")) {
+    chrnum = -1;
+  } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+    fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+  }
+
+  /* 4. POS: chrpos.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 5. MAPQ: Mapping quality.  Skip */
+  while (fgetc(fp) != '\t') ;
+
+  /* 6. CIGAR.  Parse for cigar_readlength.  Skip. */
+  while (fgetc(fp) != '\t') ;
+
+  /* 7. MRNM: Mate chr */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+  
+  if (!strcmp(substring,"*")) {
+    mate_genomicpos = 0;
+  } else if (!strcmp(substring,"=")) {
+    mate_chrnum = chrnum;
+    mate_genomicpos = chroffsets[mate_chrnum - 1];
+  } else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) {
+    fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring);
+    fprintf(stderr,"Available chromosomes in the genome: ");
+    Univ_IIT_dump_labels(stderr,chromosome_iit);
+    exit(9);
+  } else {
+    mate_genomicpos = chroffsets[mate_chrnum - 1];
+  }
+
+  /* 8. MPOS: Mate chrpos */
+  p = substring;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ;
+  *--p = '\0';
+
+  if (sscanf(substring,"%d",&mate_chrpos) != 1) {
+    fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",substring);
+    abort();
+  } else {
+    mate_genomicpos += mate_chrpos;
+  }
+
+  FREEA(substring);
+
+  return mate_genomicpos;
+}
+
+
+#if 0
+char *
+Samread_chrinfo (Chrpos_T *chrpos, char **cigar, char *line) {
+  char *chr;
+  unsigned int flag;
+  int mapq;
+
+  char *p, *q;
+  int length;
+
+  debug(printf("Entering Samread_chrinfo with %s\n",line));
+
+  p = line;
+  while (!isspace(*p)) p++;
+  length = (p - line)/sizeof(char);
+#if 0
+  *acc = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(*acc,line,length);
+  (*acc)[length] = '\0';
+#endif
+
+  if (*p != '\0') {		/* Skip over tab */
+    p++;
+  }
+
+  if (sscanf(p,"%u",&flag) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",p);
+    abort();
+  } else {
+    debug(printf("  flag = %u\n",*flag));
+  }
+
+  while (!isspace(*p)) p++;	/* Skip over flag */
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse chr part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+  q = p;
+  while (!isspace(*q)) q++;
+  length = (q - p)/sizeof(char);
+  chr = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(chr,p,length);
+  chr[length] = '\0';
+
+  debug(printf("  chr = %s\n",chr));
+  if (*q != '\0') {
+    q++;
+  }
+
+
+  p = q;
+  if (sscanf(p,"%u",&(*chrpos)) != 1) {
+    fprintf(stderr,"Unable to find chrpos in %s\n",p);
+    abort();
+  } else {
+    debug(printf("  chrpos = %u\n",*chrpos));
+  }
+
+  while (!isspace(*p)) p++;	/* Skip over chrpos */
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse chrpos part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+
+  /* Read mapping quality */
+  if (sscanf(p,"%d",&mapq) != 1) {
+    fprintf(stderr,"Unable to find mapq in %s\n",p);
+    abort();
+  } else {
+    debug(printf("  mapq = %d\n",mapq));
+  }
+
+  /* Skip past mapping quality */
+  while (!isspace(*p)) p++;
+
+
+  if (*p == '\0') {
+    fprintf(stderr,"Can't parse cigar part of %s\n",line);
+    abort();
+  } else {
+    p++;			/* Skip over tab */
+  }
+  q = p;
+  while (!isspace(*q)) q++;
+  length = (q - p)/sizeof(char);
+  *cigar = (char *) MALLOC((length+1)*sizeof(char));
+  strncpy(*cigar,p,length);
+  (*cigar)[length] = '\0';
+
+  debug(printf("  cigar = %s\n",*cigar));
+
+  return chr;
+}
+#endif
+
+
+
+#define CHUNK 1024
+
+void
+Samread_print_as_duplicate_fromfile (FILE *fp, int linelength) {
+  int nread = 0;
+  char buffer[CHUNK], *p, c;
+  unsigned int flag;
+
+  /* 1. QNAME */
+  while ((c = fgetc(fp)) != EOF && c != '\t') {
+    putchar(c);
+    nread++;
+  }
+  putchar('\t');
+  nread++;
+
+  /* 2. FLAG */
+  p = buffer;
+  while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') {
+    nread++;
+  }
+  nread++;
+  *--p = '\0';
+
+  if (sscanf(buffer,"%u",&flag) != 1) {
+    fprintf(stderr,"Unable to find flag in %s\n",buffer);
+    abort();
+  } else {
+    printf("%u\t",flag | DUPLICATE_READ);
+  }
+
+  /* 3... Rest */
+  linelength -= nread;
+
+  while (linelength > CHUNK) {
+    fread(buffer,sizeof(char),CHUNK,fp);
+    fwrite(buffer,sizeof(char),CHUNK,stdout);
+    linelength -= CHUNK;
+  }
+  if (linelength > 0) {
+    fread(buffer,sizeof(char),linelength,fp);
+    fwrite(buffer,sizeof(char),linelength,stdout);
+  }
+
+  return;
+}
+
+
+#if 0
+char
+Samread_splice_strand (char *auxinfo) {
+  char *p;
+  char tag1, tag2;
+
+  debug(printf("Entering Samread_splice_strand with %s\n",auxinfo));
+
+  p = auxinfo;
+  while (*p != '\0' && *p != '\n') {
+    tag1 = p[0];
+    tag2 = p[1];
+
+    if (tag1 == 'X' && tag2 == 'S') {
+      debug(printf("Found tag XS\n"));
+      /* XS:A: */
+      p += 5;
+
+      if (*p == '+') {
+	return '+';
+      } else if (*p == '-') {
+	return '-';
+      } else if (*p == '?') {
+	return '?';
+      } else {
+	fprintf(stderr,"Cannot parse strand %c after XS tag\n",*p);
+	return ' ';
+      }
+    } else {
+      while (*p != '\0' && *p != '\t') {
+	p++;
+      }
+      if (*p == '\t') {
+	p++;
+      }
+    }
+  }
+
+  return ' ';
+}
+#endif
+
+
+#if 0
+Intlist_T
+Samread_parse_cigar (Uintlist_T *npositions, int *readlength, char *cigar) {
+  Intlist_T types = NULL;
+  unsigned int npos;
+  char *p, type;
+
+  *npositions = (Uintlist_T) NULL;
+  *readlength = 0;
+
+  if (cigar[0] == '*') {
+    return (Intlist_T) NULL;
+  }
+
+  p = cigar;
+  while (*p != '\0') {
+    if (sscanf(p,"%u",&npos) != 1) {
+      fprintf(stderr,"Unable to parse cigar %s.  No number in %s\n",cigar,p);
+      abort();
+    } else {
+      *npositions = Uintlist_push(*npositions,npos);
+    }
+
+    while (*p != '\0' && isdigit(*p)) {
+      p++;
+    }
+    if (*p == '\0') {
+      fprintf(stderr,"Unable to parse cigar %s.  No letter after number %u\n",cigar,npos);
+      exit(9);
+    } else {
+      type = *p++;
+      types = Intlist_push(types,(int) type);
+    }
+
+    if (type == 'S' || type == 'M' || type == 'I') {
+      *readlength += npos;
+    } else if (type == 'H') {
+      *readlength += npos;
+    } else if (type == 'D' || type == 'N') {
+      /* Ignore */
+    } else {
+      fprintf(stderr,"Unable to parse cigar %s.  Do not recognize letter %c\n",cigar,type);
+      exit(9);
+    }
+  }
+
+  *npositions = Uintlist_reverse(*npositions);
+  return Intlist_reverse(types);
+}
+#endif
+
+
+#if 0
+void
+Samread_print_cigar (Intlist_T types, Uintlist_T npositions) {
+  Intlist_T p;
+  Uintlist_T q;
+
+  for (p = types, q = npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
+    printf("%u%c",Uintlist_head(q),Intlist_head(p));
+  }
+  return;
+}
+#endif
+
+
+#if 0
+Chrpos_T
+Samread_chrpos_high (Intlist_T types, Uintlist_T npositions, Chrpos_T chrpos_low) {
+  Intlist_T p;
+  Uintlist_T q;
+  Chrpos_T chrpos_high;
+  int type;
+
+  chrpos_high = chrpos_low;
+  for (p = types, q = npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
+    if ((type = Intlist_head(p)) == 'S') {
+      /* Ignore */
+
+    } else if (type == 'H') {
+      /* Ignore */
+
+    } else if (type == 'M') {
+      chrpos_high += Uintlist_head(q);
+
+    } else if (type == 'N') {
+      chrpos_high += Uintlist_head(q);
+
+    } else if (type == 'I') {
+      /* Do nothing */
+
+    } else if (type == 'D') {
+      /* CHECK */
+      chrpos_high += Uintlist_head(q);
+
+    } else {
+      fprintf(stderr,"Cannot parse type %c\n",type);
+      exit(9);
+    }
+    debug(printf("  type = %c, chrpos = %u\n",type,chrpos_high));
+  }
+
+  return chrpos_high - 1U;
+}
+#endif
+
+
+#if 0
+int
+Samread_get_query_coordinates (int *query5, int *query3, Intlist_T types, Uintlist_T npositions,
+			       int readlength, char *cigar) {
+  int validlength;
+  Intlist_T p;
+  Uintlist_T q;
+  int type;
+
+  *query5 = 1;			/* 1-based */
+  *query3 = readlength;
+  validlength = 0;
+
+  p = types;
+  q = npositions;
+  while (p != NULL) {
+    if ((type = Intlist_head(p)) == 'S') {
+      if (p == types) {
+	*query5 = Uintlist_head(q) + 1; /* 1-based */
+      } else if (Intlist_next(p) == NULL) {
+	*query3 = readlength - Uintlist_head(q);
+      } else {
+	fprintf(stderr,"Cannot parse cigar %s.  Type S occurs in middle\n",cigar);
+	exit(9);
+      }
+    } else if (type == 'H') {
+      /* Do nothing */
+    } else if (type == 'M') {
+      validlength += Uintlist_head(q);
+    } else if (type == 'N') {
+      /* Do nothing */
+    } else if (type == 'I') {
+      validlength += Uintlist_head(q);
+    } else if (type == 'D') {
+      /* Do nothing */
+    }
+    p = Intlist_next(p);
+    q = Uintlist_next(q);
+  }
+
+  debug(printf("Got query %d to %d, with length %d\n",*query5,*query3,validlength));
+  if (validlength != (*query3) - (*query5) + 1) {
+    fprintf(stderr,"Validlength %d from cigar != %d - %d + 1\n",validlength,*query3,*query5);
+    abort();
+  }
+
+  return validlength;
+}
+#endif
+
+
+
+#if 0
+int
+get_substrings (int *querylength, int **query_starts, Chrpos_T **genomic_starts, Chrpos_T **genomic_ends,
+		char *cigar, Chrpos_T chrpos_low) {
+  int nsubstrings = 0;
+  unsigned int npos;
+  char *p, type;
+
+  int querypos = 0;
+  Chrpos_T genomicpos = chrpos_low;
+  Intlist_T query_starts_list = NULL;
+  Uintlist_T genomic_starts_list = NULL, genomic_ends_list = NULL;
+
+  if (cigar[0] == '*') {
+    *querylength = 0;
+    *query_starts = (int *) NULL;
+    *genomic_starts = (Chrpos_T *) NULL;
+    *genomic_ends = (Chrpos_T *) NULL;
+    return 0;
+  }
+
+  query_starts_list = Intlist_push(NULL,querypos);
+  genomic_starts_list = Uintlist_push(NULL,genomicpos);
+
+  p = cigar;
+  while (*p != '\0') {
+    if (sscanf(p,"%u",&npos) != 1) {
+      fprintf(stderr,"Unable to parse cigar %s in get_substrings.  No number in %s\n",cigar,p);
+      abort();
+    }
+
+    while (*p != '\0' && isdigit(*p)) {
+      p++;
+    }
+    if (*p == '\0') {
+      fprintf(stderr,"Unable to parse cigar %s.  No letter after number %u\n",cigar,npos);
+      exit(9);
+    } else {
+      type = *p++;
+    }
+
+    if (type == 'S') {
+      querypos += npos;
+
+    } else if (type == 'M') {
+      querypos += npos;
+      genomicpos += npos;
+
+    } else if (type == 'I') {
+      querypos += npos;
+
+    } else if (type == 'H') {
+      /* ? querypos += npos; */
+
+    } else if (type == 'D') {
+      genomicpos += npos;
+
+    } else if (type == 'N') {
+      genomic_ends_list = Uintlist_push(genomic_ends_list,genomicpos);
+      /* nsubstrings++; */
+
+      genomicpos += npos;
+
+      query_starts_list = Intlist_push(query_starts_list,querypos);
+      genomic_starts_list = Uintlist_push(genomic_starts_list,genomicpos);
+
+    } else {
+      fprintf(stderr,"Unable to parse cigar %s.  Do not recognize letter %c\n",cigar,type);
+      exit(9);
+    }
+  }
+
+  *querylength = querypos;
+  genomic_ends_list = Uintlist_push(genomic_ends_list,genomicpos);
+  /* nsubstrings++; */
+
+
+  /* Convert lists to arrays */
+  query_starts_list = Intlist_reverse(query_starts_list);
+  *query_starts = Intlist_to_array(&nsubstrings,query_starts_list);
+  Intlist_free(&query_starts_list);
+
+  genomic_starts_list = Uintlist_reverse(genomic_starts_list);
+  *genomic_starts = Uintlist_to_array(&nsubstrings,genomic_starts_list);
+  Uintlist_free(&genomic_starts_list);
+
+  genomic_ends_list = Uintlist_reverse(genomic_ends_list);
+  *genomic_ends = Uintlist_to_array(&nsubstrings,genomic_ends_list);
+  Uintlist_free(&genomic_ends_list);
+
+  return nsubstrings;
+}
+#endif
+
+
+
+#if 0
+int
+Samread_compute_insert_length (int *querylength5, int *querylength3,
+			       char *cigar5, Chrpos_T chrpos_low_5, char *cigar3, Chrpos_T chrpos_low_3) {
+  int insert_length;
+  int nsubstrings5, nsubstrings3, i, j;
+  int *query_starts_5, *query_starts_3;
+  Chrpos_T *genomic_starts_5, *genomic_ends_5, *genomic_starts_3, *genomic_ends_3;
+  Chrpos_T pos5, pos3;
+
+  if (cigar5[0] == '*' || cigar3[0] == '*') {
+    return 0;
+  }
+
+  nsubstrings5 = get_substrings(&(*querylength5),&query_starts_5,&genomic_starts_5,&genomic_ends_5,cigar5,chrpos_low_5);
+  nsubstrings3 = get_substrings(&(*querylength3),&query_starts_3,&genomic_starts_3,&genomic_ends_3,cigar3,chrpos_low_3);
+
+  for (i = 0; i < nsubstrings5; i++) {
+    for (j = 0; j < nsubstrings3; j++) {
+      if (genomic_ends_5[i] < genomic_starts_3[j]) {
+	/* No overlap */
+      } else if (genomic_starts_5[i] > genomic_ends_3[j]) {
+	/* No overlap */
+      } else {
+	pos5 = genomic_starts_5[i] - query_starts_5[i];
+	pos3 = genomic_starts_3[j] - query_starts_3[j];
+
+	FREE(query_starts_5);
+	FREE(genomic_starts_5);
+	FREE(genomic_ends_5);
+	FREE(query_starts_3);
+	FREE(genomic_starts_3);
+	FREE(genomic_ends_3);
+
+	if (pos5 > pos3) {
+	  return (int) (pos5 - pos3);
+	} else {
+	  return (int) (pos3 - pos5);
+	}
+      }
+    }
+  }
+
+  if (genomic_ends_5[nsubstrings5-1] < genomic_starts_3[0]) {
+    insert_length = genomic_starts_3[0] - genomic_ends_5[nsubstrings5-1] + (*querylength5) + (*querylength3);
+  } else if (genomic_ends_3[nsubstrings3-1] < genomic_starts_5[0]) {
+    insert_length = genomic_starts_5[0] - genomic_ends_3[nsubstrings3-1] + (*querylength5) + (*querylength3);
+  } else {
+    insert_length = 0;
+  }
+
+  FREE(query_starts_5);
+  FREE(genomic_starts_5);
+  FREE(genomic_ends_5);
+
+  FREE(query_starts_3);
+  FREE(genomic_starts_3);
+  FREE(genomic_ends_3);
+
+  return insert_length;
+}
+#endif
+
+


=====================================
src/samread.h
=====================================
@@ -0,0 +1,40 @@
+/* $Id: samread.h 154089 2014-11-25 21:03:16Z twu $ */
+#ifndef SAMREAD_INCLUDED
+#define SAMREAD_INCLUDED
+#include <stdio.h>
+#include "samflags.h"
+#include "genomicpos.h"
+#include "intlist.h"
+#include "uintlist.h"
+#include "iit-read-univ.h"
+
+
+extern char *
+Samread_get_acc_fromfile (int *acclength, FILE *fp, int linelength);
+
+extern char *
+Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM_split_output_type *split_output,
+					 char **hiti, Univcoord_T *genomicpos, int *initial_softclip, bool *query_lowp,
+					 FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength);
+
+extern int
+Samread_parse_linelen_fromfile (FILE *fp);
+
+extern Univcoord_T
+Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_output_type *split_output,
+				   Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength);
+
+extern void
+Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char **read, int linelength);
+
+extern Univcoord_T
+Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength);
+
+extern char *
+Samread_parse_aux_fromfile (FILE *fp, char *auxfield, int linelength);
+
+extern void
+Samread_print_as_duplicate_fromfile (FILE *fp, int linelength);
+
+#endif
+


=====================================
src/stage3hr.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3hr.c 222929 2020-06-29 16:09:28Z twu $";
+static char rcsid[] = "$Id: stage3hr.c 223081 2020-09-13 14:21:03Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -5010,7 +5010,7 @@ Stage3end_new_substitution (int *found_score_overall, int *found_score_within_tr
 						mismatch_positions_alloc,query_compress,chroffset,sensedir);
     splice_queryend_p = Substring_qend_trim(&qend,&splicetype_queryend,&ambig_prob_queryend,
 					    univdiagonal,pos5,querylength,plusp,genestrand,
-					    mismatch_positions_alloc,query_compress,chroffset,sensedir);
+					    mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
 
     debug0(printf("Trimming querystart yields splicep %d, qstart %d, prob %f\n",splice_querystart_p,qstart,ambig_prob_querystart));
     debug0(printf("Trimming queryend yields splicep %d, qend %d, prob %f\n",splice_queryend_p,qend,ambig_prob_queryend));
@@ -5046,7 +5046,7 @@ Stage3end_new_substitution (int *found_score_overall, int *found_score_within_tr
     /* trim_querystart and trim_queryend Genome_count_mismatches_substring are flipped, but not for Substring_new */
     splice_querystart_p = Substring_qend_trim(&qend,&splicetype_querystart,&ambig_prob_querystart,
 					      univdiagonal,pos5,querylength,plusp,genestrand,
-					      mismatch_positions_alloc,query_compress,chroffset,sensedir);
+					      mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
     splice_queryend_p = Substring_qstart_trim(&qstart,&splicetype_queryend,&ambig_prob_queryend,
 					      univdiagonal,pos3,querylength,plusp,genestrand,
 					      mismatch_positions_alloc,query_compress,chroffset,sensedir);


=====================================
src/substring.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: substring.c 222930 2020-06-29 16:10:11Z twu $";
+static char rcsid[] = "$Id: substring.c 223080 2020-09-13 14:20:25Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -1157,12 +1157,13 @@ embellish_genomic_sam (char *result, char *genomic_diff, char *query, int querys
 /* Still takes left as a parameter */
 int
 Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
-				Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
-				bool plusp, int genestrand) {
+				Compress_T query_compress, Univcoord_T left, Univcoord_T chroffset,
+				int pos5, int pos3, int querylength, bool plusp, int genestrand) {
   int score = 0;
   int trimpos = pos5, pos, prevpos, i;
   int *mismatch_positions = mismatch_positions_alloc;
   int alignlength = pos3 - pos5;
+  Univcoord_T univdiagonal;
 
 
   debug8(printf("Entered Substring_trim_qstart_nosplice with pos5 %d, pos3 %d, mismatch_scores %d/%d, match_score %d\n",
@@ -1222,8 +1223,9 @@ Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
     /* score = 0; */
   }
 
-  if (novelsplicingp == false && trimpos == 0) {
-    /* For DNA-seq, take the initial mismatch at the beginning of the read */
+  if (novelsplicingp == false && trimpos == 0 &&
+      left + (Univcoord_T) querylength >= chroffset + (Univcoord_T) querylength) {
+    /* For DNA-seq, if still within chromosome, take the initial mismatch at the beginning of the read */
     debug8(printf("Backing up 1 bp to start => trimpos %d\n",0));
     return 0;
   } else {
@@ -1236,8 +1238,8 @@ Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
 /* Still takes left as a parameter */
 int
 Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
-			      Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
-			      int querylength, bool plusp, int genestrand) {
+			      Compress_T query_compress, Univcoord_T left, Univcoord_T chrhigh,
+			      int pos5, int pos3, int querylength, bool plusp, int genestrand) {
   int score = 0;
   int trimpos = pos3, pos, prevpos, i;
   int *mismatch_positions = mismatch_positions_alloc;
@@ -1301,8 +1303,9 @@ Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
     /* score = 0; */
   }
 
-  if (novelsplicingp == false && trimpos == querylength - 1) {
-    /* For DNA-seq, take the final mismatch at end of read */
+  if (novelsplicingp == false && trimpos == querylength - 1 &&
+      left + (Univcoord_T) querylength <= chrhigh) {
+    /* For DNA-seq, if still within the chromosome, take the final mismatch at end of read */
     debug8(printf("Advancing 1 bp to end => trimpos %d\n",pos3));
     return querylength;
   } else {
@@ -1325,14 +1328,15 @@ Substring_trimmed_qstarts (int *result, Splicetype_T *splicetype, int **ambig_qs
   int nspliceends;
   int nmismatches;
 
+
   debug8(printf("\n***Entered Substring_trimmed_qstarts with univdiagonal %u, qend %d..%d, plusp %d, sensedir %d\n",
 		univdiagonal-chroffset,0,qend,plusp,sensedir));
 
   left = univdiagonal - (Univcoord_T) querylength;
   pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
 
-  trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,left,
-					   pos5,/*pos3*/qend,plusp,genestrand);
+  trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
+					   left,chroffset,pos5,/*pos3*/qend,querylength,plusp,genestrand);
   debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qstart*/0));
 
   if (trimpos >= qend) {
@@ -1401,7 +1405,7 @@ Substring_qstart_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_pro
   pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
 
   *trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
-					    left,pos5,pos3,plusp,genestrand);
+					    left,chroffset,pos5,pos3,querylength,plusp,genestrand);
   debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos5));
 
   if (*trimpos >= pos3) {
@@ -1470,7 +1474,7 @@ bool
 Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qends, double **ambig_probs_3,
 			 Univcoord_T univdiagonal, int qstart, int querylength,
 			 bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
-			 Univcoord_T chroffset, int sensedir) {
+			 Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
   Univcoord_T left;
   int trimpos, pos3;
   int nspliceends;
@@ -1482,8 +1486,8 @@ Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qend
   left = univdiagonal - (Univcoord_T) querylength;
   pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
 
-  trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,left,
-					 /*pos5*/qstart,pos3,querylength,plusp,genestrand);
+  trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
+					 left,chrhigh,/*pos5*/qstart,pos3,querylength,plusp,genestrand);
   debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qend*/querylength));
 
   if (trimpos <= qstart) {
@@ -1536,7 +1540,7 @@ bool
 Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qend,
 		     Univcoord_T univdiagonal, int pos5, int querylength, bool plusp, int genestrand,
 		     int *mismatch_positions_alloc, Compress_T query_compress,
-		     Univcoord_T chroffset, int sensedir) {
+		     Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
   Univcoord_T left;
   int pos3;
   int nspliceends;
@@ -1551,7 +1555,7 @@ Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_
   pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
 
   *trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
-					  left,pos5,pos3,querylength,plusp,genestrand);
+					  left,chrhigh,pos5,pos3,querylength,plusp,genestrand);
   debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos3));
 
   if (*trimpos <= pos5) {
@@ -2206,7 +2210,7 @@ Substring_extend_anchor_querystart (T anchor, T substring, int *mismatch_positio
 					      /*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
 					      anchor->querylength,/*plusp*/false,anchor->genestrand,
 					      mismatch_positions_alloc,query_compress,
-					      anchor->chroffset,anchor->sensedir);
+					      anchor->chroffset,anchor->chrhigh,anchor->sensedir);
     /* printf("Substring_extend_anchor_querystart, minus, yields splicep %d, qend %d, prob %f\n",
        splice_querystart_p,qend,ambig_prob_querystart); */
 
@@ -2270,7 +2274,7 @@ Substring_extend_anchor_queryend (T anchor, T substring, int *mismatch_positions
 					    /*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
 					    anchor->querylength,/*plusp*/true,anchor->genestrand,
 					    mismatch_positions_alloc,query_compress,
-					    anchor->chroffset,anchor->sensedir);
+					    anchor->chroffset,anchor->chrhigh,anchor->sensedir);
     /* printf("Substring_extend_anchor_queryend, plus, yields splicep %d, qend %d, prob %f\n",
        splice_queryend_p,qend,ambig_prob_queryend); */
 


=====================================
src/substring.h
=====================================
@@ -1,4 +1,4 @@
-/* $Id: substring.h 222930 2020-06-29 16:10:11Z twu $ */
+/* $Id: substring.h 223080 2020-09-13 14:20:25Z twu $ */
 #ifndef SUBSTRING_INCLUDED
 #define SUBSTRING_INCLUDED
 
@@ -60,12 +60,12 @@ Substring_overlap_segment_trimmed (T substring1, T substring2);
 
 extern int
 Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
-				Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
-				bool plusp,int genestrand);
+				Compress_T query_compress, Univcoord_T left, Univcoord_T chroffset,
+				int pos5, int pos3, int querylength, bool plusp, int genestrand);
 extern int
 Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
-			      Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
-			      int querylength, bool plusp, int genestrand);
+			      Compress_T query_compress, Univcoord_T left, Univcoord_T chrhigh,
+			      int pos5, int pos3, int querylength, bool plusp, int genestrand);
 
 extern bool
 Substring_trimmed_qstarts (int *result, Splicetype_T *splicetype, int **ambig_qstarts, double **ambig_probs_5,
@@ -82,13 +82,13 @@ extern bool
 Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qends, double **ambig_probs_3,
 			 Univcoord_T univdiagonal, int qstart, int querylength,
 			 bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
-			 Univcoord_T chroffset, int sensedir);
+			 Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir);
 
 extern bool
 Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qend,
 		     Univcoord_T univdiagonal, int pos5, int querylength, bool plusp, int genestrand,
 		     int *mismatch_positions_alloc, Compress_T query_compress,
-		     Univcoord_T chroffset, int sensedir);
+		     Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir);
 
 extern int
 Substring_compute_nmatches (int *ref_nmatches_plus_spliced_trims, Univcoord_T left,


=====================================
src/terminal.c
=====================================
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: terminal.c 222933 2020-06-29 16:12:14Z twu $";
+static char rcsid[] = "$Id: terminal.c 223081 2020-09-13 14:21:03Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -139,9 +139,11 @@ find_terminals (List_T *sense_terminals, List_T *antisense_terminals, List_T elt
 
 	/* TODO: Consider trimming separately for SENSE_FORWARD and SENSE_ANTI */
 	pos5_trimmed = Substring_trim_qstart_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
-						      query_compress,segment_left,pos5,pos3,plusp,genestrand);
+						      query_compress,segment_left,chroffset,
+						      pos5,pos3,querylength,plusp,genestrand);
 	pos3_trimmed = Substring_trim_qend_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
-						    query_compress,segment_left,pos5,pos3,querylength,plusp,genestrand);
+						    query_compress,segment_left,chrhigh,
+						    pos5,pos3,querylength,plusp,genestrand);
 	if (plusp == true) {
 	  querystart = pos5_trimmed;
 	  queryend = pos3_trimmed;
@@ -206,9 +208,11 @@ find_terminals (List_T *sense_terminals, List_T *antisense_terminals, List_T elt
 
 	/* TODO: Consider trimming separately for SENSE_FORWARD and SENSE_ANTI */
 	pos5_trimmed = Substring_trim_qstart_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
-						      query_compress,segment_left,pos5,pos3,plusp,genestrand);
+						      query_compress,segment_left,chroffset,
+						      pos5,pos3,querylength,plusp,genestrand);
 	pos3_trimmed = Substring_trim_qend_nosplice(&nmismatches_ignore,mismatch_positions_alloc,
-						    query_compress,segment_left,pos5,pos3,querylength,plusp,genestrand);
+						    query_compress,segment_left,chrhigh,
+						    pos5,pos3,querylength,plusp,genestrand);
 	if (plusp == true) {
 	  querystart = pos5_trimmed;
 	  queryend = pos3_trimmed;


=====================================
util/gmap_build.pl.in
=====================================
@@ -1,5 +1,5 @@
 #! @PERL@
-# $Id: gmap_build.pl.in 222761 2020-06-01 00:24:12Z twu $
+# $Id: gmap_build.pl.in 223082 2020-09-13 14:21:33Z twu $
 
 use warnings;	
 
@@ -531,7 +531,7 @@ sub print_usage {
 gmap_build: Builds a gmap database for a genome to be used by GMAP or GSNAP.
 Part of GMAP package, version $package_version.
 
-Usage: gmap_build [options...] -d <genome> [-c <transcriptome> -t <transcript_fasta>] <genome_fasta_files>
+Usage: gmap_build [options...] -d <genome> [-c <transcriptome> -T <transcript_fasta>] <genome_fasta_files>
 
 You are free to name <genome> and <transcriptome> as you wish.  You
 will use the same names when performing alignments subsequently using



View it on GitLab: https://salsa.debian.org/med-team/gmap/-/commit/56daea0c64f8224fa212b8030f22ac30b7d175e2

-- 
View it on GitLab: https://salsa.debian.org/med-team/gmap/-/commit/56daea0c64f8224fa212b8030f22ac30b7d175e2
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200918/2426a327/attachment-0001.html>


More information about the debian-med-commit mailing list