[med-svn] [hisat2] 01/01: New upstream version 2.1.0

Andreas Tille tille at debian.org
Mon Sep 4 13:09:22 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to annotated tag upstream/2.1.0
in repository hisat2.

commit 81f45224282adf2a684c9d27da99d059ee68a16c
Author: Andreas Tille <tille at debian.org>
Date:   Mon Sep 4 14:37:13 2017 +0200

    New upstream version 2.1.0
---
 MANUAL                                             |   17 +-
 MANUAL.markdown                                    |   54 +-
 Makefile                                           |    2 +
 VERSION                                            |    2 +-
 aligner_swsse.h                                    |    2 +-
 aligner_swsse_ee_i16.cpp                           |    2 +-
 aligner_swsse_ee_u8.cpp                            |    2 +-
 aligner_swsse_loc_i16.cpp                          |    2 +-
 aligner_swsse_loc_u8.cpp                           |    2 +-
 aln_sink.h                                         |  535 ++--
 alt.h                                              |  109 +-
 diff_sample.h                                      |    4 +-
 doc/manual.inc.html                                |   28 +-
 ds.cpp                                             |    2 +-
 fast_mutex.h                                       |  542 ++--
 gbwt_graph.h                                       |   34 +-
 gfm.h                                              |   77 +-
 gp.h                                               |   24 +-
 hgfm.h                                             |   15 +-
 hi_aligner.h                                       |  629 ++++-
 hisat2                                             |  124 +-
 hisat2.cpp                                         |  107 +-
 hisat2_build.cpp                                   |   11 +-
 hisat2_genotype.py                                 | 1099 --------
 hisat2_inspect.cpp                                 |   23 +-
 hisat2_test_BRCA_genotyping.py                     |  827 ------
 hisat2_test_HLA_genotyping.py                      | 2699 --------------------
 hisatgenotype.py                                   |  489 ++++
 ...type_genome.py => hisatgenotype_build_genome.py |  382 ++-
 hisatgenotype_extract_reads.py                     |  430 ++++
 hisatgenotype_extract_vars.py                      | 1264 +++++----
 ...tgenotype_typing.py => hisatgenotype_hla_cyp.py |  223 +-
 hisatgenotype_locus.py                             | 2335 +++++++++++++++++
 hisatgenotype_modules/__init__.py                  |    0
 .../hisatgenotype_assembly_graph.py                | 1771 +++++++++++++
 .../hisatgenotype_convert_codis.py                 |  627 +++++
 .../hisatgenotype_extract_codis_data.py            |  166 ++
 .../hisatgenotype_extract_cyp_data.py              | 1061 ++++++++
 hisatgenotype_modules/hisatgenotype_gene_typing.py |   18 +
 .../hisatgenotype_typing_common.py                 | 1549 +++++++++++
 hisatgenotype_scripts/compare_HLA_Omixon.py        |  129 +
 hisatgenotype_scripts/extract_Omixon_HLA.py        |  115 +
 .../get_haplotype_ILMN_StrandSeq.py                |   97 +
 .../hisatgenotype_HLA_genotyping_PGs.py            |  199 ++
 .../hisatgenotype_locus_samples.py                 |  275 ++
 old_hisat2_test_HLA_genotyping.py                  | 1341 ----------
 opts.h                                             |    6 +
 pe.cpp                                             |    1 +
 processor_support.h                                |    9 +-
 qual.h                                             |    1 +
 spliced_aligner.h                                  |  149 +-
 tp.h                                               |   12 +-
 util.h                                             |    2 +-
 53 files changed, 11897 insertions(+), 7728 deletions(-)

diff --git a/MANUAL b/MANUAL
index ba1e8d6..28ebfc7 100644
--- a/MANUAL
+++ b/MANUAL
@@ -504,7 +504,8 @@ than using this option.  It has no effect to provide splice sites that are alrea
     --novel-splicesite-outfile <path>
 
 In this mode, HISAT2 reports a list of splice sites in the file <path>:  
-   chromosome name `<tab>` genomic position of the flanking base on the left side of an intron `<tab>` genomic position of the flanking base on the right `<tab>` strand
+   chromosome name `<tab>` genomic position of the flanking base on the left side of an intron `<tab>` genomic position of the flanking base on the right `<tab>` strand (+, -, and .)
+   '.' indicates an unknown strand for non-canonical splice sites.
 
     --novel-splicesite-infile <path>
 
@@ -550,6 +551,10 @@ Report alignments tailored specifically for Cufflinks. In addition to what HISAT
 With this option, HISAT2 looks for novel splice sites with three signals (GT/AG, GC/AG, AT/AC), but all user-provided splice sites are used irrespective of their signals.
 HISAT2 produces an optional field, XS:A:[+-], for every spliced alignment.
 
+    --no-templatelen-adjustment
+
+Disables template length adjustment for RNA-seq reads.
+
 #### Reporting options
 
     -k <int>
@@ -569,7 +574,7 @@ aligning reads to long, repetitive genomes large `-k` can be very, very slow.
 
     --max-seeds <int>
 
-HISAT2, like other aligners, uses seed-and-extend approaches.  HISAT2 tries to extend seeds to full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that will be extended. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for `--max-seeds` may improve alignment sensitivity, but HISAT2 is not designed with large values for `--max-seeds` in mind, and when aligning reads to long, repetitive genomes large `--max-seeds` [...]
+HISAT2, like other aligners, uses seed-and-extend approaches.  HISAT2 tries to extend seeds to full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that will be extended. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for `--max-seeds` may improve alignment sensitivity, but HISAT2 is not designed with large values for `--max-seeds` in mind, and when aligning reads to long, repetitive genomes large `--max-seeds` [...]
 
     --secondary
 
@@ -706,6 +711,14 @@ in the same order as they did in the inputs.
 
 Print nothing besides alignments and serious errors.
 
+    --summary-file <path>
+
+Print the alignment summary to the file `<path>`.
+
+    --new-summary
+
+Print alignment summary in a new style, which is more machine-friendly.
+
     --met-file <path>
 
 Write `hisat2` metrics to file `<path>`.  Having alignment metric can be useful
diff --git a/MANUAL.markdown b/MANUAL.markdown
index e5f657e..a88b0f6 100644
--- a/MANUAL.markdown
+++ b/MANUAL.markdown
@@ -793,7 +793,8 @@ than using this option.  It has no effect to provide splice sites that are alrea
 </td><td>
 
 In this mode, HISAT2 reports a list of splice sites in the file <path>:  
-   chromosome name `<tab>` genomic position of the flanking base on the left side of an intron `<tab>` genomic position of the flanking base on the right `<tab>` strand
+   chromosome name `<tab>` genomic position of the flanking base on the left side of an intron `<tab>` genomic position of the flanking base on the right `<tab>` strand (+, -, and .)
+   '.' indicates an unknown strand for non-canonical splice sites.
 
 </td></tr>
 
@@ -893,6 +894,28 @@ HISAT2 produces an optional field, XS:A:[+-], for every spliced alignment.
 
 </td></tr>
 
+<tr><td id="hisat2-options-avoid-pseudogene">
+[`--avoid-pseudogene`]: #hisat2-options-avoid-pseudogene
+
+    --avoid-pseudogene
+
+</td><td>
+
+Try to avoid aligning reads to pseudogenes.  Note this option is experimental and needs further investigation.
+
+</td></tr>
+
+<tr><td id="hisat2-options-no-templatelen-adjustment">
+[`--no-templatelen-adjustment`]: #hisat2-options-no-templatelen-adjustment
+
+    --no-templatelen-adjustment
+
+</td><td>
+
+Disables template length adjustment for RNA-seq reads.
+
+</td></tr>
+
 </table>
 
 #### Reporting options
@@ -929,7 +952,7 @@ aligning reads to long, repetitive genomes large `-k` can be very, very slow.
 
 </td><td>
 
-HISAT2, like other aligners, uses seed-and-extend approaches.  HISAT2 tries to extend seeds to full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that will be extended. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for `--max-seeds` may improve alignment sensitivity, but HISAT2 is not designed with large values for `--max-seeds` in mind, and when aligning reads to long, repetitive genomes large `--max-seeds` [...]
+HISAT2, like other aligners, uses seed-and-extend approaches.  HISAT2 tries to extend seeds to full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that will be extended. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for `--max-seeds` may improve alignment sensitivity, but HISAT2 is not designed with large values for `--max-seeds` in mind, and when aligning reads to long, repetitive genomes large `--max-seeds` [...]
 
 </td></tr>
 
@@ -1174,6 +1197,31 @@ in the same order as they did in the inputs.
 Print nothing besides alignments and serious errors.
 
 </td></tr>
+
+<tr><td id="hisat2-options-summary-file">
+
+[`--summary-file`]: #hisat2-options-summary-file
+
+    --summary-file <path>
+
+</td><td>
+
+Print the alignment summary to the file `<path>`.
+
+</td></tr>
+
+<tr><td id="hisat2-options-new-summary">
+
+[`--new-summary`]: #hisat2-options-new-summary
+
+    --new-summary
+
+</td><td>
+
+Print alignment summary in a new style, which is more machine-friendly.
+
+</td></tr>
+
 <tr><td id="hisat2-options-met-file">
 
 [`--met-file`]: #hisat2-options-met-file
@@ -1562,7 +1610,7 @@ left to right, the fields are:
 4.  1-based offset into the forward reference strand where leftmost
     character of the alignment occurs
 
-5.  Mapping quality
+5.  Mapping quality.  Mapping quality of HISAT2 
 
 6.  CIGAR string representation of alignment
 
diff --git a/Makefile b/Makefile
index e08bcc3..1bd7a69 100644
--- a/Makefile
+++ b/Makefile
@@ -186,6 +186,8 @@ HISAT2_BIN_LIST_AUX = hisat2-build-s-debug \
 GENERAL_LIST = $(wildcard scripts/*.sh) \
 	$(wildcard scripts/*.pl) \
 	$(wildcard *.py) \
+	$(wildcard hisatgenotype_modules/*.py) \
+	$(wildcard hisatgenotype_scripts/*.py) \
 	doc/manual.inc.html \
 	doc/README \
 	doc/style.css \
diff --git a/VERSION b/VERSION
index b9d2bdf..7ec1d6d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.0.5
\ No newline at end of file
+2.1.0
diff --git a/aligner_swsse.h b/aligner_swsse.h
index 8e5bbd3..70d5c6d 100644
--- a/aligner_swsse.h
+++ b/aligner_swsse.h
@@ -26,7 +26,7 @@
 #include "scoring.h"
 #include "mask.h"
 #include "sse_util.h"
-#include <strings.h>
+#include <string>
 
 
 struct SSEMetrics {
diff --git a/aligner_swsse_ee_i16.cpp b/aligner_swsse_ee_i16.cpp
index 86baef0..4a28646 100644
--- a/aligner_swsse_ee_i16.cpp
+++ b/aligner_swsse_ee_i16.cpp
@@ -635,7 +635,7 @@ TAlScore SwAligner::alignGatherEE16(int& flag, bool debug) {
 			if(cperTri_) {
 				size_t rc_mod = coli & cper_.lomask_;
 				assert_lt(rc_mod, cper_.per_);
-				int64_t row = -rc_mod-1;
+				int64_t row = -(int64_t)rc_mod-1;
 				int64_t row_mod = row;
 				int64_t row_div = 0;
 				size_t idx = coli >> cper_.perpow2_;
diff --git a/aligner_swsse_ee_u8.cpp b/aligner_swsse_ee_u8.cpp
index 2b7beeb..8b1fc18 100644
--- a/aligner_swsse_ee_u8.cpp
+++ b/aligner_swsse_ee_u8.cpp
@@ -627,7 +627,7 @@ TAlScore SwAligner::alignGatherEE8(int& flag, bool debug) {
 			if(cperTri_) {
 				size_t rc_mod = coli & cper_.lomask_;
 				assert_lt(rc_mod, cper_.per_);
-				int64_t row = -rc_mod-1;
+				int64_t row = -(int64_t)rc_mod-1;
 				int64_t row_mod = row;
 				int64_t row_div = 0;
 				size_t idx = coli >> cper_.perpow2_;
diff --git a/aligner_swsse_loc_i16.cpp b/aligner_swsse_loc_i16.cpp
index b7c87b9..e4e8fac 100644
--- a/aligner_swsse_loc_i16.cpp
+++ b/aligner_swsse_loc_i16.cpp
@@ -730,7 +730,7 @@ TAlScore SwAligner::alignGatherLoc16(int& flag, bool debug) {
 			if(cperTri_) {
 				size_t rc_mod = coli & cper_.lomask_;
 				assert_lt(rc_mod, cper_.per_);
-				int64_t row = -rc_mod-1;
+				int64_t row = -(int64_t)rc_mod-1;
 				int64_t row_mod = row;
 				int64_t row_div = 0;
 				size_t idx = coli >> cper_.perpow2_;
diff --git a/aligner_swsse_loc_u8.cpp b/aligner_swsse_loc_u8.cpp
index b8f0691..673df2d 100644
--- a/aligner_swsse_loc_u8.cpp
+++ b/aligner_swsse_loc_u8.cpp
@@ -736,7 +736,7 @@ TAlScore SwAligner::alignGatherLoc8(int& flag, bool debug) {
 				// Checkpoint for triangular mini-fills
 				size_t rc_mod = coli & cper_.lomask_;
 				assert_lt(rc_mod, cper_.per_);
-				int64_t row = -rc_mod-1;
+				int64_t row = -(int64_t)rc_mod-1;
 				int64_t row_mod = row;
 				int64_t row_div = 0;
 				size_t idx = coli >> cper_.perpow2_;
diff --git a/aln_sink.h b/aln_sink.h
index 6239277..06093b6 100644
--- a/aln_sink.h
+++ b/aln_sink.h
@@ -778,10 +778,12 @@ public:
 	 * updates.
 	 */
 	void printAlSumm(
+        ostream& out,
 		const ReportingMetrics& met,
 		size_t repThresh, // threshold for uniqueness, or max if no thresh
 		bool discord,     // looked for discordant alignments
 		bool mixed,       // looked for unpaired alignments where paired failed?
+        bool newSummary,  // alignment summary in a new style
 		bool hadoopOut);  // output Hadoop counters?
 
 	/**
@@ -789,19 +791,23 @@ public:
 	 * synchronization is necessary.
 	 */
 	void finish(
-		size_t repThresh,
-		bool discord,
-		bool mixed,
-		bool hadoopOut)
+                ostream& out,
+                size_t repThresh,
+                bool discord,
+                bool mixed,
+                bool newSummary,
+                bool hadoopOut)
 	{
 		// Close output streams
 		if(!quiet_) {
 			printAlSumm(
-				met_,
-				repThresh,
-				discord,
-				mixed,
-				hadoopOut);
+                        out,
+                        met_,
+                        repThresh,
+                        discord,
+                        mixed,
+                        newSummary,
+                        hadoopOut);
 		}
 	}
 
@@ -1064,7 +1070,8 @@ public:
 		const PerReadMetrics& prm,       // per-read metrics
 		const Scoring& sc,               // scoring scheme
 		bool suppressSeedSummary = true,
-		bool suppressAlignments = false);
+		bool suppressAlignments = false,
+        bool templateLenAdjustment = true);
 	
 	/**
 	 * Called by the aligner when a new unpaired or paired alignment is
@@ -1474,184 +1481,184 @@ static inline std::ostream& printPct(
  */
 template <typename index_t>
 void AlnSink<index_t>::printAlSumm(
+                                   ostream& out,
 								   const ReportingMetrics& met,
 								   size_t repThresh,   // threshold for uniqueness, or max if no thresh
 								   bool discord,       // looked for discordant alignments
 								   bool mixed,         // looked for unpaired alignments where paired failed?
+                                   bool newSummary,    // alignment summary in a new style
 								   bool hadoopOut)     // output Hadoop counters?
 {
 	// NOTE: there's a filtering step at the very beginning, so everything
 	// being reported here is post filtering
 	
 	bool canRep = repThresh != MAX_SIZE_T;
-	if(hadoopOut) {
-		cerr << "reporter:counter:Bowtie,Reads processed," << met.nread << endl;
-	}
-	uint64_t totread = met.nread;
-	if(totread > 0) {
-		cerr << "" << met.nread << " reads; of these:" << endl;
-	} else {
-		assert_eq(0, met.npaired);
-		assert_eq(0, met.nunpaired);
-		cerr << "" << totread << " reads" << endl;
-	}
-	uint64_t totpair = met.npaired;
-	if(totpair > 0) {
-		// Paired output
-		cerr << "  " << totpair << " (";
-		printPct(cerr, totpair, totread);
-		cerr << ") were paired; of these:" << endl;
-		
-		// Concordants
-		cerr << "    " << met.nconcord_0 << " (";
-		printPct(cerr, met.nconcord_0, met.npaired);
-		cerr << ") aligned concordantly 0 times" << endl;
-		if(canRep) {
-			// Print the number that aligned concordantly exactly once
-			assert_eq(met.nconcord_uni, met.nconcord_uni1+met.nconcord_uni2);
-			cerr << "    " << met.nconcord_uni1 << " (";
-			printPct(cerr, met.nconcord_uni1, met.npaired);
-			cerr << ") aligned concordantly exactly 1 time" << endl;
-			
-			// Print the number that aligned concordantly more than once but
-			// fewer times than the limit
-			
-			cerr << "    " << met.nconcord_uni2+met.nconcord_rep << " (";
-			printPct(cerr, met.nconcord_uni2+met.nconcord_rep, met.npaired);
-			cerr << ") aligned concordantly >1 times" << endl;
-		} else {
-			// Print the number that aligned concordantly exactly once
-			assert_eq(met.nconcord_uni, met.nconcord_uni1+met.nconcord_uni2);
-			cerr << "    " << met.nconcord_uni1 << " (";
-			printPct(cerr, met.nconcord_uni1, met.npaired);
-			cerr << ") aligned concordantly exactly 1 time" << endl;
-			
-			// Print the number that aligned concordantly more than once
-			cerr << "    " << met.nconcord_uni2 << " (";
-			printPct(cerr, met.nconcord_uni2, met.npaired);
-			cerr << ") aligned concordantly >1 times" << endl;
-		}
-		if(discord) {
-			// TODO: what about discoardant and on separate chromosomes?
-			
-			// Bring out the unaligned pair total so we can subtract discordants
-			cerr << "    ----" << endl;
-			cerr << "    " << met.nconcord_0
-			<< " pairs aligned concordantly 0 times; of these:" << endl;
-			// Discordants
-			cerr << "      " << met.ndiscord << " (";
-			printPct(cerr, met.ndiscord, met.nconcord_0);
-			cerr << ") aligned discordantly 1 time" << endl;
-		}
-		uint64_t ncondiscord_0 = met.nconcord_0 - met.ndiscord;
-		if(mixed) {
-			// Bring out the unaligned pair total so we can subtract discordants
-			cerr << "    ----" << endl;
-			cerr << "    " << ncondiscord_0
-			<< " pairs aligned 0 times concordantly or discordantly; of these:" << endl;
-			cerr << "      " << (ncondiscord_0 * 2) << " mates make up the pairs; of these:" << endl;
-			cerr << "        " << met.nunp_0_0 << " " << "(";
-			printPct(cerr, met.nunp_0_0, ncondiscord_0 * 2);
-			cerr << ") aligned 0 times" << endl;
-			if(canRep) {
-				// Print the number that aligned exactly once
-				assert_eq(met.nunp_0_uni, met.nunp_0_uni1+met.nunp_0_uni2);
-				cerr << "        " << met.nunp_0_uni1 << " (";
-				printPct(cerr, met.nunp_0_uni1, ncondiscord_0 * 2);
-				cerr << ") aligned exactly 1 time" << endl;
-				
-				// Print the number that aligned more than once but fewer times
-				// than the limit
-				cerr << "        " << met.nunp_0_uni2+met.nunp_0_rep << " (";
-				printPct(cerr, met.nunp_0_uni2+met.nunp_0_rep, ncondiscord_0 * 2);
-				cerr << ") aligned >1 times" << endl;
-			} else {
-				// Print the number that aligned exactly once
-				assert_eq(met.nunp_0_uni, met.nunp_0_uni1+met.nunp_0_uni2);
-				cerr << "        " << met.nunp_0_uni1 << " (";
-				printPct(cerr, met.nunp_0_uni1, ncondiscord_0 * 2);
-				cerr << ") aligned exactly 1 time" << endl;
-				
-				// Print the number that aligned more than once but fewer times
-				// than the limit
-				cerr << "        " << met.nunp_0_uni2 << " (";
-				printPct(cerr, met.nunp_0_uni2, ncondiscord_0 * 2);
-				cerr << ") aligned >1 times" << endl;
-			}
-			
-			//if(canRep) {
-			//	// Bring out the repetitively aligned pair total so we can subtract discordants
-			//	cerr << "    ----" << endl;
-			//	cerr << "    " << met.nconcord_rep
-			//		 << " pairs aligned concordantly >" << repThresh
-			//		 << " times; of these:" << endl;
-			//	cerr << "      " << (met.nconcord_rep * 2) << " mates make up the pairs; of these:" << endl;
-			//	
-			//	cerr << "        " << met.nunp_rep_0 << " (";
-			//	printPct(cerr, met.nunp_rep_0, met.nconcord_rep * 2);
-			//	cerr << ") aligned 0 times" << endl;
-			//	
-			//	cerr << "        " << met.nunp_rep_uni << " (";
-			//	printPct(cerr, met.nunp_rep_uni, met.nconcord_rep * 2);
-			//	cerr << ") aligned >0 and <=" << repThresh << " times" << endl;
-			//	
-			//	cerr << "        " << met.nunp_rep_rep << " (";
-			//	printPct(cerr, met.nunp_rep_rep, met.nconcord_rep * 2);
-			//	cerr << ") aligned >" << repThresh << " times" << endl;
-			//}
-		}
-	}
-	uint64_t totunpair = met.nunpaired;
-	if(totunpair > 0) {
-		// Unpaired output
-		cerr << "  " << totunpair << " (";
-		printPct(cerr, totunpair, totread);
-		cerr << ") were unpaired; of these:" << endl;
-		
-		cerr << "    " << met.nunp_0 << " (";
-		printPct(cerr, met.nunp_0, met.nunpaired);
-		cerr << ") aligned 0 times" << endl;
-		if(hadoopOut) {
-			cerr << "reporter:counter:Bowtie 2,Unpaired reads with 0 alignments,"
-			<< met.nunpaired << endl;
-		}
-		
-		if(canRep) {
-			// Print the number that aligned exactly once
-			assert_eq(met.nunp_uni, met.nunp_uni1+met.nunp_uni2);
-			cerr << "    " << met.nunp_uni1 << " (";
-			printPct(cerr, met.nunp_uni1, met.nunpaired);
-			cerr << ") aligned exactly 1 time" << endl;
-			
-			// Print the number that aligned more than once but fewer times
-			// than the limit
-			cerr << "    " << met.nunp_uni2+met.nunp_rep << " (";
-			printPct(cerr, met.nunp_uni2+met.nunp_rep, met.nunpaired);
-			cerr << ") aligned >1 times" << endl;
-		} else {
-			// Print the number that aligned exactly once
-			assert_eq(met.nunp_uni, met.nunp_uni1+met.nunp_uni2);
-			cerr << "    " << met.nunp_uni1 << " (";
-			printPct(cerr, met.nunp_uni1, met.nunpaired);
-			cerr << ") aligned exactly 1 time" << endl;
-			
-			// Print the number that aligned more than once
-			cerr << "    " << met.nunp_uni2 << " (";
-			printPct(cerr, met.nunp_uni2, met.nunpaired);
-			cerr << ") aligned >1 times" << endl;
-		}
-	}
-	uint64_t tot_al_cand = totunpair + totpair*2;
-	uint64_t tot_al =
-	(met.nconcord_uni + met.nconcord_rep)*2 +
-	(met.ndiscord)*2 +
-	met.nunp_0_uni +
-	met.nunp_0_rep + 
-	met.nunp_uni +
-	met.nunp_rep;
-	assert_leq(tot_al, tot_al_cand);
-	printPct(cerr, tot_al, tot_al_cand);
-	cerr << " overall alignment rate" << endl;
+    if(hadoopOut) {
+        out << "reporter:counter:HISAT2,Reads processed," << met.nread << endl;
+    }
+    uint64_t totread = met.nread;
+    uint64_t totpair = met.npaired;
+    uint64_t totunpair = met.nunpaired;
+    uint64_t tot_al_cand = totunpair + totpair*2;
+    uint64_t tot_al = (met.nconcord_uni + met.nconcord_rep) * 2 + (met.ndiscord) * 2 + met.nunp_0_uni + met.nunp_0_rep + met.nunp_uni + met.nunp_rep;
+    assert_leq(tot_al, tot_al_cand);
+    if(newSummary) {
+        out << "HISAT2 summary stats:" << endl;
+        if(totpair > 0) {
+            uint64_t ncondiscord_0 = met.nconcord_0 - met.ndiscord;
+            out << "\tTotal pairs: " << totpair << endl;
+            out << "\t\tAligned concordantly or discordantly 0 time: " << ncondiscord_0 << " ("; printPct(out, ncondiscord_0, met.npaired); out << ")" << endl;
+            out << "\t\tAligned concordantly 1 time: " << met.nconcord_uni1 << " ("; printPct(out, met.nconcord_uni1, met.npaired); out << ")" << endl;
+            out << "\t\tAligned concordantly >1 times: " << met.nconcord_uni2 << " ("; printPct(out, met.nconcord_uni2, met.npaired); out << ")" << endl;
+            out << "\t\tAligned discordantly 1 time: " << met.ndiscord << " ("; printPct(out, met.ndiscord, met.npaired); out << ")" << endl;
+            
+            out << "\tTotal unpaired reads: " << ncondiscord_0 * 2 << endl;
+            out << "\t\tAligned 0 time: " << met.nunp_0_0 << " ("; printPct(out, met.nunp_0_0, ncondiscord_0 * 2); out << ")" << endl;
+            out << "\t\tAligned 1 time: " << met.nunp_0_uni1 << " ("; printPct(out, met.nunp_0_uni1, ncondiscord_0 * 2); out << ")" << endl;
+            out << "\t\tAligned >1 times: " << met.nunp_0_uni2 << " ("; printPct(out, met.nunp_0_uni2, ncondiscord_0 * 2); out << ")" << endl;
+        } else {
+            out << "\tTotal reads: " << totread << endl;
+            out << "\t\tAligned 0 time: " << met.nunp_0 << " ("; printPct(out, met.nunp_0, met.nunpaired); out << ")" << endl;
+            out << "\t\tAligned 1 time: " << met.nunp_uni1 << " ("; printPct(out, met.nunp_uni1, met.nunpaired); out << ")" << endl;
+            out << "\t\tAligned >1 times: " << met.nunp_uni2 << " ("; printPct(out, met.nunp_uni2, met.nunpaired); out << ")" << endl;
+        }
+        out << "\tOverall alignment rate: "; printPct(out, tot_al, tot_al_cand); out << endl;
+        
+    } else {
+        if(totread > 0) {
+            out << "" << totread << " reads; of these:" << endl;
+        } else {
+            assert_eq(0, met.npaired);
+            assert_eq(0, met.nunpaired);
+            out << "" << totread << " reads" << endl;
+        }
+        if(totpair > 0) {
+            // Paired output
+            out << "  " << totpair << " (";
+            printPct(out, totpair, totread);
+            out << ") were paired; of these:" << endl;
+            
+            // Concordants
+            out << "    " << met.nconcord_0 << " (";
+            printPct(out, met.nconcord_0, met.npaired);
+            out << ") aligned concordantly 0 times" << endl;
+            if(canRep) {
+                // Print the number that aligned concordantly exactly once
+                assert_eq(met.nconcord_uni, met.nconcord_uni1+met.nconcord_uni2);
+                out << "    " << met.nconcord_uni1 << " (";
+                printPct(out, met.nconcord_uni1, met.npaired);
+                out << ") aligned concordantly exactly 1 time" << endl;
+                
+                // Print the number that aligned concordantly more than once but
+                // fewer times than the limit
+                
+                out << "    " << met.nconcord_uni2+met.nconcord_rep << " (";
+                printPct(out, met.nconcord_uni2+met.nconcord_rep, met.npaired);
+                out << ") aligned concordantly >1 times" << endl;
+            } else {
+                // Print the number that aligned concordantly exactly once
+                assert_eq(met.nconcord_uni, met.nconcord_uni1+met.nconcord_uni2);
+                out << "    " << met.nconcord_uni1 << " (";
+                printPct(out, met.nconcord_uni1, met.npaired);
+                out << ") aligned concordantly exactly 1 time" << endl;
+                
+                // Print the number that aligned concordantly more than once
+                out << "    " << met.nconcord_uni2 << " (";
+                printPct(out, met.nconcord_uni2, met.npaired);
+                out << ") aligned concordantly >1 times" << endl;
+            }
+            if(discord) {
+                // TODO: what about discordant and on separate chromosomes?
+                
+                // Bring out the unaligned pair total so we can subtract discordants
+                out << "    ----" << endl;
+                out << "    " << met.nconcord_0
+                << " pairs aligned concordantly 0 times; of these:" << endl;
+                // Discordants
+                out << "      " << met.ndiscord << " (";
+                printPct(out, met.ndiscord, met.nconcord_0);
+                out << ") aligned discordantly 1 time" << endl;
+            }
+            uint64_t ncondiscord_0 = met.nconcord_0 - met.ndiscord;
+            if(mixed) {
+                // Bring out the unaligned pair total so we can subtract discordants
+                out << "    ----" << endl;
+                out << "    " << ncondiscord_0
+                    << " pairs aligned 0 times concordantly or discordantly; of these:" << endl;
+                out << "      " << (ncondiscord_0 * 2) << " mates make up the pairs; of these:" << endl;
+                out << "        " << met.nunp_0_0 << " " << "(";
+                printPct(out, met.nunp_0_0, ncondiscord_0 * 2);
+                out << ") aligned 0 times" << endl;
+                if(canRep) {
+                    // Print the number that aligned exactly once
+                    assert_eq(met.nunp_0_uni, met.nunp_0_uni1+met.nunp_0_uni2);
+                    out << "        " << met.nunp_0_uni1 << " (";
+                    printPct(out, met.nunp_0_uni1, ncondiscord_0 * 2);
+                    out << ") aligned exactly 1 time" << endl;
+                    
+                    // Print the number that aligned more than once but fewer times
+                    // than the limit
+                    out << "        " << met.nunp_0_uni2+met.nunp_0_rep << " (";
+                    printPct(out, met.nunp_0_uni2+met.nunp_0_rep, ncondiscord_0 * 2);
+                    out << ") aligned >1 times" << endl;
+                } else {
+                    // Print the number that aligned exactly once
+                    assert_eq(met.nunp_0_uni, met.nunp_0_uni1+met.nunp_0_uni2);
+                    out << "        " << met.nunp_0_uni1 << " (";
+                    printPct(out, met.nunp_0_uni1, ncondiscord_0 * 2);
+                    out << ") aligned exactly 1 time" << endl;
+                    
+                    // Print the number that aligned more than once but fewer times
+                    // than the limit
+                    out << "        " << met.nunp_0_uni2 << " (";
+                    printPct(out, met.nunp_0_uni2, ncondiscord_0 * 2);
+                    out << ") aligned >1 times" << endl;
+                }
+            }
+        }
+        if(totunpair > 0) {
+            // Unpaired output
+            out << "  " << totunpair << " (";
+            printPct(out, totunpair, totread);
+            out << ") were unpaired; of these:" << endl;
+            
+            out << "    " << met.nunp_0 << " (";
+            printPct(out, met.nunp_0, met.nunpaired);
+            out << ") aligned 0 times" << endl;
+            if(hadoopOut) {
+                out << "reporter:counter:HISAT 2,Unpaired reads with 0 alignments,"
+                << met.nunpaired << endl;
+            }
+            
+            if(canRep) {
+                // Print the number that aligned exactly once
+                assert_eq(met.nunp_uni, met.nunp_uni1+met.nunp_uni2);
+                out << "    " << met.nunp_uni1 << " (";
+                printPct(out, met.nunp_uni1, met.nunpaired);
+                out << ") aligned exactly 1 time" << endl;
+                
+                // Print the number that aligned more than once but fewer times
+                // than the limit
+                out << "    " << met.nunp_uni2+met.nunp_rep << " (";
+                printPct(out, met.nunp_uni2+met.nunp_rep, met.nunpaired);
+                out << ") aligned >1 times" << endl;
+            } else {
+                // Print the number that aligned exactly once
+                assert_eq(met.nunp_uni, met.nunp_uni1+met.nunp_uni2);
+                out << "    " << met.nunp_uni1 << " (";
+                printPct(out, met.nunp_uni1, met.nunpaired);
+                out << ") aligned exactly 1 time" << endl;
+                
+                // Print the number that aligned more than once
+                out << "    " << met.nunp_uni2 << " (";
+                printPct(out, met.nunp_uni2, met.nunpaired);
+                out << ") aligned >1 times" << endl;
+            }
+        }
+        
+        printPct(out, tot_al, tot_al_cand);
+        out << " overall alignment rate" << endl;
+    }
 }
 
 /**
@@ -1792,7 +1799,8 @@ void AlnSinkWrap<index_t>::finishRead(
 									  const PerReadMetrics& prm,       // per-read metrics
 									  const Scoring& sc,               // scoring scheme
 									  bool suppressSeedSummary,        // = true
-                                      bool suppressAlignments)         // = false
+                                      bool suppressAlignments,         // = false
+                                      bool templateLenAdjustment)      // = true
 {
 	obuf_.clear();
 	OutputQueueMark qqm(g_.outq(), obuf_, rdid_, threadid_);
@@ -1892,7 +1900,7 @@ void AlnSinkWrap<index_t>::finishRead(
 							rs1->fw()); // opp fw
 			// Issue: we only set the flags once, but some of the flags might
 			// vary from pair to pair among the pairs we're reporting.  For
-			// instance, whether the a given mate aligns to the forward strand.
+			// instance, whether a given mate aligns to the forward strand.
 			SeedAlSumm ssm1, ssm2;
             if(sr1 != NULL && sr2 != NULL) {
                 sr1->toSeedAlSumm(ssm1);
@@ -1900,8 +1908,13 @@ void AlnSinkWrap<index_t>::finishRead(
             }
 			for(size_t i = 0; i < rs1_.size(); i++) {
                 spliceSites_.clear();
-				rs1_[i].setMateParams(ALN_RES_TYPE_MATE1, &rs2_[i], flags1, ssdb_, threads_rids_mindist_, &spliceSites_);
-				rs2_[i].setMateParams(ALN_RES_TYPE_MATE2, &rs1_[i], flags2, ssdb_, threads_rids_mindist_, &spliceSites_);
+                if(templateLenAdjustment) {
+                    rs1_[i].setMateParams(ALN_RES_TYPE_MATE1, &rs2_[i], flags1, ssdb_, threads_rids_mindist_, &spliceSites_);
+                    rs2_[i].setMateParams(ALN_RES_TYPE_MATE2, &rs1_[i], flags2, ssdb_, threads_rids_mindist_, &spliceSites_);
+                } else {
+                    rs1_[i].setMateParams(ALN_RES_TYPE_MATE1, &rs2_[i], flags1);
+                    rs2_[i].setMateParams(ALN_RES_TYPE_MATE2, &rs1_[i], flags2);
+                }
 				assert_eq(abs(rs1_[i].fragmentLength()), abs(rs2_[i].fragmentLength()));
 			}
 			assert(!select1_.empty());
@@ -1930,7 +1943,7 @@ void AlnSinkWrap<index_t>::finishRead(
 			} else {
 				met.nconcord_uni++;
 				assert(!rs1_.empty());
-				if(rs1_.size() == 1) {
+				if(select1_.size() == 1) {
 					met.nconcord_uni1++;
 				} else {
 					met.nconcord_uni2++;
@@ -2028,84 +2041,7 @@ void AlnSinkWrap<index_t>::finishRead(
 		// the mates having more than one.
 		//assert(nunpair1 == 0 || nunpair2 == 0);
 		assert(!pairMax);
-		
-		// Update counters given that one mate didn't align
-		if(readIsPair()) {
-			met.nconcord_0++;
-		}
-		if(rd1_ != NULL) {
-			if(nunpair1 > 0) {
-				// Update counters
-				if(readIsPair()) {
-					if(unpair1Max) met.nunp_0_rep++;
-					else {
-						met.nunp_0_uni++;
-						assert(!rs1u_.empty());
-						if(rs1u_.size() == 1) {
-							met.nunp_0_uni1++;
-						} else {
-							met.nunp_0_uni2++;
-						}
-					}
-				} else {
-					if(unpair1Max) met.nunp_rep++;
-					else {
-						met.nunp_uni++;
-						assert(!rs1u_.empty());
-						if(rs1u_.size() == 1) {
-							met.nunp_uni1++;
-						} else {
-							met.nunp_uni2++;
-						}
-					}
-				}
-			} else if(unpair1Max) {
-				// Update counters
-				if(readIsPair())   met.nunp_0_rep++;
-				else               met.nunp_rep++;
-			} else {
-				// Update counters
-				if(readIsPair())   met.nunp_0_0++;
-				else               met.nunp_0++;
-			}
-		}
-		if(rd2_ != NULL) {
-			if(nunpair2 > 0) {
-				// Update counters
-				if(readIsPair()) {
-					if(unpair2Max) met.nunp_0_rep++;
-					else {
-						assert(!rs2u_.empty());
-						met.nunp_0_uni++;
-						if(rs2u_.size() == 1) {
-							met.nunp_0_uni1++;
-						} else {
-							met.nunp_0_uni2++;
-						}
-					}
-				} else {
-					if(unpair2Max) met.nunp_rep++;
-					else {
-						assert(!rs2u_.empty());
-						met.nunp_uni++;
-						if(rs2u_.size() == 1) {
-							met.nunp_uni1++;
-						} else {
-							met.nunp_uni2++;
-						}
-					}
-				}
-			} else if(unpair2Max) {
-				// Update counters
-				if(readIsPair())   met.nunp_0_rep++;
-				else               met.nunp_rep++;
-			} else {
-				// Update counters
-				if(readIsPair())   met.nunp_0_0++;
-				else               met.nunp_0++;
-			}
-		}
-		
+			
 		const AlnRes *repRs1 = NULL, *repRs2 = NULL;
 		AlnSetSumm summ1, summ2;
 		AlnFlags flags1, flags2;
@@ -2172,6 +2108,83 @@ void AlnSinkWrap<index_t>::finishRead(
 			// summary
 			assert(!unpair2Max);
 		}
+        
+        // Update counters given that one mate didn't align
+        if(readIsPair()) {
+            met.nconcord_0++;
+        }
+        if(rd1_ != NULL) {
+            if(nunpair1 > 0) {
+                // Update counters
+                if(readIsPair()) {
+                    if(unpair1Max) met.nunp_0_rep++;
+                    else {
+                        met.nunp_0_uni++;
+                        assert(!rs1u_.empty());
+                        if(select1_.size() == 1) {
+                            met.nunp_0_uni1++;
+                        } else {
+                            met.nunp_0_uni2++;
+                        }
+                    }
+                } else {
+                    if(unpair1Max) met.nunp_rep++;
+                    else {
+                        met.nunp_uni++;
+                        assert(!rs1u_.empty());
+                        if(select1_.size() == 1) {
+                            met.nunp_uni1++;
+                        } else {
+                            met.nunp_uni2++;
+                        }
+                    }
+                }
+            } else if(unpair1Max) {
+                // Update counters
+                if(readIsPair())   met.nunp_0_rep++;
+                else               met.nunp_rep++;
+            } else {
+                // Update counters
+                if(readIsPair())   met.nunp_0_0++;
+                else               met.nunp_0++;
+            }
+        }
+        if(rd2_ != NULL) {
+            if(nunpair2 > 0) {
+                // Update counters
+                if(readIsPair()) {
+                    if(unpair2Max) met.nunp_0_rep++;
+                    else {
+                        assert(!rs2u_.empty());
+                        met.nunp_0_uni++;
+                        if(select2_.size() == 1) {
+                            met.nunp_0_uni1++;
+                        } else {
+                            met.nunp_0_uni2++;
+                        }
+                    }
+                } else {
+                    if(unpair2Max) met.nunp_rep++;
+                    else {
+                        assert(!rs2u_.empty());
+                        met.nunp_uni++;
+                        if(select2_.size() == 1) {
+                            met.nunp_uni1++;
+                        } else {
+                            met.nunp_uni2++;
+                        }
+                    }
+                }
+            } else if(unpair2Max) {
+                // Update counters
+                if(readIsPair())   met.nunp_0_rep++;
+                else               met.nunp_rep++;
+            } else {
+                // Update counters
+                if(readIsPair())   met.nunp_0_0++;
+                else               met.nunp_0++;
+            }
+        }
 		
 		// Now set up flags
 		if(rep1) {
diff --git a/alt.h b/alt.h
index 3914f12..fa820a7 100644
--- a/alt.h
+++ b/alt.h
@@ -124,6 +124,31 @@ public:
         return true;
     }
     
+    bool isSame(const ALT& o) const {  // true when this ALT and o compare equal under the per-type rules below
+        if(type != o.type)
+            return false;
+        if(type == ALT_SNP_SGL) {  // ALT_SNP_SGL: pos and seq must both match
+            return pos == o.pos && seq == o.seq;
+        } else if(type == ALT_SNP_DEL || type == ALT_SNP_INS || type == ALT_SPLICESITE) {  // these types are compared by pos and len
+            if(type == ALT_SNP_INS) {  // insertions must additionally have equal seq
+                if(seq != o.seq)
+                    return false;
+            }
+            if(reversed == o.reversed) {  // same 'reversed' flag: coordinates are directly comparable
+                return pos == o.pos && len == o.len;
+            } else {  // flags differ: the reversed side's pos is shifted by (len - 1) before comparing
+                if(reversed) {  // presumably a reversed ALT stores the opposite end of the span in pos - TODO confirm
+                    return pos - len + 1 == o.pos && len == o.len;
+                } else {
+                    return pos == o.pos - o.len + 1 && len == o.len;
+                }
+            }       
+        } else {
+            assert(false);  // no other ALT type is expected to reach isSame()
+        }
+        return true;  // reached only from the final else branch when assert() is compiled out
+    }
+    
 #ifndef NDEBUG
     bool repOk() const {
         if(type == ALT_SNP_SGL) {
@@ -181,6 +206,51 @@ public:
 
 
 template <typename index_t>
+struct Haplotype {  // a [left, right] range plus a list of index_t values (alts); serializable via write()/read()
+    Haplotype() {
+        reset();
+    }
+    
+    void reset() {  // zero both bounds and empty the alts list
+        left = right = 0;
+        alts.clear();
+    }
+    
+    index_t left;
+    index_t right;
+    EList<index_t, 1> alts;  // presumably indices into the ALT list (ALTDB::alts()) - TODO confirm against callers
+    
+    bool operator< (const Haplotype& o) const {  // orders by left, then right; alts does not participate
+        if(left != o.left) return left < o.left;
+        if(right != o.right) return right < o.right;
+        return false;
+    }
+    
+    bool write(ofstream& f_out, bool bigEndian) const {  // emits left, right, alts.size(), then every alt; always returns true
+        writeIndex<index_t>(f_out, left, bigEndian);
+        writeIndex<index_t>(f_out, right, bigEndian);
+        writeIndex<index_t>(f_out, alts.size(), bigEndian);
+        for(index_t i = 0; i < alts.size(); i++) {
+            writeIndex<index_t>(f_out, alts[i], bigEndian);
+        }
+        return true;
+    }
+    
+    bool read(ifstream& f_in, bool bigEndian) {  // reads the fields in the order write() emits them; always returns true
+        left = readIndex<index_t>(f_in, bigEndian);
+        right = readIndex<index_t>(f_in, bigEndian);
+        assert_leq(left, right);
+        index_t num_alts = readIndex<index_t>(f_in, bigEndian);
+        alts.resizeExact(num_alts); alts.clear();  // presumably sizes the storage for num_alts, then empties it so push_back refills from index 0 - TODO confirm EList semantics
+        for(index_t i = 0; i < num_alts; i++) {
+            alts.push_back(readIndex<index_t>(f_in, bigEndian));
+        }
+        return true;
+    }
+};
+
+
+template <typename index_t>
 class ALTDB {
 public:
     ALTDB() :
@@ -199,41 +269,26 @@ public:
     void setSpliceSites(bool ss) { _ss = ss; }
     void setExons(bool exon) { _exon = exon; }
     
-    EList<ALT<index_t> >& alts()     { return _alts; }
-    EList<string>&        altnames() { return _altnames; }
+    EList<ALT<index_t> >&       alts()       { return _alts; }
+    EList<string>&              altnames()   { return _altnames; }
+    EList<Haplotype<index_t> >& haplotypes() { return _haplotypes; }
+    EList<index_t>&             haplotype_maxrights() { return _haplotype_maxrights; }
     
-    const EList<ALT<index_t> >& alts() const     { return _alts; }
-    const EList<string>&        altnames() const { return _altnames; }
+    const EList<ALT<index_t> >&       alts() const       { return _alts; }
+    const EList<string>&              altnames() const   { return _altnames; }
+    const EList<Haplotype<index_t> >& haplotypes() const { return _haplotypes; }
+    const EList<index_t>&             haplotype_maxrights() const { return _haplotype_maxrights; }
 
 private:
     bool _snp;
     bool _ss;
     bool _exon;
     
-    EList<ALT<index_t> > _alts;
-    EList<string>        _altnames;
+    EList<ALT<index_t> >       _alts;
+    EList<string>              _altnames;
+    EList<Haplotype<index_t> > _haplotypes;
+    EList<index_t>             _haplotype_maxrights;
 };
 
-template <typename index_t>
-struct Haplotype {
-    Haplotype() {
-        reset();
-    }
-    
-    void reset() {
-        left = right = 0;
-        alts.clear();
-    }
-    
-    bool operator< (const Haplotype& o) const {
-        if(left != o.left) return left < o.left;
-        if(right != o.right) return right < o.right;
-        return false;
-    }
-    
-    index_t left;
-    index_t right;
-    EList<index_t, 4> alts;
-};
 
 #endif /*ifndef ALT_H_*/
diff --git a/diff_sample.h b/diff_sample.h
index 92e24b4..dda8e09 100644
--- a/diff_sample.h
+++ b/diff_sample.h
@@ -172,8 +172,8 @@ void calcExhaustiveDC(T i, bool verbose = false, bool sanityCheck = false) {
 					assert_lt(d2, v);
 					assert_gt(d1, 0);
 					assert_gt(d2, 0);
-					if(!diffs[d1]) diffCnt++; diffs[d1] = true;
-					if(!diffs[d2]) diffCnt++; diffs[d2] = true;
+                    if(!diffs[d1]) { diffCnt++; diffs[d1] = true; }
+                    if(!diffs[d2]) { diffCnt++; diffs[d2] = true; }
 				}
 			}
 			// Do we observe all possible differences (except 0)
diff --git a/doc/manual.inc.html b/doc/manual.inc.html
index 59daf71..13b48dc 100644
--- a/doc/manual.inc.html
+++ b/doc/manual.inc.html
@@ -398,7 +398,7 @@
 <pre><code>--novel-splicesite-outfile <path></code></pre>
 </td><td>
 
-<p>In this mode, HISAT2 reports a list of splice sites in the file <path>:<br /> chromosome name <code><tab></code> genomic position of the flanking base on the left side of an intron <code><tab></code> genomic position of the flanking base on the right <code><tab></code> strand</p>
+<p>In this mode, HISAT2 reports a list of splice sites in the file <path>:<br /> chromosome name <code><tab></code> genomic position of the flanking base on the left side of an intron <code><tab></code> genomic position of the flanking base on the right <code><tab></code> strand (+, -, and .) '.' indicates an unknown strand for non-canonical splice sites.</p>
 </td></tr>
 
 <tr><td id="hisat2-options-novel-splicesite-infile">
@@ -455,6 +455,13 @@
 <p>Report alignments tailored specifically for Cufflinks. In addition to what HISAT2 does with the above option (--dta), With this option, HISAT2 looks for novel splice sites with three signals (GT/AG, GC/AG, AT/AC), but all user-provided splice sites are used irrespective of their signals. HISAT2 produces an optional field, XS:A:[+-], for every spliced alignment.</p>
 </td></tr>
 
+<tr><td id="hisat2-options-no-templatelen-adjustment">
+<pre><code>--no-templatelen-adjustment</code></pre>
+</td><td>
+
+<p>Disables template length adjustment for RNA-seq reads.</p>
+</td></tr>
+
 </table>
 
 <h4 id="reporting-options">Reporting options</h4>
@@ -473,7 +480,7 @@
 <pre><code>--max-seeds <int></code></pre>
 </td><td>
 
-<p>HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that will be extended. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for <code>--max-seeds</code> may improve alignment sensitivity, but HISAT2 is not designed with large values for <code>--max-seeds</code> in mind, and when aligning reads to long, repetitive gen [...]
+<p>HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that will be extended. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for <code>--max-seeds</code> may improve alignment sensitivity, but HISAT2 is not designed with large values for <code>--max-seeds</code> in mind, and when aligning reads to long, repetitive gen [...]
 </td></tr>
 
 <tr><td id="hisat2-options-secondary">
@@ -582,6 +589,23 @@
 
 <p>Print nothing besides alignments and serious errors.</p>
 </td></tr>
+
+<tr><td id="hisat2-summary-file">
+
+<pre><code>--summary-file</code></pre>
+</td><td>
+
+<p>Print alignment summary to this file.</p>
+</td></tr>
+
+<tr><td id="hisat2-new-summary">
+
+<pre><code>--new-summary</code></pre>
+</td><td>
+
+<p>Print alignment summary in a new style, which is more machine-friendly.</p>
+</td></tr>
+
 <tr><td id="hisat2-options-met-file">
 
 <pre><code>--met-file <path></code></pre>
diff --git a/ds.cpp b/ds.cpp
index b98eb95..35bdaac 100644
--- a/ds.cpp
+++ b/ds.cpp
@@ -19,7 +19,7 @@
 
 #include "ds.h"
 
-MemoryTally gMemTally;
+extern MemoryTally gMemTally;
 
 /**
  * Tally a memory allocation of size amt bytes.
diff --git a/fast_mutex.h b/fast_mutex.h
index 4d4b7cc..7334fb0 100755
--- a/fast_mutex.h
+++ b/fast_mutex.h
@@ -1,248 +1,294 @@
-/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
-Copyright (c) 2010-2012 Marcus Geelnard
-
-This software is provided 'as-is', without any express or implied
-warranty. In no event will the authors be held liable for any damages
-arising from the use of this software.
-
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it
-freely, subject to the following restrictions:
-
-    1. The origin of this software must not be misrepresented; you must not
-    claim that you wrote the original software. If you use this software
-    in a product, an acknowledgment in the product documentation would be
-    appreciated but is not required.
-
-    2. Altered source versions must be plainly marked as such, and must not be
-    misrepresented as being the original software.
-
-    3. This notice may not be removed or altered from any source
-    distribution.
-*/
-
-#ifndef _FAST_MUTEX_H_
-#define _FAST_MUTEX_H_
-
-/// @file
-
-// Which platform are we on?
-#if !defined(_TTHREAD_PLATFORM_DEFINED_)
-  #if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
-    #define _TTHREAD_WIN32_
-  #else
-    #define _TTHREAD_POSIX_
-  #endif
-  #define _TTHREAD_PLATFORM_DEFINED_
-#endif
-
-// Check if we can support the assembly language level implementation (otherwise
-// revert to the system API)
-#if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
-    (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || \
-    (defined(__GNUC__) && (defined(__ppc__)))
-  #define _FAST_MUTEX_ASM_
-#else
-  #define _FAST_MUTEX_SYS_
-#endif
-
-#if defined(_TTHREAD_WIN32_)
-  #ifndef WIN32_LEAN_AND_MEAN
-    #define WIN32_LEAN_AND_MEAN
-    #define __UNDEF_LEAN_AND_MEAN
-  #endif
-  #include <windows.h>
-  #ifdef __UNDEF_LEAN_AND_MEAN
-    #undef WIN32_LEAN_AND_MEAN
-    #undef __UNDEF_LEAN_AND_MEAN
-  #endif
-#else
-  #ifdef _FAST_MUTEX_ASM_
-    #include <sched.h>
-  #else
-    #include <pthread.h>
-  #endif
-#endif
-
-namespace tthread {
-
-/// Fast mutex class.
-/// This is a mutual exclusion object for synchronizing access to shared
-/// memory areas for several threads. It is similar to the tthread::mutex class,
-/// but instead of using system level functions, it is implemented as an atomic
-/// spin lock with very low CPU overhead.
-///
-/// The \c fast_mutex class is NOT compatible with the \c condition_variable
-/// class (however, it IS compatible with the \c lock_guard class). It should
-/// also be noted that the \c fast_mutex class typically does not provide
-/// as accurate thread scheduling as a the standard \c mutex class does.
-///
-/// Because of the limitations of the class, it should only be used in
-/// situations where the mutex needs to be locked/unlocked very frequently.
-///
-/// @note The "fast" version of this class relies on inline assembler language,
-/// which is currently only supported for 32/64-bit Intel x86/AMD64 and
-/// PowerPC architectures on a limited number of compilers (GNU g++ and MS
-/// Visual C++).
-/// For other architectures/compilers, system functions are used instead.
-class fast_mutex {
-  public:
-    /// Constructor.
-#if defined(_FAST_MUTEX_ASM_)
-    fast_mutex() : mLock(0) {}
-#else
-    fast_mutex()
-    {
-  #if defined(_TTHREAD_WIN32_)
-      InitializeCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_init(&mHandle, NULL);
-  #endif
-    }
-#endif
-
-#if !defined(_FAST_MUTEX_ASM_)
-    /// Destructor.
-    ~fast_mutex()
-    {
-  #if defined(_TTHREAD_WIN32_)
-      DeleteCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_destroy(&mHandle);
-  #endif
-    }
-#endif
-
-    /// Lock the mutex.
-    /// The method will block the calling thread until a lock on the mutex can
-    /// be obtained. The mutex remains locked until \c unlock() is called.
-    /// @see lock_guard
-    inline void lock()
-    {
-#if defined(_FAST_MUTEX_ASM_)
-      bool gotLock;
-      do {
-        gotLock = try_lock();
-        if(!gotLock)
-        {
-  #if defined(_TTHREAD_WIN32_)
-          Sleep(0);
-  #elif defined(_TTHREAD_POSIX_)
-          sched_yield();
-  #endif
-        }
-      } while(!gotLock);
-#else
-  #if defined(_TTHREAD_WIN32_)
-      EnterCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_lock(&mHandle);
-  #endif
-#endif
-    }
-
-    /// Try to lock the mutex.
-    /// The method will try to lock the mutex. If it fails, the function will
-    /// return immediately (non-blocking).
-    /// @return \c true if the lock was acquired, or \c false if the lock could
-    /// not be acquired.
-    inline bool try_lock()
-    {
-#if defined(_FAST_MUTEX_ASM_)
-      int oldLock;
-  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-      asm volatile (
-        "movl $1,%%eax\n\t"
-        "xchg %%eax,%0\n\t"
-        "movl %%eax,%1\n\t"
-        : "=m" (mLock), "=m" (oldLock)
-        :
-        : "%eax", "memory"
-      );
-  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-      int *ptrLock = &mLock;
-      __asm {
-        mov eax,1
-        mov ecx,ptrLock
-        xchg eax,[ecx]
-        mov oldLock,eax
-      }
-  #elif defined(__GNUC__) && (defined(__ppc__))
-      int newLock = 1;
-      asm volatile (
-        "\n1:\n\t"
-        "lwarx  %0,0,%1\n\t"
-        "cmpwi  0,%0,0\n\t"
-        "bne-   2f\n\t"
-        "stwcx. %2,0,%1\n\t"
-        "bne-   1b\n\t"
-        "isync\n"
-        "2:\n\t"
-        : "=&r" (oldLock)
-        : "r" (&mLock), "r" (newLock)
-        : "cr0", "memory"
-      );
-  #endif
-      return (oldLock == 0);
-#else
-  #if defined(_TTHREAD_WIN32_)
-      return TryEnterCriticalSection(&mHandle) ? true : false;
-  #elif defined(_TTHREAD_POSIX_)
-      return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
-  #endif
-#endif
-    }
-
-    /// Unlock the mutex.
-    /// If any threads are waiting for the lock on this mutex, one of them will
-    /// be unblocked.
-    inline void unlock()
-    {
-#if defined(_FAST_MUTEX_ASM_)
-  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-      asm volatile (
-        "movl $0,%%eax\n\t"
-        "xchg %%eax,%0\n\t"
-        : "=m" (mLock)
-        :
-        : "%eax", "memory"
-      );
-  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-      int *ptrLock = &mLock;
-      __asm {
-        mov eax,0
-        mov ecx,ptrLock
-        xchg eax,[ecx]
-      }
-  #elif defined(__GNUC__) && (defined(__ppc__))
-      asm volatile (
-        "sync\n\t"  // Replace with lwsync where possible?
-        : : : "memory"
-      );
-      mLock = 0;
-  #endif
-#else
-  #if defined(_TTHREAD_WIN32_)
-      LeaveCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_unlock(&mHandle);
-  #endif
-#endif
-    }
-
-  private:
-#if defined(_FAST_MUTEX_ASM_)
-    int mLock;
-#else
-  #if defined(_TTHREAD_WIN32_)
-    CRITICAL_SECTION mHandle;
-  #elif defined(_TTHREAD_POSIX_)
-    pthread_mutex_t mHandle;
-  #endif
-#endif
-};
-
-}
-
-#endif // _FAST_MUTEX_H_
-
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
+Copyright (c) 2010-2012 Marcus Geelnard
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+
+*****************************************************************************
+Copyright (c) 2016 Nigel Dyer
+This version has been modified from the original, whose original Copyright notice
+is reproduced above.  The permissions continue as above.
+
+The software has been modified in three ways:
+
+	a)  mHandle is now a mutable pointer variable which is passed to the
+	copy in the new operator=. This allows it to be pushed into a vector or equivalent.
+	It is not intended to allow multiple copies of a mutex to be created.
+
+	b)	The code for 64-bit MSC builds uses InterlockedExchange, which provides more
+	efficient access to the xchg assembler instruction than implementing the code as 
+	a function in a separate .asm file. MSC does not support inline assembler for 64-bit builds.
+
+	c) The NO_FAST_MUTEX_ASM option has been introduced
+
+*/
+
+#ifndef _FAST_MUTEX_H_
+#define _FAST_MUTEX_H_
+
+/// @file
+
+// Which platform are we on?
+#if !defined(_TTHREAD_PLATFORM_DEFINED_)
+  #if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
+    #define _TTHREAD_WIN32_
+  #else
+    #define _TTHREAD_POSIX_
+  #endif
+  #define _TTHREAD_PLATFORM_DEFINED_
+#endif
+
+// Check if we can support the assembly language level implementation (otherwise
+// revert to the system API)
+#if !defined NO_FAST_MUTEX_ASM && ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
+    (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || \
+    (defined(__GNUC__) && (defined(__ppc__))))
+  #define _FAST_MUTEX_ASM_
+#else
+  #define _FAST_MUTEX_SYS_
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+  #ifndef WIN32_LEAN_AND_MEAN
+    #define WIN32_LEAN_AND_MEAN
+    #define __UNDEF_LEAN_AND_MEAN
+  #endif
+  #include <windows.h>
+  #ifdef __UNDEF_LEAN_AND_MEAN
+    #undef WIN32_LEAN_AND_MEAN
+    #undef __UNDEF_LEAN_AND_MEAN
+  #endif
+#else
+  #ifdef _FAST_MUTEX_ASM_
+    #include <sched.h>
+  #else
+    #include <pthread.h>
+  #endif
+#endif
+
+namespace tthread {
+
+/// Fast mutex class.
+/// This is a mutual exclusion object for synchronizing access to shared
+/// memory areas for several threads. It is similar to the tthread::mutex class,
+/// but instead of using system level functions, it is implemented as an atomic
+/// spin lock with very low CPU overhead.
+///
+/// The \c fast_mutex class is NOT compatible with the \c condition_variable
+/// class (however, it IS compatible with the \c lock_guard class). It should
+/// also be noted that the \c fast_mutex class typically does not provide
+/// as accurate thread scheduling as the standard \c mutex class does.
+///
+/// Because of the limitations of the class, it should only be used in
+/// situations where the mutex needs to be locked/unlocked very frequently.
+///
+/// @note The "fast" version of this class relies on inline assembler language,
+/// which is currently only supported for 32/64-bit Intel x86/AMD64 and
+/// PowerPC architectures on a limited number of compilers (GNU g++ and MS
+/// Visual C++).
+/// For other architectures/compilers, system functions are used instead.
+class fast_mutex {
+  public:
+    /// Constructor. The ASM build zeroes the lock word; the system-API build heap-allocates and initializes the OS mutex.
+#if defined(_FAST_MUTEX_ASM_)
+    fast_mutex() : mLock(0) {}
+#else
+    fast_mutex()
+    {
+  #if defined(_TTHREAD_WIN32_)
+		mHandle = new CRITICAL_SECTION();
+      InitializeCriticalSection(mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+		mHandle = new pthread_mutex_t();
+      pthread_mutex_init(mHandle, NULL);
+  #endif
+    }
+#endif
+
+#if !defined(_FAST_MUTEX_ASM_)
+    /// Destructor (system-API build only). Destroys and frees the handle; a null handle (moved away by operator=) is skipped.
+    ~fast_mutex()
+    {
+		if (mHandle)
+		{
+ #if defined(_TTHREAD_WIN32_)
+			DeleteCriticalSection(mHandle);
+#elif defined(_TTHREAD_POSIX_)
+			pthread_mutex_destroy(mHandle);
+#endif
+			delete mHandle;
+			mHandle = 0;
+		}
+    }
+	/// Assignment transfers ownership: the handle moves from the source (fm) to the destination and fm.mHandle is nulled.
+	/// Used primarily when mutexes are pushed onto a list, either on their own or as a member variable of another class.
+	/// NOTE(review): the destination's previous handle is overwritten without being destroyed, and self-assignment nulls mHandle.
+	fast_mutex & operator = (const fast_mutex& fm)
+	{
+		mHandle = fm.mHandle;
+		fm.mHandle = 0;  // allowed on a const source because mHandle is declared mutable
+		return *this;
+	}
+#endif
+
+    /// Lock the mutex.
+    /// The method will block the calling thread until a lock on the mutex can
+    /// be obtained. The mutex remains locked until \c unlock() is called.
+    /// @see lock_guard
+    inline void lock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+      bool gotLock;
+      do {  // spin until try_lock() succeeds, yielding the CPU between attempts
+        gotLock = try_lock();
+        if(!gotLock)
+        {
+#if defined(_TTHREAD_WIN32_)
+			Sleep(0);
+#elif defined(_TTHREAD_POSIX_)
+			sched_yield();
+#endif
+		}
+      } while(!gotLock);
+#else
+  #if defined(_TTHREAD_WIN32_)
+      EnterCriticalSection(mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_lock(mHandle);
+  #endif
+#endif
+    }
+
+    /// Try to lock the mutex.
+    /// The method will try to lock the mutex. If it fails, the function will
+    /// return immediately (non-blocking).
+    /// @return \c true if the lock was acquired, or \c false if the lock could
+    /// not be acquired.
+    inline bool try_lock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+      int oldLock;  // receives the previous value of mLock; 0 means the lock was free and now belongs to the caller
+  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+      asm volatile (
+        "movl $1,%%eax\n\t"
+        "xchg %%eax,%0\n\t"
+        "movl %%eax,%1\n\t"
+        : "=m" (mLock), "=m" (oldLock)
+        :
+        : "%eax", "memory"
+      );
+  #elif defined(_MSC_VER)
+	#if defined(_M_IX86)
+	  int *ptrLock = &mLock;
+	  __asm {
+		  mov eax, 1
+		  mov ecx, ptrLock
+		  xchg eax, [ecx]
+		  mov oldLock, eax
+	  }
+     #elif  defined(_M_X64)
+	  oldLock = InterlockedExchange(&mLock, 1);  // x64 MSVC has no inline asm; the intrinsic performs the same exchange
+     #endif
+  #elif defined(__GNUC__) && (defined(__ppc__))
+      int newLock = 1;
+      asm volatile (
+        "\n1:\n\t"
+        "lwarx  %0,0,%1\n\t"
+        "cmpwi  0,%0,0\n\t"
+        "bne-   2f\n\t"
+        "stwcx. %2,0,%1\n\t"
+        "bne-   1b\n\t"
+        "isync\n"
+        "2:\n\t"
+        : "=&r" (oldLock)
+        : "r" (&mLock), "r" (newLock)
+        : "cr0", "memory"
+      );
+  #endif
+      return (oldLock == 0);
+#else
+  #if defined(_TTHREAD_WIN32_)
+      return TryEnterCriticalSection(mHandle) ? true : false;
+  #elif defined(_TTHREAD_POSIX_)
+      return (pthread_mutex_trylock(mHandle) == 0) ? true : false;
+  #endif
+#endif
+    }
+
+    /// Unlock the mutex.
+    /// If any threads are waiting for the lock on this mutex, one of them will
+    /// be unblocked.
+    inline void unlock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+      asm volatile (
+        "movl $0,%%eax\n\t"
+        "xchg %%eax,%0\n\t"
+        : "=m" (mLock)
+        :
+        : "%eax", "memory"
+      );
+  #elif defined(_MSC_VER)
+    #if defined(_M_IX86) 
+		int *ptrLock = &mLock;
+		__asm {
+		mov eax,0
+		mov ecx,ptrLock
+		xchg eax,[ecx]
+      }
+    #elif defined(_M_X64)
+		InterlockedExchange(&mLock, 0);
+    #endif
+  #elif defined(__GNUC__) && (defined(__ppc__))
+      asm volatile (
+        "sync\n\t"  // Replace with lwsync where possible?
+        : : : "memory"
+      );
+      mLock = 0;
+  #endif
+#else
+  #if defined(_TTHREAD_WIN32_)
+      LeaveCriticalSection(mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_unlock(mHandle);
+  #endif
+#endif
+    }
+
+  private:
+#if defined(_FAST_MUTEX_ASM_)
+#if defined(_M_X64) && defined (_MSC_VER)
+	  long mLock;  // long so that &mLock can be passed to InterlockedExchange
+#else
+	  int mLock;  // spin-lock word: 0 = unlocked, 1 = locked
+#endif
+#else
+  #if defined(_TTHREAD_WIN32_)
+    mutable CRITICAL_SECTION * mHandle;  // heap-allocated OS mutex; mutable so operator= can null it on a const source
+  #elif defined(_TTHREAD_POSIX_)
+    mutable pthread_mutex_t * mHandle;  // NOTE(review): no copy constructor is declared, so copy-construction would leave two owners of this pointer
+  #endif
+#endif
+};
+
+}
+
+#endif // _FAST_MUTEX_H_
+
diff --git a/gbwt_graph.h b/gbwt_graph.h
index fe1e305..f534df9 100644
--- a/gbwt_graph.h
+++ b/gbwt_graph.h
@@ -632,7 +632,7 @@ RefGraph<index_t>::RefGraph(const SString<char>& s,
         // Create nodes and edges for haplotypes
         for(index_t i = 0; i < haplotypes.size(); i++) {
             const Haplotype<index_t>& haplotype = haplotypes[i];
-            const EList<index_t, 4>& snpIDs = haplotype.alts;
+            const EList<index_t, 1>& snpIDs = haplotype.alts;
             assert_gt(snpIDs.size(), 0);
             assert_lt(haplotype.right, s.length());
             bool pass = true;
@@ -793,21 +793,21 @@ RefGraph<index_t>::RefGraph(const SString<char>& s,
         }
     }
     
-    // daehwan - for debugging purposes
-#if 0
-    cout << "num nodes: " << nodes.size() << endl;
-    for(index_t i = 0; i < nodes.size(); i++) {
-        const Node& n = nodes[i];
-        cout << i << "\t" << n.label << "\t" << n.value << endl;
-    }
-    
-    sort(edges.begin(), edges.end());
-    cout << "num edges: " << edges.size() << endl;
-    for(index_t i = 0; i < edges.size(); i++) {
-        const Edge& e = edges[i];
-        cout << i << "\t" << e.from << " --> " << e.to << endl;
+#ifndef NDEBUG
+    if(debug) {
+        cout << "num nodes: " << nodes.size() << endl;
+        for(index_t i = 0; i < nodes.size(); i++) {
+            const Node& n = nodes[i];
+            cout << i << "\t" << n.label << "\t" << n.value << endl;
+        }
+        
+        sort(edges.begin(), edges.end());
+        cout << "num edges: " << edges.size() << endl;
+        for(index_t i = 0; i < edges.size(); i++) {
+            const Edge& e = edges[i];
+            cout << i << "\t" << e.from << " --> " << e.to << endl;
+        }
     }
-    exit(1);
 #endif
 }
 
@@ -949,7 +949,7 @@ void RefGraph<index_t>::buildGraph_worker(void* vp) {
             const Haplotype<index_t>& haplotype = haplotypes[haplotype_idx];
             if(haplotype.left < curr_pos) continue;
             if(haplotype.right >= curr_pos + curr_len) break;
-            const EList<index_t, 4>& snpIDs = haplotype.alts;
+            const EList<index_t, 1>& snpIDs = haplotype.alts;
             assert_gt(snpIDs.size(), 0);
             bool pass = true;
             for(index_t s = 0; s < snpIDs.size(); s++) {
@@ -2290,7 +2290,7 @@ void PathGraph<index_t>::printInfo()
     if(verbose) {
         cerr << "Generation " << generation
         << " (" << temp_nodes << " -> " << nodes.size() << " nodes, "
-        << ranks << " ranks" << endl;
+        << ranks << " ranks)" << endl;
     }
 }
 
diff --git a/gfm.h b/gfm.h
index 1049099..2a3385e 100644
--- a/gfm.h
+++ b/gfm.h
@@ -544,7 +544,11 @@ struct USE_POPCNT_GENERIC {
 struct USE_POPCNT_INSTRUCTION {
     inline static int pop64(uint64_t x) {
         int64_t count;
+#ifdef USING_MSC_COMPILER
+		count = __popcnt64(x);
+#else
         asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x));
+#endif
         return (int)count;
     }
 };
@@ -573,10 +577,13 @@ inline static int countInU64(int c, uint64_t dw) {
 
 #ifdef POPCNT_CAPABILITY   // wrapping of "struct"
 struct USE_POPCNT_GENERIC_BITS {
+	// Use this standard bit-bashing population count
+	inline static uint64_t pop64(uint64_t x) {
+#else
+// Use this standard bit-bashing population count
+	inline static uint64_t pop6464(uint64_t x) {
 #endif
-    // Use this standard bit-bashing population count
-    inline static int pop64(uint64_t x) {
-        x -= (x >> 1) & 0x5555555555555555ULL;
+		x -= (x >> 1) & 0x5555555555555555ULL;
         x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
         x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
         return int((x * 0x0101010101010101ULL) >> 56);
@@ -596,7 +603,7 @@ inline static int countInU64_bits(uint64_t dw) {
 #ifdef POPCNT_CAPABILITY
     uint64_t tmp = Operation().pop64(dw);
 #else
-    uint64_t tmp = pop64(dw);
+    uint64_t tmp = pop6464(dw);
 #endif
     return (int) tmp;
 }
@@ -641,7 +648,7 @@ public:
 	    mmFile2_(NULL), \
         _nthreads(1)
 
-	/// Construct an Ebwt from the given input file
+	/// Construct a GFM from the given input file
 	GFM(const string& in,
         ALTDB<index_t>* altdb,
         int needEntireReverse,
@@ -660,6 +667,7 @@ public:
         bool startVerbose, // = false,
         bool passMemExc, // = false,
         bool sanityCheck, // = false)
+        bool useHaplotype, // = false
         bool skipLoading = false) :
         GFM_INITS
 	{
@@ -699,6 +707,7 @@ public:
 
         // Read ALTs
         EList<ALT<index_t> >& alts = altdb->alts();
+        EList<Haplotype<index_t> >& haplotypes = altdb->haplotypes();
         EList<string>& altnames = altdb->altnames();
         alts.clear(); altnames.clear();
         string in7Str = in + ".7." + gfm_ext;
@@ -710,23 +719,51 @@ public:
             cerr << "Could not open index file " << in7Str.c_str() << endl;
         }
         
+        EList<index_t> to_alti;
+        index_t to_alti_far = 0;
         readI32(in7, this->toBe());
         index_t numAlts = readIndex<index_t>(in7, this->toBe());
         if(numAlts > 0) {
+            alts.resizeExact(numAlts); alts.clear();
+            to_alti.resizeExact(numAlts); to_alti.clear();
             while(!in7.eof()) {
                 alts.expand();
                 alts.back().read(in7, this->toBe());
+                to_alti.push_back(to_alti_far);
+                to_alti_far++;
                 if(!loadSpliceSites) {
                     if(alts.back().splicesite()) {
                         alts.pop_back();
                         assert_gt(numAlts, 0);
                         numAlts--;
+                        to_alti.back() = std::numeric_limits<index_t>::max();
+                        to_alti_far--;
                     }
                 }
                 if(alts.size() == numAlts) break;
             }
         }
         assert_eq(alts.size(), numAlts);
+        assert_eq(to_alti_far, numAlts);
+        if(useHaplotype) {
+            // Older index files may end right after the ALTs, so read haplotypes only if data remains (backward compatibility)
+            if(in7.peek() != std::ifstream::traits_type::eof()) {
+                index_t numHaplotypes = readIndex<index_t>(in7, this->toBe());
+                if(numHaplotypes > 0) {
+                    haplotypes.resizeExact(numHaplotypes);
+                    haplotypes.clear();
+                    while(!in7.eof()) {
+                        haplotypes.expand();
+                        haplotypes.back().read(in7, this->toBe());
+                        Haplotype<index_t>& ht = haplotypes.back();
+                        for(index_t h = 0; h < ht.alts.size(); h++) {
+                            ht.alts[h] = to_alti[ht.alts[h]];
+                        }
+                        if(haplotypes.size() == numHaplotypes) break;
+                    }
+                }
+            }
+        }
         
         if(verbose || startVerbose) cerr << "Opening \"" << in8Str.c_str() << "\"" << endl;
         ifstream in8(in8Str.c_str(), ios::binary);
@@ -780,6 +817,25 @@ public:
             for(size_t i = 0; i < alts.size(); i++) {
                 alts[i] = buf[i].first;
                 altnames[i] = buf2[buf[i].second];
+                if(buf[i].second < numAlts) {
+                    to_alti[buf[i].second] = i;
+                }
+            }
+        }
+        
+        if(useHaplotype) {
+            EList<index_t>& haplotype_maxrights = altdb->haplotype_maxrights();
+            haplotype_maxrights.resizeExact(haplotypes.size());
+            for(index_t h = 0; h < haplotypes.size(); h++) {
+                Haplotype<index_t>& ht = haplotypes[h];
+                for(index_t h2 = 0; h2 < ht.alts.size(); h2++) {
+                    ht.alts[h2] = to_alti[ht.alts[h2]];
+                }
+                if(h == 0) {
+                    haplotype_maxrights[h] = ht.right;
+                } else {
+                    haplotype_maxrights[h] = std::max<index_t>(haplotype_maxrights[h - 1], ht.right);
+                }
             }
         }
         
@@ -1326,7 +1382,7 @@ public:
                             assert_lt(bp, 4);
                             if((int)bp == s[pos]) {
                                 cerr << "Warning: single type should have a different base than " << "ACGTN"[(int)s[pos]]
-                                << " (" << snp_id << ") at " << genome_pos << " on " << chr << endl;
+                                     << " (" << snp_id << ") at " << genome_pos << " on " << chr << endl;
                                 _alts.pop_back();
                                 continue;
                                 // throw 1;
@@ -1696,7 +1752,7 @@ public:
                         buf3[before] = (index_t)i;
                     }
                     for(size_t h = 0; h < _haplotypes.size(); h++) {
-                        EList<index_t, 4>& alts = _haplotypes[h].alts;
+                        EList<index_t, 1>& alts = _haplotypes[h].alts;
                         for(size_t a = 0; a < alts.size(); a++) {
                             index_t before = alts[a];
                             assert_lt(before, buf3.size());
@@ -1729,6 +1785,11 @@ public:
                     fout8 << _altnames[i] << endl;
                 }
                 
+                writeIndex<index_t>(fout7, (index_t)_haplotypes.size(), this->toBe());
+                for(index_t i = 0; i < _haplotypes.size(); i++) {
+                    _haplotypes[i].write(fout7, this->toBe());
+                }
+                
                 fout7.close();
                 fout8.close();
             }
@@ -5117,7 +5178,7 @@ index_t GFM<index_t>::walkLeft(index_t row, index_t steps) const {
         for(index_t i = 0; i < _zOffs.size(); i++) {
             if(row == _zOffs[i]) return (index_t)INDEX_MAX;
         }
-        pair<index_t, index_t> range = this->mapGLF1(row, l, NULL ASSERT_ONLY(, false));
+        pair<index_t, index_t> range = this->mapGLF1(row, l, (pair<index_t, index_t> *)NULL ASSERT_ONLY(, false));
         index_t newrow = range.first;
 		assert_neq((index_t)INDEX_MAX, newrow);
 		assert_neq(newrow, row);
diff --git a/gp.h b/gp.h
index 35f8f44..02744fa 100644
--- a/gp.h
+++ b/gp.h
@@ -37,9 +37,15 @@ public:
     
     GraphPolicy() { reset(); }
     
-    GraphPolicy(size_t maxAltsTried)
+    GraphPolicy(size_t maxAltsTried,
+                bool useHaplotype,
+                bool haplotypeOnly,
+                bool enableCODIS)
     {
-        init(maxAltsTried);
+        init(maxAltsTried,
+             useHaplotype,
+             haplotypeOnly,
+             enableCODIS);
     }
     
     /**
@@ -50,16 +56,28 @@ public:
     
     /**
      */
-    void init(size_t maxAltsTried)
+    void init(size_t maxAltsTried,
+              bool useHaplotype = false,
+              bool haplotypeOnly = false,
+              bool enableCODIS = false)
     {
         maxAltsTried_ = maxAltsTried;
+        useHaplotype_ = useHaplotype;
+        haplotypeOnly_ = haplotypeOnly;
+        enableCODIS_ = enableCODIS;
     }
     
     size_t maxAltsTried() const { return maxAltsTried_; }
+    bool   useHaplotype() const { return useHaplotype_; }
+    bool   haplotypeOnly() const { return haplotypeOnly_; }
+    bool   enableCODIS() const { return enableCODIS_; }
     
     
 private:
     size_t maxAltsTried_;
+    bool   useHaplotype_;
+    bool   haplotypeOnly_;
+    bool   enableCODIS_;
 };
 
 #endif /*ndef GP_H_*/
diff --git a/hgfm.h b/hgfm.h
index 8bf4593..735d840 100644
--- a/hgfm.h
+++ b/hgfm.h
@@ -65,7 +65,8 @@ public:
              bool verbose, // = false,
              bool startVerbose, // = false,
              bool passMemExc, // = false,
-             bool sanityCheck) : // = false) :
+             bool sanityCheck, // = false)
+             bool useHaplotype) : // = false
 	GFM<index_t>(in,
                  altdb,
                  needEntireReverse,
@@ -84,6 +85,7 @@ public:
                  startVerbose,
                  passMemExc,
                  sanityCheck,
+                 useHaplotype,
                  true)
 	{
 		this->_in1Str = in + ".5." + gfm_ext;
@@ -1551,7 +1553,7 @@ template <typename index_t = uint32_t, typename local_index_t = uint16_t>
 class HGFM : public GFM<index_t> {
 	typedef GFM<index_t> PARENT_CLASS;
 public:
-	/// Construct an Ebwt from the given input file
+	/// Construct an HGFM from the given input file
 	HGFM(const string& in,
          ALTDB<index_t>* altdb,
          int needEntireReverse,
@@ -1570,6 +1572,7 @@ public:
          bool startVerbose, // = false,
          bool passMemExc, // = false,
          bool sanityCheck, // = false
+         bool useHaplotype, // = false
          bool skipLoading = false) :
     GFM<index_t>(in,
                  altdb,
@@ -1589,6 +1592,7 @@ public:
                  startVerbose,
                  passMemExc,
                  sanityCheck,
+                 useHaplotype,
                  skipLoading),
     _in5(NULL),
     _in6(NULL)
@@ -1597,7 +1601,7 @@ public:
         _in6Str = in + ".6." + gfm_ext;
     }
 	
-	/// Construct an Ebwt from the given header parameters and string
+	/// Construct an HGFM from the given header parameters and string
 	/// vector, optionally using a blockwise suffix sorter with the
 	/// given 'bmax' and 'dcv' parameters.  The string vector is
 	/// ultimately joined and the joined string is passed to buildToDisk().
@@ -1897,7 +1901,7 @@ void HGFM<index_t, local_index_t>::gbwt_worker(void* vp)
     }
 }
     
-/// Construct an Ebwt from the given header parameters and string
+/// Construct an HGFM from the given header parameters and string
 /// vector, optionally using a blockwise suffix sorter with the
 /// given 'bmax' and 'dcv' parameters.  The string vector is
 /// ultimately joined and the joined string is passed to buildToDisk().
@@ -2595,7 +2599,8 @@ void HGFM<index_t, local_index_t>::readIntoMemory(
                                                                                           false,  // _verbose
                                                                                           false,
                                                                                           this->_passMemExc,
-                                                                                          this->_sanity);
+                                                                                          this->_sanity,
+                                                                                          false); // use haplotypes?
         
 		if(tidx >= _localGFMs.size()) {
 			assert_eq(tidx, _localGFMs.size());
diff --git a/hi_aligner.h b/hi_aligner.h
index 7183e17..53aa66f 100644
--- a/hi_aligner.h
+++ b/hi_aligner.h
@@ -403,6 +403,8 @@ struct SharedTempVars {
     EList<SStringExpandable<char> > raw_refbufs;
     EList<Edit>                     alt_edits;
     ELList<Edit, 128, 4>            candidate_edits;
+    ELList<pair<index_t, index_t> > ht_llist;
+    Haplotype<index_t>              cmp_ht;
     
     ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
     
@@ -413,6 +415,7 @@ struct SharedTempVars {
     ASSERT_ONLY(EList<index_t> refoffs);
     
     LinkedEList<EList<Edit> > raw_edits;
+    LinkedEList<EList<pair<index_t, index_t> > > raw_ht_lists;
 };
 
 /**
@@ -432,18 +435,22 @@ struct GenomeHit {
     _toff((index_t)INDEX_MAX),
     _joinedOff((index_t)INDEX_MAX),
     _edits(NULL),
+    _ht_list(NULL),
     _score(MIN_I64),
     _localscore(MIN_I64),
     _hitcount(1),
     _edits_node(NULL),
+    _ht_list_node(NULL),
     _sharedVars(NULL)
     {
     }
     
     GenomeHit(const GenomeHit& otherHit) :
     _edits(NULL),
+    _ht_list(NULL),
     _hitcount(1),
     _edits_node(NULL),
+    _ht_list_node(NULL),
     _sharedVars(NULL)
     {
         init(otherHit._fw,
@@ -456,6 +463,7 @@ struct GenomeHit {
              otherHit._joinedOff,
              *(otherHit._sharedVars),
              otherHit._edits,
+             otherHit._ht_list,
              otherHit._score,
              otherHit._localscore,
              otherHit._splicescore);
@@ -473,6 +481,7 @@ struct GenomeHit {
              otherHit._joinedOff,
              *(otherHit._sharedVars),
              otherHit._edits,
+             otherHit._ht_list,
              otherHit._score,
              otherHit._localscore,
              otherHit._splicescore);
@@ -487,8 +496,15 @@ struct GenomeHit {
             _sharedVars->raw_edits.delete_node(_edits_node);
             _edits = NULL;
             _edits_node = NULL;
-            _sharedVars = NULL;
         }
+        if(_ht_list_node != NULL) {
+            assert(_ht_list != NULL);
+            assert(_sharedVars != NULL);
+            _sharedVars->raw_ht_lists.delete_node(_ht_list_node);
+            _ht_list = NULL;
+            _ht_list_node = NULL;
+        }
+        _sharedVars = NULL;
     }
 	
 	void init(
@@ -502,6 +518,7 @@ struct GenomeHit {
               index_t                   joinedOff,
               SharedTempVars<index_t>&  sharedVars,
               EList<Edit>*              edits = NULL,
+              EList<pair<index_t, index_t> >* ht_list = NULL,
               int64_t                   score = 0,
               int64_t                   localscore = 0,
               double                    splicescore = 0.0)
@@ -528,8 +545,18 @@ struct GenomeHit {
         }
         assert(_edits != NULL);
         _edits->clear();
-        
         if(edits != NULL) *_edits = *edits;
+        
+        if(_ht_list == NULL) {
+            assert(_ht_list_node == NULL);
+            _ht_list_node = _sharedVars->raw_ht_lists.new_node();
+            assert(_ht_list_node != NULL);
+            _ht_list = &(_ht_list_node->payload);
+        }
+        assert(_ht_list != NULL);
+        _ht_list->clear();
+        if(ht_list != NULL) *_ht_list = *ht_list;
+        
         _hitcount = 1;
 	}
     
@@ -574,26 +601,23 @@ struct GenomeHit {
      * Extend the partial alignment (GenomeHit) bidirectionally
      */
     bool extend(
-                const Read&             rd,
-                const GFM<index_t>&     gfm,
-                const BitPairReference& ref,
-                const ALTDB<index_t>&   altdb,
-                SpliceSiteDB&           ssdb,
-                SwAligner&              swa,
-                SwMetrics&              swm,
-                PerReadMetrics&         prm,
-                const Scoring&          sc,
-                TAlScore                minsc,
-                RandomSource&           rnd,           // pseudo-random source
-                index_t                 minK_local,
-                index_t                 minIntronLen,
-                index_t                 maxIntronLen,
-                index_t                 minAnchorLen,
-                index_t                 minAnchorLen_noncan,
-                const index_t           maxAltsTried,
-                index_t&                leftext,
-                index_t&                rightext,
-                index_t                 mm = 0);
+                const Read&                rd,
+                const GFM<index_t>&        gfm,
+                const BitPairReference&    ref,
+                const ALTDB<index_t>&      altdb,
+                SpliceSiteDB&              ssdb,
+                SwAligner&                 swa,
+                SwMetrics&                 swm,
+                PerReadMetrics&            prm,
+                const Scoring&             sc,
+                TAlScore                   minsc,
+                RandomSource&              rnd,           // pseudo-random source
+                index_t                    minK_local,
+                const TranscriptomePolicy& tpol,
+                const GraphPolicy&         gpol,
+                index_t&                   leftext,
+                index_t&                   rightext,
+                index_t                    mm = 0);
     
     /**
      * Adjust alignment with respect to SNPs, usually updating Edits
@@ -609,7 +633,7 @@ struct GenomeHit {
                               const GFM<index_t>&         gfm,
                               const ALTDB<index_t>&       altdb,
                               const BitPairReference&     ref,
-                              const index_t               maxAltsTried);
+                              const GraphPolicy&          gpol);
     
     /**
      * Adjust alignment with respect to SNPs, usually updating Edits
@@ -620,7 +644,7 @@ struct GenomeHit {
                        const GFM<index_t>&     gfm,
                        const ALTDB<index_t>&   altdb,
                        const BitPairReference& ref,
-                       const index_t           maxAltsTried);
+                       const GraphPolicy&      gpol);
    
     /*
      *
@@ -646,23 +670,28 @@ struct GenomeHit {
      *
      */
     static index_t alignWithALTs(
-                                 const EList<ALT<index_t> >& alts,
-                                 index_t                     joinedOff,
-                                 const BTDnaString&          rdseq,
-                                 index_t                     base_rdoff,
-                                 index_t                     rdoff,
-                                 index_t                     rdlen,
-                                 const BitPairReference&     ref,
-                                 SharedTempVars<index_t>&    sharedVar,
-                                 index_t                     tidx,
-                                 int                         rfoff,
-                                 index_t                     rflen,
-                                 bool                        left,
-                                 const index_t               max_numALTsTried,
-                                 EList<Edit>&                edits,
-                                 ELList<Edit, 128, 4>*       candidate_edits = NULL,
-                                 index_t                     mm = 0,
-                                 index_t*                    numNs = NULL)
+                                 const EList<ALT<index_t> >&       alts,
+                                 const EList<Haplotype<index_t> >& haplotypes,
+                                 const EList<index_t>&             haplotype_maxrights,
+                                 index_t                           joinedOff,
+                                 const BTDnaString&                rdseq,
+                                 index_t                           base_rdoff,
+                                 index_t                           rdoff,
+                                 index_t                           rdlen,
+                                 const BitPairReference&           ref,
+                                 SharedTempVars<index_t>&          sharedVar,
+                                 index_t                           tidx,
+                                 int                               rfoff,
+                                 index_t                           rflen,
+                                 bool                              left,
+                                 const GraphPolicy&                gpol,
+                                 EList<Edit>&                      edits,
+                                 ELList<pair<index_t, index_t> >&  ht_llist,
+                                 EList<pair<index_t, index_t> >&   ht_list,
+                                 Haplotype<index_t>&               cmp_ht,
+                                 ELList<Edit, 128, 4>*             candidate_edits = NULL,
+                                 index_t                           mm = 0,
+                                 index_t*                          numNs = NULL)
     {
         int best_rdoff = (int)rdoff;
         if(numNs != NULL) *numNs = 0;
@@ -671,8 +700,13 @@ struct GenomeHit {
         alt_edits = edits;
         index_t nedits = (index_t)edits.size();
         if(candidate_edits != NULL) candidate_edits->clear();
+        ht_llist.clear();
+        // ht_llist.expand();
+        // ht_llist[0] = ht_list;
         alignWithALTs_recur(
                             alts,
+                            haplotypes,
+                            haplotype_maxrights,
                             joinedOff,
                             rdseq,
                             rdoff - base_rdoff,
@@ -690,11 +724,13 @@ struct GenomeHit {
                             left,
                             edits,
                             mm,
+                            ht_llist,
+                            cmp_ht,
                             candidate_edits,
                             0, /* tmp_numNs */
                             numNs,
                             0,    /* dep */
-                            max_numALTsTried,
+                            gpol,
                             numALTsTried);
         index_t extlen = 0;
         if(left) {
@@ -740,31 +776,35 @@ struct GenomeHit {
      *
      */
     static index_t alignWithALTs_recur(
-                                       const EList<ALT<index_t> >& alts,
-                                       index_t                     joinedOff,
-                                       const BTDnaString&          rdseq,
-                                       index_t                     rdoff_add,
-                                       index_t                     rdoff,
-                                       index_t                     rdlen,
-                                       const BitPairReference&     ref,
-                                       EList<SStringExpandable<char> >& raw_refbufs,
+                                       const EList<ALT<index_t> >&       alts,
+                                       const EList<Haplotype<index_t> >& haplotypes,
+                                       const EList<index_t>&             haplotype_maxrights,
+                                       index_t                           joinedOff,
+                                       const BTDnaString&                rdseq,
+                                       index_t                           rdoff_add,
+                                       index_t                           rdoff,
+                                       index_t                           rdlen,
+                                       const BitPairReference&           ref,
+                                       EList<SStringExpandable<char> >&  raw_refbufs,
                                        ASSERT_ONLY(SStringExpandable<uint32_t> destU32,)
-                                       EList<Edit>&                tmp_edits,
-                                       int&                        best_rdoff,
-                                       const char*                 rfseq,
-                                       index_t                     tidx,
-                                       int                         rfoff,
-                                       index_t                     rflen,
-                                       bool                        left,
-                                       EList<Edit>&                edits,
-                                       index_t                     mm,
-                                       ELList<Edit, 128, 4>*       candidate_edits,
-                                       index_t                     tmp_numNs,
-                                       index_t*                    numNs,
-                                       index_t                     dep,
-                                       const index_t               max_numALTsTried,
-                                       index_t&                    numALTsTried,
-                                       ALT_TYPE                    prev_alt_type = ALT_NONE);
+                                       EList<Edit>&                      tmp_edits,
+                                       int&                              best_rdoff,
+                                       const char*                       rfseq,
+                                       index_t                           tidx,
+                                       int                               rfoff,
+                                       index_t                           rflen,
+                                       bool                              left,
+                                       EList<Edit>&                      edits,
+                                       index_t                           mm,
+                                       ELList<pair<index_t, index_t> >&  ht_llist,
+                                       Haplotype<index_t>&               cmp_ht,
+                                       ELList<Edit, 128, 4>*             candidate_edits,
+                                       index_t                           tmp_numNs,
+                                       index_t*                          numNs,
+                                       index_t                           dep,
+                                       const GraphPolicy&                gpol,
+                                       index_t&                          numALTsTried,
+                                       ALT_TYPE                          prev_alt_type = ALT_NONE);
     
     /**
      * For alignment involving indel, move the indels
@@ -1182,6 +1222,7 @@ public:
     index_t         _toff;
     index_t         _joinedOff;
 	EList<Edit>*    _edits;
+    EList<pair<index_t, index_t> >* _ht_list;
     int64_t         _score;
     int64_t         _localscore;
     double          _splicescore;
@@ -1189,6 +1230,7 @@ public:
     index_t         _hitcount;  // for selection purposes
     
     LinkedEListNode<EList<Edit> >*  _edits_node;
+    LinkedEListNode<EList<pair<index_t, index_t> > >*  _ht_list_node;
     SharedTempVars<index_t>* _sharedVars;
 };
 
@@ -1294,7 +1336,6 @@ bool GenomeHit<index_t>::combineWith(
     const index_t reflen = ref.approxLen(_tidx);
     if(this_toff + len > reflen) return false;
     assert_leq(this_toff + len, reflen);
-    assert_geq(other_toff + other_len, len);
     
     // check if an indel or an intron is necessary
     index_t refdif = other_toff - this_toff;
@@ -1737,7 +1778,22 @@ bool GenomeHit<index_t>::combineWith(
             assert_geq(this_rdoff, this->_rdoff);
             index_t addoff = this_rdoff - this->_rdoff;
             if(rdc != rfc) {
-                Edit e((uint32_t)(i + addoff), rfc, rdc, EDIT_TYPE_MM, false);
+                ALT<index_t> cmp_alt;
+                assert_geq(this_toff, this->_toff);
+                cmp_alt.pos = this->_joinedOff + i + (this_toff - this->_toff);
+                index_t alt_i = (index_t)altdb.alts().bsearchLoBound(cmp_alt);
+                index_t add_alt_i = std::numeric_limits<index_t>::max();
+                for(; alt_i < altdb.alts().size(); alt_i++) {
+                    const ALT<index_t>& alt = altdb.alts()[alt_i];
+                    if(alt.left > cmp_alt.pos) break;
+                    if(alt.type != ALT_SNP_SGL) continue;
+                    if(alt.seq == rdc) {
+                        add_alt_i = alt_i;
+                        break;
+                    }
+                }
+                
+                Edit e((uint32_t)(i + addoff), rfc, rdc, EDIT_TYPE_MM, false, add_alt_i);
                 _edits->push_back(e);
             }
             if(i == maxscorei) {
@@ -1834,26 +1890,23 @@ bool GenomeHit<index_t>::combineWith(
  */
 template <typename index_t>
 bool GenomeHit<index_t>::extend(
-                                const Read&             rd,
-                                const GFM<index_t>&     gfm,
-                                const BitPairReference& ref,
-                                const ALTDB<index_t>&   altdb,
-                                SpliceSiteDB&           ssdb,
-                                SwAligner&              swa,
-                                SwMetrics&              swm,
-                                PerReadMetrics&         prm,
-                                const Scoring&          sc,
-                                TAlScore                minsc,
-                                RandomSource&           rnd,           // pseudo-random source
-                                index_t                 minK_local,
-                                index_t                 minIntronLen,
-                                index_t                 maxIntronLen,
-                                index_t                 minAnchorLen,
-                                index_t                 minAnchorLen_noncan,
-                                const index_t           maxAltsTried,
-                                index_t&                leftext,
-                                index_t&                rightext,
-                                index_t                 mm)
+                                const Read&                rd,
+                                const GFM<index_t>&        gfm,
+                                const BitPairReference&    ref,
+                                const ALTDB<index_t>&      altdb,
+                                SpliceSiteDB&              ssdb,
+                                SwAligner&                 swa,
+                                SwMetrics&                 swm,
+                                PerReadMetrics&            prm,
+                                const Scoring&             sc,
+                                TAlScore                   minsc,
+                                RandomSource&              rnd,           // pseudo-random source
+                                index_t                    minK_local,
+                                const TranscriptomePolicy& tpol,
+                                const GraphPolicy&         gpol,
+                                index_t&                   leftext,
+                                index_t&                   rightext,
+                                index_t                    mm)
 {
     assert_lt(this->_tidx, ref.numRefs());
     index_t max_leftext = leftext, max_rightext = rightext;
@@ -1863,6 +1916,11 @@ bool GenomeHit<index_t>::extend(
     bool doLeftAlign = false;
     assert(_sharedVars != NULL);
     
+    const index_t minIntronLen = tpol.minIntronLen();
+    const index_t maxIntronLen = tpol.maxIntronLen();
+    const index_t minAnchorLen = tpol.minAnchorLen();
+    const index_t minAnchorLen_noncan = tpol.minAnchorLen_noncan();
+    
     // extend the alignment further in the left direction
     // with 'mm' mismatches allowed
     const BTDnaString& seq = _fw ? rd.patFw : rd.patRc;
@@ -1885,6 +1943,8 @@ bool GenomeHit<index_t>::extend(
         index_t num_prev_edits = (index_t)_edits->size();
         index_t best_ext = alignWithALTs(
                                          altdb.alts(),
+                                         altdb.haplotypes(),
+                                         altdb.haplotype_maxrights(),
                                          this->_joinedOff,
                                          seq,
                                          this->_rdoff - 1,
@@ -1896,8 +1956,11 @@ bool GenomeHit<index_t>::extend(
                                          rl,
                                          reflen,
                                          true, /* left? */
-                                         maxAltsTried,
+                                         gpol,
                                          *this->_edits,
+                                         _sharedVars->ht_llist,
+                                         *this->_ht_list,
+                                         _sharedVars->cmp_ht,
                                          NULL,
                                          mm,
                                          &numNs);
@@ -1961,6 +2024,8 @@ bool GenomeHit<index_t>::extend(
             }
             index_t best_ext = alignWithALTs(
                                              altdb.alts(),
+                                             altdb.haplotypes(),
+                                             altdb.haplotype_maxrights(),
                                              this->_joinedOff + ref_ext,
                                              seq,
                                              this->_rdoff,
@@ -1972,8 +2037,11 @@ bool GenomeHit<index_t>::extend(
                                              (int)rl,
                                              reflen,
                                              false,
-                                             maxAltsTried,
+                                             gpol,
                                              *this->_edits,
+                                             _sharedVars->ht_llist,
+                                             *this->_ht_list,
+                                             _sharedVars->cmp_ht,
                                              NULL,
                                              mm);
             // Do not allow for any edits including known snps and splice sites when extending zero-length hit
@@ -2036,7 +2104,7 @@ bool GenomeHit<index_t>::adjustWithALT(
                                        const GFM<index_t>&         gfm,
                                        const ALTDB<index_t>&       altdb,
                                        const BitPairReference&     ref,
-                                       const index_t               maxAltsTried)
+                                       const GraphPolicy&          gpol)
 {
     if(gfm.gh().linearFM()) {
         genomeHits.expand();
@@ -2095,7 +2163,7 @@ bool GenomeHit<index_t>::adjustWithALT(
         bool found2 = false;
         // maxAltsTried is not directly related to the size of offDiffs,
         // but let's make the size of offDiffs is determined by maxAltsTried
-        const index_t max_offDiffs_size = max<index_t>(4, maxAltsTried / 4);
+        const index_t max_offDiffs_size = max<index_t>(4, gpol.maxAltsTried() / 4);
         if(offDiffs.size() > max_offDiffs_size) offDiffs.resize(max_offDiffs_size);
         for(index_t o = 0; o < offDiffs.size() && !found2; o++) {
             const pair<index_t, int>& offDiff = offDiffs[o];
@@ -2125,6 +2193,8 @@ bool GenomeHit<index_t>::adjustWithALT(
             index_t reflen = genomeHit._len + 10;
             index_t alignedLen = alignWithALTs(
                                                alts,
+                                               altdb.haplotypes(),
+                                               altdb.haplotype_maxrights(),
                                                genomeHit._joinedOff,
                                                seq,
                                                genomeHit._rdoff,
@@ -2136,8 +2206,11 @@ bool GenomeHit<index_t>::adjustWithALT(
                                                (int)genomeHit._toff,
                                                reflen,
                                                false, /* left? */
-                                               maxAltsTried,
+                                               gpol,
                                                *genomeHit._edits,
+                                               sharedVars.ht_llist,
+                                               *genomeHit._ht_list,
+                                               sharedVars.cmp_ht,
                                                &candidate_edits);
             if(alignedLen == genomeHit._len) {
                 found2 = true;
@@ -2181,7 +2254,7 @@ bool GenomeHit<index_t>::adjustWithALT(
                                        const GFM<index_t>&     gfm,
                                        const ALTDB<index_t>&   altdb,
                                        const BitPairReference& ref,
-                                       const index_t           maxAltsTried)
+                                       const GraphPolicy&      gpol)
 {
     if(gfm.gh().linearFM()) return true;
     assert_lt(this->_tidx, ref.numRefs());
@@ -2199,7 +2272,7 @@ bool GenomeHit<index_t>::adjustWithALT(
     bool found = false;
     // maxAltsTried is not directly related to the size of offDiffs,
     // but let's make the size of offDiffs is determined by maxAltsTried
-    const index_t max_offDiffs_size = max<index_t>(4, maxAltsTried / 4);
+    const index_t max_offDiffs_size = max<index_t>(4, gpol.maxAltsTried() / 4);
     if(offDiffs.size() > max_offDiffs_size) offDiffs.resize(max_offDiffs_size);
     for(index_t o = 0; o < offDiffs.size() && !found; o++) {
         const pair<index_t, int>& offDiff = offDiffs[o];
@@ -2226,6 +2299,8 @@ bool GenomeHit<index_t>::adjustWithALT(
         index_t reflen = this->_len + 10;
         index_t alignedLen = alignWithALTs(
                                            alts,
+                                           altdb.haplotypes(),
+                                           altdb.haplotype_maxrights(),
                                            this->_joinedOff,
                                            seq,
                                            this->_rdoff,
@@ -2237,8 +2312,11 @@ bool GenomeHit<index_t>::adjustWithALT(
                                            (int)this->_toff,
                                            reflen,
                                            false, /* left? */
-                                           maxAltsTried,
+                                           gpol,
                                            *this->_edits,
+                                           _sharedVars->ht_llist,
+                                           *this->_ht_list,
+                                           _sharedVars->cmp_ht,
                                            &_sharedVars->candidate_edits);
         if(alignedLen == this->_len) {
             found = true;
@@ -2386,42 +2464,171 @@ void GenomeHit<index_t>::findOffDiffs(
     }
 }
 
+
+/* Append to ht_list the haplotypes that an extension starting at cmp_ht can reach within rdlen.
+ * Each entry is (index into haplotypes, index into that haplotype's alts); haplotypes already in
+ * ht_list are skipped. left_ext selects the direction; initial also admits ones overlapping cmp_ht.left. */
+template <typename index_t>
+void add_haplotypes(
+                    const EList<ALT<index_t> >&       alts,                // ALT records that ht.alts entries index into
+                    const EList<Haplotype<index_t> >& haplotypes,          // searched with bsearchLoBound, so presumably sorted -- confirm
+                    const EList<index_t>&             haplotype_maxrights, // per-haplotype bound, asserted >= ht.right; stops the backward scans
+                    Haplotype<index_t>&               cmp_ht,              // query; this function reads its left and right
+                    EList<pair<index_t, index_t> >&   ht_list,             // in/out: (haplotype index, index into ht.alts) pairs
+                    index_t                           rdlen,               // window length bounding how far the scans reach
+                    bool                              left_ext = true,     // true: collect haplotypes left of cmp_ht; false: those starting at or after cmp_ht.right
+                    bool                              initial = false)     // true: also consider haplotypes overlapping cmp_ht.left
+{
+    pair<int, int> ht_range; // first: backward cursor, second: forward cursor
+    ht_range.first = ht_range.second = (int)haplotypes.bsearchLoBound(cmp_ht); // presumably the first index not ordered before cmp_ht -- confirm against EList
+    if(ht_range.first >= haplotypes.size()) // NOTE(review): this also skips left extension when cmp_ht sorts after every haplotype -- confirm intended
+        return;
+    
+    if(left_ext) {
+        for(; ht_range.first >= 0; ht_range.first--) { // walk toward index 0 from the search position
+            const Haplotype<index_t>& ht = haplotypes[ht_range.first];
+            if(!initial) {
+                if(ht.right >= cmp_ht.left) continue; // non-initial: only haplotypes ending before cmp_ht.left
+            }
+            index_t ht_maxright = haplotype_maxrights[ht_range.first];
+            assert_geq(ht_maxright, ht.right);
+            if(ht_maxright + rdlen - 1 < cmp_ht.left) break; // out of reach of an rdlen-long extension; stop scanning
+            if(ht.alts.size() <= 0) continue;
+            bool added = false; // set when this haplotype index is already present in ht_list
+            for(index_t h = 0; h < ht_list.size(); h++) {
+                if(ht_list[h].first == ht_range.first) {
+                    added = true;
+                    break;
+                }
+            }
+            if(added) continue;
+            ht_list.expand();
+            ht_list.back().first = ht_range.first;
+            assert_gt(ht.alts.size(), 0);
+            if(ht.right < cmp_ht.left) {
+                ht_list.back().second = ht.alts.size() - 1; // haplotype lies wholly to the left: start from its last alt
+            } else {
+                assert(initial);
+                ht_list.back().second = ht.alts.size(); // sentinel: one past the last alt
+                for(int a = (int)ht.alts.size() - 1; a >= 0; a--) { // scan alts from last to first
+                    index_t alti = ht.alts[a];
+                    assert_lt(alti, alts.size());
+                    const ALT<index_t>& alt = alts[alti];
+                    assert(alt.snp());
+                    ht_list.back().second = (index_t)a;
+                    if(cmp_ht.left > alt.pos) break; // stop at the first alt positioned before cmp_ht.left
+                }
+                if(ht_list.back().second == ht.alts.size()) { // NOTE(review): second is assigned on every pass above, so this looks unreachable -- confirm
+                    ht_list.pop_back();
+                }
+            }
+        }
+    } else {
+        if(initial) {
+            for(; ht_range.first >= 0; ht_range.first--) { // backward pass for haplotypes that span cmp_ht.left
+                const Haplotype<index_t>& ht = haplotypes[ht_range.first];
+                index_t ht_maxright = haplotype_maxrights[ht_range.first];
+                assert_geq(ht_maxright, ht.right);
+                if(ht_maxright < cmp_ht.left) break;
+                if(ht.right < cmp_ht.left || ht.left > cmp_ht.left) continue; // keep only haplotypes containing cmp_ht.left
+                if(ht.alts.size() <= 0) continue;
+                bool added = false; // set when this haplotype index is already present in ht_list
+                for(index_t h = 0; h < ht_list.size(); h++) {
+                    if(ht_list[h].first == ht_range.first) {
+                        added = true;
+                        break;
+                    }
+                }
+                if(added) continue;
+                ht_list.expand();
+                ht_list.back().first = ht_range.first;
+                assert_gt(ht.alts.size(), 0);
+                ht_list.back().second = ht.alts.size(); // sentinel: one past the last alt
+                for(index_t a = 0; a < ht.alts.size(); a++) { // scan alts from first to last
+                    index_t alti = ht.alts[a];
+                    assert_lt(alti, alts.size());
+                    const ALT<index_t>& alt = alts[alti];
+                    assert(alt.snp());
+                    ht_list.back().second = a;
+                    if(cmp_ht.left <= alt.pos) break; // stop at the first alt positioned at or after cmp_ht.left
+                }
+                if(ht_list.back().second == ht.alts.size()) { // NOTE(review): second is assigned on every pass above, so this looks unreachable -- confirm
+                    ht_list.pop_back();
+                }
+            }
+        }
+
+        for(; ht_range.second < haplotypes.size(); ht_range.second++) { // forward pass from the search position
+                const Haplotype<index_t>& ht = haplotypes[ht_range.second];
+                if(ht.left < cmp_ht.right) continue; // only haplotypes starting at or after cmp_ht.right
+                if(ht.left >= cmp_ht.right + rdlen) break; // beyond the rdlen window; stop scanning
+                if(ht.alts.size() <= 0) continue;
+                bool added = false; // set when this haplotype index is already present in ht_list
+                for(index_t h = 0; h < ht_list.size(); h++) {
+                    if(ht_list[h].first == ht_range.second) {
+                        added = true;
+                        break;
+                    }
+                }
+                if(added) continue;
+                ht_list.expand();
+                ht_list.back().first = ht_range.second;
+                assert_gt(ht.alts.size(), 0);
+                ht_list.back().second = 0; // start from the haplotype's first alt
+        }
+    }
+}
+
+
 /*
  *
  */
 template <typename index_t>
 index_t GenomeHit<index_t>::alignWithALTs_recur(
-                                                const EList<ALT<index_t> >& alts,
-                                                index_t                     joinedOff,
-                                                const BTDnaString&          rdseq,
-                                                index_t                     rdoff_add,
-                                                index_t                     rdoff,
-                                                index_t                     rdlen,
-                                                const BitPairReference&     ref,
-                                                EList<SStringExpandable<char> >& raw_refbufs,
+                                                const EList<ALT<index_t> >&       alts,
+                                                const EList<Haplotype<index_t> >& haplotypes,
+                                                const EList<index_t>&             haplotype_maxrights,
+                                                index_t                           joinedOff,
+                                                const BTDnaString&                rdseq,
+                                                index_t                           rdoff_add,
+                                                index_t                           rdoff,
+                                                index_t                           rdlen,
+                                                const BitPairReference&           ref,
+                                                EList<SStringExpandable<char> >&  raw_refbufs,
                                                 ASSERT_ONLY(SStringExpandable<uint32_t> destU32,)
-                                                EList<Edit>&                tmp_edits,
-                                                int&                        best_rdoff,
-                                                const char*                 rfseq,
-                                                index_t                     tidx,
-                                                int                         rfoff,
-                                                index_t                     rflen,
-                                                bool                        left,
-                                                EList<Edit>&                edits,
-                                                index_t                     mm,
-                                                ELList<Edit, 128, 4>*       candidate_edits,
-                                                index_t                     tmp_numNs,
-                                                index_t*                    numNs,
-                                                index_t                     dep,
-                                                const index_t               max_numALTsTried,
-                                                index_t&                    numALTsTried,
-                                                ALT_TYPE                    prev_alt_type)
+                                                EList<Edit>&                      tmp_edits,
+                                                int&                              best_rdoff,
+                                                const char*                       rfseq,
+                                                index_t                           tidx,
+                                                int                               rfoff,
+                                                index_t                           rflen,
+                                                bool                              left,
+                                                EList<Edit>&                      edits,
+                                                index_t                           mm,
+                                                ELList<pair<index_t, index_t> >&  ht_llist,
+                                                Haplotype<index_t>&               cmp_ht,
+                                                ELList<Edit, 128, 4>*             candidate_edits,
+                                                index_t                           tmp_numNs,
+                                                index_t*                          numNs,
+                                                index_t                           dep,
+                                                const GraphPolicy&                gpol,
+                                                index_t&                          numALTsTried,
+                                                ALT_TYPE                          prev_alt_type)
 {
-    if(numALTsTried > max_numALTsTried + dep) return 0;
+    if(numALTsTried > gpol.maxAltsTried() + dep) return 0;
     assert_gt(rdlen, 0);
     assert_gt(rflen, 0);
+    if(ht_llist.size() <= dep) ht_llist.expand();
     if(raw_refbufs.size() <= dep) raw_refbufs.expand();
     if(rfoff < -16) return 0;
+    size_t contig_len = ref.approxLen(tidx);
+    if(rfoff >= contig_len) return 0;
+    if(rfoff >= 0 && rfoff + rflen > contig_len) {
+        rflen = contig_len - rfoff;
+    } else if(rfoff < 0 && rflen > contig_len) {
+        rflen = contig_len;
+    }
+    if(rflen == 0) return 0;
     if(rfseq == NULL) {
         SStringExpandable<char>& raw_refbuf = raw_refbufs[dep];
         raw_refbuf.resize(rflen + 16 + 16);
@@ -2475,6 +2682,7 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
             tmp_edits.erase(0, tmp_mm);
             tmp_mm = 0;
         }
+        
         // Find SNPs included in this region
         pair<int, int> alt_range(0, 0);
         if(alts.size() > 0) {
@@ -2483,6 +2691,9 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
             assert_leq(mm_min_rd_i, rdoff);
             index_t rd_diff = rdoff - mm_min_rd_i;
             rd_diff = (rd_diff > minK ? rd_diff - minK : 0);
+            if(gpol.enableCODIS()) {
+                rd_diff = 0;
+            }
             if(rd_diff >= joinedOff) {
                 cmp_alt.pos = joinedOff;
             } else {
@@ -2507,6 +2718,49 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                 }
             }
         }
+        
+        // Update and find Haplotypes
+        EList<pair<index_t, index_t> >& ht_list = ht_llist[dep];
+        ht_list.clear();
+        if(gpol.useHaplotype() && haplotypes.size() > 0) {
+            if(dep > 0) {
+                EList<pair<index_t, index_t> >& ht_prev_list = ht_llist[dep-1];
+                for(index_t p = 0; p < ht_prev_list.size(); p++) {
+                    const pair<index_t, index_t>& ht_ref = ht_prev_list[p];
+                    const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
+                    assert_lt(ht_ref.second, ht.alts.size());
+                    index_t alt_id = ht.alts[ht_ref.second];
+                    assert_gt(tmp_edits.size(), 0);
+                    const ALT<index_t>& alt = alts[tmp_edits[0].snpID];
+                    const ALT<index_t>& ht_alt = alts[alt_id];
+                    if(!alt.isSame(ht_alt)) continue;
+                    if(ht_ref.second == 0) {
+                        cmp_ht.left = cmp_ht.right = joinedOff;
+                        add_haplotypes(alts,
+                                       haplotypes,
+                                       haplotype_maxrights,
+                                       cmp_ht,
+                                       ht_list,
+                                       rdlen);
+                    } else {
+                        ht_list.push_back(ht_ref);
+                        ht_list.back().second--;
+                    }
+                }
+            }
+            if(ht_list.size() <= 0) {
+                cmp_ht.left = cmp_ht.right = joinedOff;
+                add_haplotypes(alts,
+                               haplotypes,
+                               haplotype_maxrights,
+                               cmp_ht,
+                               ht_list,
+                               rdlen,
+                               true, // left_ext?
+                               dep == 0); // initial?
+            }
+        }
+        
         assert_geq(rdoff, 0);
         const index_t orig_nedits = (index_t)tmp_edits.size();
         for(; alt_range.second > alt_range.first; alt_range.second--) {
@@ -2547,6 +2801,24 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                 }
                 break;
             }
+            
+            // Check to see if there is a haplotype that supports this alt
+            if(ht_list.size() > 0 && alt.snp()) {
+                bool ht_found = false;
+                for(index_t h = 0; h < ht_list.size(); h++) {
+                    const pair<index_t, index_t>& ht_ref = ht_list[h];
+                    const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
+                    assert_lt(ht_ref.second, ht.alts.size());
+                    index_t ht_alti = ht.alts[ht_ref.second];
+                    const ALT<index_t>& ht_alt = alts[ht_alti];
+                    if(alts[alt_range.second].isSame(ht_alt)) {
+                        ht_found = true;
+                        break;
+                    }
+                }
+                if(!ht_found) continue;
+            }
+
             if(alt.type == ALT_SNP_SGL) {
                 if(rd_bp == (int)alt.seq) {
                     int rf_bp = rfseq[rf_i];
@@ -2682,6 +2954,8 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                 }
                 index_t alignedLen = alignWithALTs_recur(
                                                          alts,
+                                                         haplotypes,
+                                                         haplotype_maxrights,
                                                          next_joinedOff,
                                                          rdseq,
                                                          rdoff_add,
@@ -2699,11 +2973,13 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                                                          left,
                                                          edits,
                                                          mm,
+                                                         ht_llist,
+                                                         cmp_ht,
                                                          candidate_edits,
                                                          tmp_numNs,
                                                          numNs,
                                                          dep + 1,
-                                                         max_numALTsTried,
+                                                         gpol,
                                                          numALTsTried,
                                                          alt.type);
                 if(alignedLen == next_rdlen) return rdlen;
@@ -2756,12 +3032,16 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
         if(mm_max_rd_i == rflen) {
             return mm_max_rd_i;
         }
+        
         // Find SNPs included in this region
         pair<index_t, index_t> alt_range;
         {
             ALT<index_t> cmp_alt;
             const index_t minK = 16;
             index_t rd_diff = (max_rd_i > minK ? max_rd_i - minK : 0);
+            if(gpol.enableCODIS()) {
+                rd_diff = 0;
+            }
             cmp_alt.pos = joinedOff + rd_diff;
             alt_range.first = alt_range.second = (index_t)alts.bsearchLoBound(cmp_alt);
             if(alt_range.first >= alts.size()) return 0;
@@ -2791,6 +3071,51 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
             tmp_edits.resize(tmp_edits.size() - tmp_mm);
             tmp_mm = 0;
         }
+        
+        // Update and find Haplotypes
+        EList<pair<index_t, index_t> >& ht_list = ht_llist[dep];
+        ht_list.clear();
+        if(gpol.useHaplotype() && haplotypes.size() > 0) {
+            if(dep > 0) {
+                EList<pair<index_t, index_t> >& ht_prev_list = ht_llist[dep-1];
+                for(index_t p = 0; p < ht_prev_list.size(); p++) {
+                    const pair<index_t, index_t>& ht_ref = ht_prev_list[p];
+                    const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
+                    if(ht_ref.second < ht.alts.size()) {
+                        index_t alt_id = ht.alts[ht_ref.second];
+                        assert_gt(tmp_edits.size(), 0);
+                        const ALT<index_t>& alt = alts[tmp_edits.back().snpID];
+                        const ALT<index_t>& ht_alt = alts[alt_id];
+                        if(!alt.isSame(ht_alt)) continue;
+                    }
+                    if(ht_ref.second + 1 >= ht.alts.size() && joinedOff > ht.right) {
+                        cmp_ht.left = cmp_ht.right = joinedOff;
+                        add_haplotypes(alts,
+                                       haplotypes,
+                                       haplotype_maxrights,
+                                       cmp_ht,
+                                       ht_list,
+                                       rdlen,
+                                       false); // left_ext?
+                    } else {
+                        ht_list.push_back(ht_ref);
+                        ht_list.back().second++;
+                    }
+                }
+            }
+            if(ht_list.size() <= 0) {
+                cmp_ht.left = cmp_ht.right = joinedOff;
+                add_haplotypes(alts,
+                               haplotypes,
+                               haplotype_maxrights,
+                               cmp_ht,
+                               ht_list,
+                               rdlen,
+                               false, // left_ext?
+                               dep == 0 && rdoff_add == 0); // initial?
+            }
+        }
+        
         const index_t orig_nedits = (index_t)tmp_edits.size();
         for(; alt_range.first < alt_range.second; alt_range.first++) {
             const ALT<index_t>& alt = alts[alt_range.first];
@@ -2809,6 +3134,25 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
             assert_leq(rd_i, max_rd_i);
             int rf_bp = rfseq[rf_i];
             int rd_bp = rdseq[rdoff + rd_i];
+            
+            // Check to see if there is a haplotype that supports this alt
+            if(ht_list.size() > 0 && alt.snp()) {
+                bool ht_found = false;
+                for(index_t h = 0; h < ht_list.size(); h++) {
+                    const pair<index_t, index_t>& ht_ref = ht_list[h];
+                    const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
+                    if(ht_ref.second >= ht.alts.size())
+                        continue;
+                    index_t ht_alti = ht.alts[ht_ref.second];
+                    const ALT<index_t>& ht_alt = alts[ht_alti];
+                    if(alts[alt_range.first].isSame(ht_alt)) {
+                        ht_found = true;
+                        break;
+                    }
+                }
+                if(!ht_found) continue;
+            }
+            
             if(alt.type == ALT_SNP_SGL) {
                 if(rd_bp == (int)alt.seq) {
                     Edit e(
@@ -2824,7 +3168,16 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                     alt_compatible = true;
                 }
             } else if(alt.type == ALT_SNP_DEL) {
-                if(rd_i > 0) {
+                bool try_del = rd_i > 0;
+                if(rd_i == 0 && dep > 0) {
+                    // Avoid consecutive deletions
+                    assert_gt(tmp_edits.size(), 0);
+                    const Edit& e = tmp_edits.back();
+                    if(e.type != EDIT_TYPE_READ_GAP) {
+                        try_del = true;
+                    }
+                }
+                if(try_del) {
                     if(rf_i + alt.len <= rflen) {
                         for(index_t i = 0; i < alt.len; i++) {
                             rf_bp = rfseq[rf_i + i];
@@ -2894,7 +3247,16 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                     }
                 }
             } else if(alt.type == ALT_SPLICESITE) {
-                if(rd_i > 0) {
+                bool try_splice = rd_i > 0;
+                if(rd_i == 0 && dep > 0) {
+                    // Avoid consecutive introns
+                    assert_gt(tmp_edits.size(), 0);
+                    const Edit& e = tmp_edits.back();
+                    if(e.type != EDIT_TYPE_SPL) {
+                        try_splice = true;
+                    }
+                }
+                if(try_splice) {
                     assert_lt(rd_i, rflen);
                     index_t intronLen = alt.right - alt.left + 1;
                     Edit e(rd_i + rdoff_add,
@@ -2951,6 +3313,8 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                 }
                 index_t alignedLen = alignWithALTs_recur(
                                                          alts,
+                                                         haplotypes,
+                                                         haplotype_maxrights,
                                                          next_joinedOff,
                                                          rdseq,
                                                          rdoff_add + rd_i,
@@ -2968,11 +3332,13 @@ index_t GenomeHit<index_t>::alignWithALTs_recur(
                                                          left,
                                                          edits,
                                                          mm,
+                                                         ht_llist,
+                                                         cmp_ht,
                                                          candidate_edits,
                                                          tmp_numNs,
                                                          numNs,
                                                          dep + 1,
-                                                         max_numALTsTried,
+                                                         gpol,
                                                          numALTsTried,
                                                          alt.type);
                 if(alignedLen > 0) {
@@ -4063,7 +4429,7 @@ public:
                                                       gfm,
                                                       altdb,
                                                       ref,
-                                                      gpol.maxAltsTried());
+                                                      gpol);
                 }
                 if(partialHit._hit_type == CANDIDATE_HIT && genomeHits.size() >= maxGenomeHitSize) break;
             }
@@ -4437,7 +4803,7 @@ bool HI_Aligner<index_t, local_index_t>::alignMate(
                                                       gfm,
                                                       altdb,
                                                       ref,
-                                                      gpol.maxAltsTried());
+                                                      gpol);
                 }
                 max_hitlen = hitlen;
             }
@@ -4475,11 +4841,8 @@ bool HI_Aligner<index_t, local_index_t>::alignMate(
                          _minsc[ordi],
                          rnd,
                          (index_t)_minK_local,
-                         (index_t)tpol.minIntronLen(),
-                         (index_t)tpol.maxIntronLen(),
-                         tpol.minAnchorLen(),
-                         tpol.minAnchorLen_noncan(),
-                         gpol.maxAltsTried(),
+                         tpol,
+                         gpol,
                          leftext,
                          rightext);
         hybridSearch_recur(
@@ -4818,7 +5181,7 @@ bool HI_Aligner<index_t, local_index_t>::reportHit(
         if(hit.splicing_dir() == SPL_UNKNOWN)
             return false;
     }
-    if(tpol.no_spliced_alignment()) {
+    if(!tpol.no_spliced_alignment() && tpol.avoid_pseudogene()) {
         if(!spliced.first) {
             assert(!spliced.second);
             const index_t max_exon_size = 10000;
diff --git a/hisat2 b/hisat2
index 19935e9..19272d5 100755
--- a/hisat2
+++ b/hisat2
@@ -74,9 +74,9 @@ my @signame     = ();
 
 # Get description of arguments from HISAT so that we can distinguish HISAT
 # args from wrapper args
-sub getBt2Desc($) {
+sub getHt2Desc($) {
 	my $d = shift;
-	my $cmd = "$align_prog --wrapper basic-0 --arg-desc";
+	my $cmd = "'$align_prog' --wrapper basic-0 --arg-desc";
 	open(my $fh, "$cmd |") || Fail("Failed to run command '$cmd'\n");
 	while(readline $fh) {
 		chomp;
@@ -90,7 +90,7 @@ sub getBt2Desc($) {
 
 my %desc = ();
 my %wrapped = ("1" => 1, "2" => 1);
-getBt2Desc(\%desc);
+getHt2Desc(\%desc);
 
 # Given an option like -1, determine whether it's wrapped (i.e. should be
 # handled by this script rather than being passed along to HISAT)
@@ -98,20 +98,20 @@ sub isWrapped($) { return defined($wrapped{$_[0]}); }
 
 my @orig_argv = @ARGV;
 
-my @bt2w_args = (); # options for wrapper
-my @bt2_args  = (); # options for HISAT
+my @ht2w_args = (); # options for wrapper
+my @ht2_args  = (); # options for HISAT
 my $saw_dd = 0;
 for(0..$#ARGV) {
 	if($ARGV[$_] eq "--") {
 		$saw_dd = 1;
 		next;
 	}
-	push @bt2w_args, $ARGV[$_] if !$saw_dd;
-	push @bt2_args,  $ARGV[$_] if  $saw_dd;
+	push @ht2w_args, $ARGV[$_] if !$saw_dd;
+	push @ht2_args,  $ARGV[$_] if  $saw_dd;
 }
 if(!$saw_dd) {
-	@bt2_args = @bt2w_args;
-	@bt2w_args= ();
+	@ht2_args = @ht2w_args;
+	@ht2w_args= ();
 }
 
 my $debug = 0;
@@ -121,75 +121,75 @@ my $cap_out = undef;       # Filename for passthrough
 my $no_unal = 0;
 my $large_idx = 0;
 # Remove whitespace
-for my $i (0..$#bt2_args) {
-	$bt2_args[$i]=~ s/^\s+//; $bt2_args[$i] =~ s/\s+$//;
+for my $i (0..$#ht2_args) {
+	$ht2_args[$i]=~ s/^\s+//; $ht2_args[$i] =~ s/\s+$//;
 }
 
 # We've handled arguments that the user has explicitly directed either to the
 # wrapper or to hisat, now we capture some of the hisat arguments that
 # ought to be handled in the wrapper
-for(my $i = 0; $i < scalar(@bt2_args); $i++) {
-	next unless defined($bt2_args[$i]);
-	my $arg = $bt2_args[$i];
+for(my $i = 0; $i < scalar(@ht2_args); $i++) {
+	next unless defined($ht2_args[$i]);
+	my $arg = $ht2_args[$i];
 	my @args = split(/=/, $arg);
 	if(scalar(@args) > 2) {
 		$args[1] = join("=", @args[1..$#args]);
 	}
 	$arg = $args[0];
 	if($arg eq "-U" || $arg eq "--unpaired") {
-		$bt2_args[$i] = undef;
+		$ht2_args[$i] = undef;
 		$arg =~ s/^-U//; $arg =~ s/^--unpaired//;
 		if($arg ne "") {
 			# Argument was part of this token
 			my @args = split(/,/, $arg);
-			for my $a (@args) { push @bt2w_args, ("-U", $a); }
+			for my $a (@args) { push @ht2w_args, ("-U", $a); }
 		} else {
 			# Argument is in the next token
-			$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
+			$i < scalar(@ht2_args)-1 || Fail("Argument expected in next token!\n");
 			$i++;
-			my @args = split(/,/, $bt2_args[$i]);
-			for my $a (@args) { push @bt2w_args, ("-U", $a); }
-			$bt2_args[$i] = undef;
+			my @args = split(/,/, $ht2_args[$i]);
+			for my $a (@args) { push @ht2w_args, ("-U", $a); }
+			$ht2_args[$i] = undef;
 		}
 	}
 	if($arg =~ /^--?([12])/ && $arg !~ /^--?12/) {
 		my $mate = $1;
-		$bt2_args[$i] = undef;
+		$ht2_args[$i] = undef;
 		$arg =~ s/^--?[12]//;
 		if($arg ne "") {
 			# Argument was part of this token
 			my @args = split(/,/, $arg);
-			for my $a (@args) { push @bt2w_args, ("-$mate", $a); }
+			for my $a (@args) { push @ht2w_args, ("-$mate", $a); }
 		} else {
 			# Argument is in the next token
-			$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
+			$i < scalar(@ht2_args)-1 || Fail("Argument expected in next token!\n");
 			$i++;
-			my @args = split(/,/, $bt2_args[$i]);
-			for my $a (@args) { push @bt2w_args, ("-$mate", $a); }
-			$bt2_args[$i] = undef;
+			my @args = split(/,/, $ht2_args[$i]);
+			for my $a (@args) { push @ht2w_args, ("-$mate", $a); }
+			$ht2_args[$i] = undef;
 		}
 	}
 	if($arg eq "--debug") {
 		$debug = 1;
-		$bt2_args[$i] = undef;
+		$ht2_args[$i] = undef;
 	}
 	if($arg eq "--no-unal") {
 		$no_unal = 1;
-		$bt2_args[$i] = undef;
+		$ht2_args[$i] = undef;
 	}
 	if($arg eq "--large-index") {
 		$large_idx = 1;
-		$bt2_args[$i] = undef;
+		$ht2_args[$i] = undef;
 	}
 	for my $rarg ("un-conc", "al-conc", "al-conc-disc", "un", "al") {
 		if($arg =~ /^--${rarg}$/ || $arg =~ /^--${rarg}-gz$/ || $arg =~ /^--${rarg}-bz2$/) {
-			$bt2_args[$i] = undef;
+			$ht2_args[$i] = undef;
 			if(scalar(@args) > 1 && $args[1] ne "") {
 				$read_fns{$rarg} = $args[1];
 			} else {
-				$i < scalar(@bt2_args)-1 || Fail("--${rarg}* option takes an argument.\n");
-				$read_fns{$rarg} = $bt2_args[$i+1];
-				$bt2_args[$i+1] = undef;
+				$i < scalar(@ht2_args)-1 || Fail("--${rarg}* option takes an argument.\n");
+				$read_fns{$rarg} = $ht2_args[$i+1];
+				$ht2_args[$i+1] = undef;
 			}
 			$read_compress{$rarg} = "";
 			$read_compress{$rarg} = "gzip"  if $arg eq "--${rarg}-gz";
@@ -204,22 +204,22 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
 my $passthru = 0;
 if(scalar(keys %read_fns) > 0 || $no_unal) {
 	$passthru = 1;
-	push @bt2_args, "--passthrough";
+	push @ht2_args, "--passthrough";
 	$cap_out = "-";
-	for(my $i = 0; $i < scalar(@bt2_args); $i++) {
-		next unless defined($bt2_args[$i]);
-		my $arg = $bt2_args[$i];
+	for(my $i = 0; $i < scalar(@ht2_args); $i++) {
+		next unless defined($ht2_args[$i]);
+		my $arg = $ht2_args[$i];
 		if($arg eq "-S" || $arg eq "--output") {
-			$i < scalar(@bt2_args)-1 || Fail("-S/--output takes an argument.\n");
-			$cap_out = $bt2_args[$i+1];
-			$bt2_args[$i] = undef;
-			$bt2_args[$i+1] = undef;
+			$i < scalar(@ht2_args)-1 || Fail("-S/--output takes an argument.\n");
+			$cap_out = $ht2_args[$i+1];
+			$ht2_args[$i] = undef;
+			$ht2_args[$i+1] = undef;
 		}
 	}
 }
 my @tmp = ();
-for (@bt2_args) { push(@tmp, $_) if defined($_); }
-@bt2_args = @tmp;
+for (@ht2_args) { push(@tmp, $_) if defined($_); }
+@ht2_args = @tmp;
 
 my @unps = ();
 my @mate1s = ();
@@ -235,11 +235,11 @@ my $readpipe = undef;
 my $log_fName = undef;
 my $help = 0;
 
-my @bt2w_args_cp = (@bt2w_args>0) ? @bt2w_args : @bt2_args;
+my @ht2w_args_cp = (@ht2w_args>0) ? @ht2w_args : @ht2_args;
 Getopt::Long::Configure("pass_through","no_ignore_case");
 
 my @old_ARGV = @ARGV;
-@ARGV = @bt2w_args_cp;
+@ARGV = @ht2w_args_cp;
 
 GetOptions(
 	"1=s"                           => \@mate1s,
@@ -265,8 +265,8 @@ if ($log_fName) {
 }
 
 Info("Before arg handling:\n");
-Info("  Wrapper args:\n[ @bt2w_args ]\n");
-Info("  Binary args:\n[ @bt2_args ]\n");
+Info("  Wrapper args:\n[ @ht2w_args ]\n");
+Info("  Binary args:\n[ @ht2_args ]\n");
 
 sub cat_file($$) {
 	my ($ifn, $ofh) = @_;
@@ -332,7 +332,7 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 		# Make a named pipe for delivering mate #1s
 		my $m1fn = "$temp_dir/$$.inpipe1";
 		push @to_delete, $m1fn;
-		push @bt2_args, "-1 $m1fn";
+		push @ht2_args, "-1 $m1fn";
 		# Create named pipe 1 for writing
 		if(!$no_pipes) {
 			mkfifo($m1fn, 0700) || Fail("mkfifo($m1fn) failed.\n");
@@ -349,7 +349,7 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 		# Make a named pipe for delivering mate #2s
 		my $m2fn = "$temp_dir/$$.inpipe2";
 		push @to_delete, $m2fn;
-		push @bt2_args, "-2 $m2fn";
+		push @ht2_args, "-2 $m2fn";
 		# Create named pipe 2 for writing
 		if(!$no_pipes) {
 			mkfifo($m2fn, 0700) || Fail("mkfifo($m2fn) failed.\n");
@@ -371,7 +371,7 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 		# Make a named pipe for delivering unpaired reads
 		my $ufn = "$temp_dir/$$.unp";
 		push @to_delete, $ufn;
-		push @bt2_args, "-U $ufn";
+		push @ht2_args, "-U $ufn";
 		# Create named pipe 2 for writing
 		if(!$no_pipes) {
 			mkfifo($ufn, 0700) || Fail("mkfifo($ufn) failed.\n");
@@ -389,11 +389,11 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 } else {
 	if(scalar(@mate2s) > 0) {
 		# Just pass all the mate arguments along to the binary
-		push @bt2_args, ("-1", join(",", @mate1s));
-		push @bt2_args, ("-2", join(",", @mate2s));
+		push @ht2_args, ("-1", join(",", @mate1s));
+		push @ht2_args, ("-2", join(",", @mate2s));
 	}
 	if(scalar(@unps) > 0) {
-		push @bt2_args, ("-U", join(",", @unps));
+		push @ht2_args, ("-U", join(",", @unps));
 	}
 }
 
@@ -406,7 +406,7 @@ if(defined($ref_str)) {
 	push @to_delete, $ofn;
 	system("$build_bin $ofn $ofn") == 0 ||
 		Fail("hisat2-build returned non-0 exit level.\n");
-	push @bt2_args, ("--index", "$ofn");
+	push @ht2_args, ("--index", "$ofn");
 	push @to_delete, ("$ofn.1.".$idx_ext, "$ofn.2.".$idx_ext, 
 	                  "$ofn.3.".$idx_ext, "$ofn.4.".$idx_ext,
 			  "$ofn.5.".$idx_ext, "$ofn.6.".$idx_ext,
@@ -415,9 +415,9 @@ if(defined($ref_str)) {
 }
 
 Info("After arg handling:\n");
-Info("  Binary args:\n[ @bt2_args ]\n");
+Info("  Binary args:\n[ @ht2_args ]\n");
 
-my $index_name = Extract_IndexName_From(@bt2_args);
+my $index_name = Extract_IndexName_From(@ht2_args);
 
 if ($large_idx) {
     Info("Using a large index enforced by user.\n");
@@ -444,7 +444,7 @@ else {
 my $debug_str = ($debug ? "-debug" : "");
 
 # Construct command invoking hisat2-align
-my $cmd = "$align_prog$debug_str --wrapper basic-0 ".join(" ", @bt2_args);
+my $cmd = "'$align_prog$debug_str' --wrapper basic-0 ".join(" ", @ht2_args);
 
 # Possibly add read input on an anonymous pipe
 $cmd = "$readpipe $cmd" if defined($readpipe);
@@ -453,7 +453,7 @@ Info("$cmd\n");
 my $ret;
 if(defined($cap_out)) {
 	# Open HISAT2 pipe
-	open(BT, "$cmd |") || Fail("Could not open HISAT2 pipe: '$cmd |'\n");
+	open(HT, "$cmd |") || Fail("Could not open HISAT2 pipe: '$cmd |'\n");
 	# Open output pipe
 	my $ofh = *STDOUT;
 	my @fhs_to_close = ();
@@ -511,7 +511,7 @@ if(defined($cap_out)) {
 			}
 		}
 	}
-	while(<BT>) {
+	while(<HT>) {
 		chomp;
 		my $filt = 0;
 		unless(substr($_, 0, 1) eq "@") {
@@ -525,14 +525,14 @@ if(defined($cap_out)) {
 			if($passthru) {
 				if(scalar(keys %read_fhs) == 0) {
 					# Next line is read with some whitespace escaped
-					my $l = <BT>;
+					my $l = <HT>;
 				} else {
 					my $mate1 = (($fl &  64) != 0);
 					my $mate2 = (($fl & 128) != 0);
 					my $unp = !$mate1 && !$mate2;
 					my $pair = !$unp;
 					# Next line is read with some whitespace escaped
-					my $l = <BT>;
+					my $l = <HT>;
 					chomp($l);
 					$l =~ s/%(..)/chr(hex($1))/eg;
 					if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) {
@@ -569,7 +569,7 @@ if(defined($cap_out)) {
 	}
 	for my $k (@fhs_to_close) { close($k); }
 	close($ofh);
-	close(BT);
+	close(HT);
 	$ret = $?;
 } else {
 	$ret = system($cmd);
diff --git a/hisat2.cpp b/hisat2.cpp
index b87f063..b73bb40 100644
--- a/hisat2.cpp
+++ b/hisat2.cpp
@@ -58,6 +58,8 @@
 
 using namespace std;
 
+MemoryTally gMemTally;
+
 static EList<string> mates1;  // mated reads (first mate)
 static EList<string> mates2;  // mated reads (second mate)
 static EList<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
@@ -261,6 +263,7 @@ static bool pseudogeneStop;
 static bool tranMapOnly; // transcriptome mapping only
 static bool tranAssm;    // alignments selected for downstream transcript assembly such as StringTie and Cufflinks
 static string tranAssm_program;
+static bool avoid_pseudogene;
 
 #ifdef USE_SRA
 static EList<string> sra_accs;
@@ -278,6 +281,12 @@ static bool rmChrName;  // remove "chr" from reference names (e.g., chr18 to 18)
 static bool addChrName; // add "chr" to reference names (e.g., 18 to chr18)
 
 static size_t max_alts_tried;
+static bool use_haplotype;
+static bool enable_codis;
+
+static bool templateLenAdjustment;
+static string alignSumFile; // write alignment summary stat. to this file
+static bool newAlignSummary;
 
 #define DMAX std::numeric_limits<double>::max()
 
@@ -488,6 +497,7 @@ static void resetOptions() {
     tranMapOnly = false;
     tranAssm = false;
     tranAssm_program = "";
+    avoid_pseudogene = false;
     
 #ifdef USE_SRA
     sra_accs.clear();
@@ -497,6 +507,12 @@ static void resetOptions() {
     addChrName = false;
     
     max_alts_tried = 16;
+    use_haplotype = false;
+    enable_codis = false;
+    
+    templateLenAdjustment = true;
+    alignSumFile = "";
+    newAlignSummary = false;
 }
 
 static const char *short_options = "fF:qbzhcu:rv:s:aP:t3:5:w:p:k:M:1:2:I:X:CQ:N:i:L:U:x:S:g:O:D:R:";
@@ -706,12 +722,18 @@ static struct option long_options[] = {
     {(char*)"downstream-transcriptome-assembly",   no_argument, 0,        ARG_TRANSCRIPTOME_ASSEMBLY},
     {(char*)"dta",             no_argument,        0,        ARG_TRANSCRIPTOME_ASSEMBLY},
     {(char*)"dta-cufflinks",   no_argument,        0,        ARG_TRANSCRIPTOME_ASSEMBLY_CUFFLINKS},
+    {(char*)"avoid-pseudogene",no_argument,        0,        ARG_AVOID_PSEUDOGENE},
+    {(char*)"no-templatelen-adjustment",    no_argument,        0,        ARG_NO_TEMPLATELEN_ADJUSTMENT},
 #ifdef USE_SRA
     {(char*)"sra-acc",         required_argument,  0,        ARG_SRA_ACC},
 #endif
     {(char*)"remove-chrname",  no_argument,        0,        ARG_REMOVE_CHRNAME},
     {(char*)"add-chrname",     no_argument,        0,        ARG_ADD_CHRNAME},
     {(char*)"max-altstried",   required_argument,  0,        ARG_MAX_ALTSTRIED},
+    {(char*)"haplotype",       no_argument,        0,        ARG_HAPLOTYPE},
+    {(char*)"enable-codis",    no_argument,        0,        ARG_CODIS},
+    {(char*)"summary-file",    required_argument,  0,        ARG_SUMMARY_FILE},
+    {(char*)"new-summary",     no_argument,        0,        ARG_NEW_SUMMARY},
 	{(char*)0, 0, 0, 0} // terminator
 };
 
@@ -848,15 +870,17 @@ static void printUsage(ostream& out) {
         << "  --novel-splicesite-infile <path>   provide a list of novel splice sites" << endl
         << "  --no-temp-splicesite               disable the use of splice sites found" << endl
         << "  --no-spliced-alignment             disable spliced alignment" << endl
-        << "  --rna-strandness <string>          Specify strand-specific information (unstranded)" << endl
-        << "  --tmo                              Reports only those alignments within known transcriptome" << endl
-        << "  --dta                              Reports alignments tailored for transcript assemblers" << endl
-        << "  --dta-cufflinks                    Reports alignments tailored specifically for cufflinks" << endl
+        << "  --rna-strandness <string>          specify strand-specific information (unstranded)" << endl
+        << "  --tmo                              reports only those alignments within known transcriptome" << endl
+        << "  --dta                              reports alignments tailored for transcript assemblers" << endl
+        << "  --dta-cufflinks                    reports alignments tailored specifically for cufflinks" << endl
+        << "  --avoid-pseudogene                 tries to avoid aligning reads to pseudogenes (experimental option)" << endl
+        << "  --no-templatelen-adjustment        disables template length adjustment for RNA-seq reads" << endl
         << endl
 		<< " Scoring:" << endl
-		<< "  --ma <int>         match bonus (0 for --end-to-end, 2 for --local) " << endl
-		<< "  --mp <int>,<int>   max and min penalties for mismatch; lower qual = lower penalty <2,6>" << endl
-        << "  --sp <int>,<int>   max and min penalties for soft-clipping; lower qual = lower penalty <1,2>" << endl
+		//<< "  --ma <int>         match bonus (0 for --end-to-end, 2 for --local) " << endl
+		<< "  --mp <int>,<int>   max and min penalties for mismatch; lower qual = lower penalty <6,2>" << endl
+        << "  --sp <int>,<int>   max and min penalties for soft-clipping; lower qual = lower penalty <2,1>" << endl
         << "  --no-softclip      no soft-clipping" << endl
 		<< "  --np <int>         penalty for non-A/C/G/Ts in read/ref (1)" << endl
 		<< "  --rdg <int>,<int>  read gap open, extend penalties (5,3)" << endl
@@ -891,7 +915,9 @@ static void printUsage(ostream& out) {
 	    << "  (Note: for --un, --al, --un-conc, or --al-conc, add '-gz' to the option name, e.g." << endl
 		<< "  --un-gz <path>, to gzip compress output, or add '-bz2' to bzip2 compress output.)" << endl;
 	}
-	out << "  --quiet            print nothing to stderr except serious errors" << endl
+    out << "  --summary-file     print alignment summary to this file." << endl
+        << "  --new-summary      print alignment summary in a new style, which is more machine-friendly." << endl
+        << "  --quiet            print nothing to stderr except serious errors" << endl
 	//  << "  --refidx           refer to ref. seqs by 0-based index rather than name" << endl
 		<< "  --met-file <path>  send metrics to file at <path> (off)" << endl
 		<< "  --met-stderr       send metrics to stderr (off)" << endl
@@ -1510,7 +1536,7 @@ static void parseOption(int next_option, const char *arg) {
         }
         case ARG_NO_SOFTCLIP: {
             ostringstream convert;
-            convert << std::numeric_limits<typeof(penScMax)>::max();
+            convert << std::numeric_limits<int>::max();
             polstr += ";SCP=Q,";
             polstr += convert.str();
             polstr += ",";
@@ -1647,6 +1673,10 @@ static void parseOption(int next_option, const char *arg) {
             tranAssm_program = "cufflinks";
             break;
         }
+        case ARG_AVOID_PSEUDOGENE: {
+            avoid_pseudogene = true;
+            break;
+        }
 #ifdef USE_SRA
         case ARG_SRA_ACC: {
             tokenize(arg, ",", sra_accs); format = SRA_FASTA;
@@ -1665,6 +1695,26 @@ static void parseOption(int next_option, const char *arg) {
             max_alts_tried = parseInt(8, "--max-altstried arg must be at least 8", arg);
             break;
         }
+        case ARG_HAPLOTYPE: {
+            use_haplotype = true;
+            break;
+        }
+        case ARG_CODIS: {
+            enable_codis = true;
+            break;
+        }
+        case ARG_NO_TEMPLATELEN_ADJUSTMENT: {
+            templateLenAdjustment = false;
+            break;
+        }
+        case ARG_SUMMARY_FILE: {
+            alignSumFile = arg;
+            break;
+        }
+        case ARG_NEW_SUMMARY: {
+            newAlignSummary = true;
+            break;
+        }
 		default:
 			printUsage(cerr);
 			throw 1;
@@ -3020,7 +3070,7 @@ static void multiseedSearchWorker_hisat2(void *vp) {
 	
     // Instantiate an object for holding reporting-related parameters.
     if(maxSeeds == 0) {
-        maxSeeds = khits;
+        maxSeeds = max<size_t>(5, khits * 2);
     }
     ReportingParams rp(
                        (allHits ? std::numeric_limits<THitInt>::max() : khits), // -k
@@ -3452,7 +3502,8 @@ static void multiseedSearchWorker_hisat2(void *vp) {
                                      prm,                  // per-read metrics
                                      sc,                   // scoring scheme
                                      !seedSumm,            // suppress seed summaries?
-                                     seedSumm);            // suppress alignments?
+                                     seedSumm,             // suppress alignments?
+                                     templateLenAdjustment);
 				assert(!retry || msinkwrap.empty());
 			} // while(retry)
 		} // if(rdid >= skipReads && rdid < qUpto)
@@ -3613,7 +3664,8 @@ static void driver(
                                      gVerbose, // whether to be talkative
                                      startVerbose, // talkative during initialization
                                      false /*passMemExc*/,
-                                     sanityCheck);
+                                     sanityCheck,
+                                     use_haplotype); //use haplotypes?
 	if(sanityCheck && !os.empty()) {
 		// Sanity check number of patterns and pattern lengths in GFM
 		// against original strings
@@ -3796,9 +3848,13 @@ static void driver(
                                  no_spliced_alignment,
                                  tranMapOnly,
                                  tranAssm,
-                                 xsOnly);
+                                 xsOnly,
+                                 avoid_pseudogene);
         
-        GraphPolicy gpol(max_alts_tried);
+        GraphPolicy gpol(max_alts_tried,
+                         use_haplotype,
+                         altdb->haplotypes().size() > 0 && use_haplotype,
+                         enable_codis);
         
         init_junction_prob();
         bool write = novelSpliceSiteOutfile != "" || useTempSpliceSite;
@@ -3876,11 +3932,24 @@ static void driver(
 			if(repThresh == 0) {
 				repThresh = std::numeric_limits<size_t>::max();
 			}
-			mssink->finish(
-				repThresh,
-				gReportDiscordant,
-				gReportMixed,
-				hadoopOut);
+			mssink->finish(cerr,
+                           repThresh,
+                           gReportDiscordant,
+                           gReportMixed,
+                           newAlignSummary,
+                           hadoopOut);
+            if(alignSumFile != "") {
+                ofstream sumfile(alignSumFile.c_str(), ios::out);
+                if(sumfile.is_open()) {
+                    mssink->finish(sumfile,
+                                   repThresh,
+                                   gReportDiscordant,
+                                   gReportMixed,
+                                   newAlignSummary,
+                                   false); // hadoopOut
+                    sumfile.close();
+                }
+            }
 		}
         if(ssdb != NULL) {
             if(novelSpliceSiteOutfile != "") {
diff --git a/hisat2_build.cpp b/hisat2_build.cpp
index 8282886..bbe46e6 100644
--- a/hisat2_build.cpp
+++ b/hisat2_build.cpp
@@ -45,6 +45,7 @@
 #include <iostream>
 #include <vector>
 
+MemoryTally gMemTally;
 // Build parameters
 int verbose;
 static int sanityCheck;
@@ -104,7 +105,7 @@ static void resetOptions() {
 	nsToAs         = false; // convert reference Ns to As prior to indexing
 	autoMem        = true;  // automatically adjust memory usage parameters
 	packed         = false; //
-	writeRef       = true;  // write compact reference to .3.bt2/.4.bt2
+	writeRef       = true;  // write compact reference to .3.ht2/.4.ht2
 	justRef        = false; // *just* write compact reference, don't index
 	reverseEach    = false;
     nthreads       = 1;
@@ -154,9 +155,9 @@ static void printUsage(ostream& out) {
 		tool_name = "hisat2-build";
 	}
     
-	out << "Usage: hisat2-build [options]* <reference_in> <bt2_index_base>" << endl
+	out << "Usage: hisat2-build [options]* <reference_in> <ht2_index_base>" << endl
 	    << "    reference_in            comma-separated list of files with ref sequences" << endl
-	    << "    hisat2_index_base          write " << gfm_ext << " data to files with this dir/basename" << endl
+	    << "    hisat2_index_base       write " << gfm_ext << " data to files with this dir/basename" << endl
         << "Options:" << endl
         << "    -c                      reference sequences given on cmd line (as" << endl
         << "                            <reference_in>)" << endl;
@@ -170,8 +171,8 @@ static void printUsage(ostream& out) {
 	    << "    --bmaxdivn <int>        max bucket sz as divisor of ref len (default: 4)" << endl
 	    << "    --dcv <int>             diff-cover period for blockwise (default: 1024)" << endl
 	    << "    --nodc                  disable diff-cover (algorithm becomes quadratic)" << endl
-	    << "    -r/--noref              don't build .3/.4.bt2 (packed reference) portion" << endl
-	    << "    -3/--justref            just build .3/.4.bt2 (packed reference) portion" << endl
+	    << "    -r/--noref              don't build .3/.4.ht2 (packed reference) portion" << endl
+	    << "    -3/--justref            just build .3/.4.ht2 (packed reference) portion" << endl
 	    << "    -o/--offrate <int>      SA is sampled every 2^offRate BWT chars (default: 5)" << endl
 	    << "    -t/--ftabchars <int>    # of chars consumed in initial lookup (default: 10)" << endl
         << "    --localoffrate <int>    SA (local) is sampled every 2^offRate BWT chars (default: 3)" << endl
diff --git a/hisat2_genotype.py b/hisat2_genotype.py
deleted file mode 100755
index 31c3aef..0000000
--- a/hisat2_genotype.py
+++ /dev/null
@@ -1,1099 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2016, Daehwan Kim <infphilo at gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import inspect, random
-import math
-from argparse import ArgumentParser, FileType
-
-
-"""
-"""
-def read_genome(genome_file):
-    chr_dic, chr_names = {}, []
-    chr_name, sequence = "", ""
-    for line in genome_file:
-        if line.startswith(">"):
-            if chr_name and sequence:
-                chr_dic[chr_name] = sequence
-                chr_names.append(chr_name)
-            chr_name = line.strip().split()[0][1:]
-            sequence = ""
-        else:
-            sequence += line.strip()
-    if chr_name and sequence:
-        chr_dic[chr_name] = sequence
-        chr_names.append(chr_name)
-    return chr_dic, chr_names
-
-
-"""
-"""
-def genotype(reference_type,
-             base_fname,
-             fastq,
-             read_fnames,
-             threads,
-             simulate_interval,
-             num_mismatch,
-             verbose,
-             daehwan_debug):
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(genotype))
-    ex_path = os.path.dirname(curr_script)
-
-    # Load genomic sequences
-    chr_dic, chr_names = read_genome(open("%s.fa" % base_fname))
-
-    # variants, backbone sequence, and other sequeces
-    genotype_fnames = ["%s.fa" % base_fname,
-                       "%s.gene" % base_fname,
-                       "%s.snp" % base_fname,
-                       "%s.haplotype" % base_fname,
-                       "%s.link" % base_fname,
-                       "%s.coord" % base_fname,
-                       "%s.clnsig" % base_fname]
-    # hisat2 graph index files
-    genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
-
-    def check_files(fnames):
-        for fname in fnames:
-            if not os.path.exists(fname):
-                return False
-        return True
-
-    if not check_files(genotype_fnames):
-        print >> sys.stderr, "Error: some files are missing!"
-        sys.exit(1)
-
-    # Align reads, and sort the alignments into a BAM file
-    hisat2 = os.path.join(ex_path, "hisat2")
-    aligner_cmd = [hisat2,
-                   "--no-unal",
-                   "-p", str(threads)]
-    # aligner_cmd += ["--mm"]
-    aligner_cmd += ["-x", "%s" % base_fname]
-
-    assert len(read_fnames) > 0
-    if not fastq:
-        aligner_cmd += ["-f"]
-    single = len(read_fnames) == 1
-    if single:
-        aligner_cmd += [read_fnames[0]]
-    else:
-        aligner_cmd += ["-1", read_fnames[0],
-                        "-2", read_fnames[1]]
-
-    if verbose:
-        print >> sys.stderr, ' '.join(aligner_cmd)
-
-    align_proc = subprocess.Popen(aligner_cmd,
-                                  stdout=subprocess.PIPE,
-                                  stderr=open("/dev/null", 'w'))
-
-    sambam_cmd = ["samtools",
-                  "view",
-                  "-bS",
-                  "-"]
-    sambam_proc = subprocess.Popen(sambam_cmd,
-                                   stdin=align_proc.stdout,
-                                   stdout=open("hla_input_unsorted.bam", 'w'),
-                                   stderr=open("/dev/null", 'w'))
-    sambam_proc.communicate()
-    bamsort_cmd = ["samtools",
-                   "sort",
-                   "--threads", str(threads),
-                   "hla_input_unsorted.bam"]
-    bamsort_proc = subprocess.Popen(bamsort_cmd,
-                                    stdout=open("hla_input.bam", 'w'),
-                                    stderr=open("/dev/null", 'w'))
-    bamsort_proc.communicate()
-
-    bamindex_cmd = ["samtools",
-                    "index",
-                    "hla_input.bam"]
-    bamindex_proc = subprocess.Popen(bamindex_cmd,
-                                     stderr=open("/dev/null", 'w'))
-    bamindex_proc.communicate()
-
-    os.system("rm hla_input_unsorted.bam")
-
-    # Read partial alleles from hla.data (temporary)
-    partial_alleles = set()
-    """
-    for line in open("IMGTHLA/hla.dat"):
-        if not line.startswith("DE"):
-            continue
-        allele_name = line.split()[1][4:-1]
-        gene = allele_name.split('*')[0]
-        if line.find("partial") != -1:
-            partial_alleles.add(allele_name)
-    """
-
-    # Read HLA alleles (names and sequences)
-    genes, gene_loci, gene_seqs = {}, {}, {}
-    for line in open("%s.gene" % base_fname):
-        family, allele_name, chr, left, right = line.strip().split()
-        gene_name = "%s-%s" % (family, allele_name.split('*')[0])
-        assert gene_name not in genes
-        genes[gene_name] = allele_name
-        left, right = int(left), int(right)
-        """
-        exons = []
-        for exon in exon_str.split(','):
-            exon_left, exon_right = exon.split('-')
-            exons.append([int(exon_left), int(exon_right)])
-        """
-        gene_loci[gene_name] = [allele_name, chr, left, right]
-        assert chr in chr_dic
-        chr_seq = chr_dic[chr]
-        assert left < right
-        assert right < len(chr_seq)
-        gene_seqs[gene_name] = chr_dic[chr][left:right+1]
-
-    # Read link information
-    Links, var_genes, allele_vars = {}, {}, {}
-    for line in open("%s.link" % base_fname):
-        var_id, alleles = line.strip().split('\t')
-        alleles = alleles.split()
-        assert not var_id in Links
-        Links[var_id] = alleles
-        for allele in alleles:
-            if allele not in allele_vars:
-                allele_vars[allele] = set()
-            allele_vars[allele].add(var_id)
-            gene_name = "HLA-%s" % (allele.split('*')[0])
-            var_genes[var_id] = gene_name
-
-    # gene alleles
-    allele_names = {}
-    for gene_name in genes.keys():
-        if gene_name not in allele_names:
-            allele_names[gene_name] = []
-        gene_name2 = gene_name.split('-')[1]
-        for allele_name in allele_vars.keys():
-            allele_name1 = allele_name.split('*')[0]
-            if gene_name2 == allele_name1:
-                allele_names[gene_name].append(allele_name)
-
-
-    # Read HLA variants, and link information
-    Vars, Var_list = {}, {}
-    for line in open("%s.snp" % base_fname):
-        var_id, var_type, chr, pos, data = line.strip().split('\t')
-        pos = int(pos)
-
-        # daehwan - for debugging purposes
-        if var_id not in var_genes:
-            continue
-        
-        assert var_id in var_genes
-        gene_name = var_genes[var_id]
-        """
-        if reference_type != "gene":
-            allele, dist = None, 0
-            for tmp_gene, values in refHLA_loci.items():
-                allele_name, chr, left, right, exons = values
-                if allele == None:
-                    allele = allele_name
-                    dist = abs(pos - left)
-                else:
-                    if dist > abs(pos - left):
-                        allele = allele_name
-                        dist = abs(pos - left)
-        """
-            
-        if not gene_name in Vars:
-            Vars[gene_name] = {}
-            assert not gene_name in Var_list
-            Var_list[gene_name] = []
-            
-        assert not var_id in Vars[gene_name]
-        """
-        left = 0
-        if reference_type != "gene":
-            _, _, left, _, _ = refHLA_loci[gene]
-        """
-        Vars[gene_name][var_id] = [var_type, pos, data]
-        Var_list[gene_name].append([pos, var_id])
-
-    for gene_name, in_var_list in Var_list.items():
-        Var_list[gene_name] = sorted(in_var_list)
-    def lower_bound(Var_list, pos):
-        low, high = 0, len(Var_list)
-        while low < high:
-            m = (low + high) / 2
-            m_pos = Var_list[m][0]
-            if m_pos < pos:
-                low = m + 1
-            elif m_pos > pos:
-                high = m
-            else:
-                assert m_pos == pos
-                while m > 0:
-                    if Var_list[m-1][0] < pos:
-                        break
-                    m -= 1
-                return m
-        return low       
-           
-    
-    # HLA gene allele lengths
-    """
-    HLA_lengths = {}
-    for HLA_gene, HLA_alleles in HLAs.items():
-        HLA_lengths[HLA_gene] = {}
-        for allele_name, seq in HLA_alleles.items():
-            HLA_lengths[HLA_gene][allele_name] = len(seq)
-    """
-
-    # Cigar regular expression
-    cigar_re = re.compile('\d+\w')
-
-    test_list = [[sorted(genes.keys())]]
-    for test_i in range(len(test_list)):
-        test_HLA_list = test_list[test_i]
-        for test_HLA_names in test_HLA_list:
-            print >> sys.stderr, "\t%s" % (test_HLA_names)
-            for gene in test_HLA_names:
-                ref_allele = genes[gene]
-                ref_seq = gene_seqs[gene]
-                # ref_exons = refHLA_loci[gene][-1]
-
-                # Read alignments
-                alignview_cmd = ["samtools",
-                                 "view"]
-                alignview_cmd += ["hla_input.bam"]
-                base_locus = 0
-                _, chr, left, right = gene_loci[gene]
-                base_locus = left
-                alignview_cmd += ["%s:%d-%d" % (chr, left + 1, right + 1)]
-
-                bamview_proc = subprocess.Popen(alignview_cmd,
-                                                stdout=subprocess.PIPE,
-                                                stderr=open("/dev/null", 'w'))
-
-                sort_read_cmd = ["sort", "-k", "1", "-n"]
-                alignview_proc = subprocess.Popen(sort_read_cmd,
-                                                  stdin=bamview_proc.stdout,
-                                                  stdout=subprocess.PIPE,
-                                                  stderr=open("/dev/null", 'w'))
-
-                # Count alleles
-                HLA_counts, HLA_cmpt = {}, {}
-                coverage = [0 for i in range(len(ref_seq) + 1)]
-                num_reads, total_read_len = 0, 0
-                prev_read_id = None
-                prev_exon = False
-                for line in alignview_proc.stdout:
-                    cols = line.strip().split()
-                    read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
-                    read_seq, qual = cols[9], cols[10]
-                    num_reads += 1
-                    total_read_len += len(read_seq)
-                    flag, pos = int(flag), int(pos)
-                    pos -= 1
-                    if pos < 0:
-                        continue
-
-                    if flag & 0x4 != 0:
-                        continue
-
-                    NM, Zs, MD = "", "", ""
-                    for i in range(11, len(cols)):
-                        col = cols[i]
-                        if col.startswith("Zs"):
-                            Zs = col[5:]
-                        elif col.startswith("MD"):
-                            MD = col[5:]
-                        elif col.startswith("NM"):
-                            NM = int(col[5:])
-
-                    if NM > num_mismatch:
-                        continue
-
-                    # daehwan - for debugging purposes
-                    debug = False
-                    if read_id in ["2339"] and False:
-                        debug = True
-                        print "read_id: %s)" % read_id, pos, cigar_str, "NM:", NM, MD, Zs
-                        print "            ", read_seq
-
-                    vars = []
-                    if Zs:
-                        vars = Zs.split(',')
-
-                    assert MD != ""
-                    MD_str_pos, MD_len = 0, 0
-                    read_pos, left_pos = 0, pos
-                    right_pos = left_pos
-                    cigars = cigar_re.findall(cigar_str)
-                    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-                    cmp_list = []
-                    for i in range(len(cigars)):
-                        cigar_op, length = cigars[i]
-                        if cigar_op == 'M':
-                            first = True
-                            MD_len_used = 0
-                            while True:
-                                if not first or MD_len == 0:
-                                    if MD[MD_str_pos].isdigit():
-                                        num = int(MD[MD_str_pos])
-                                        MD_str_pos += 1
-                                        while MD_str_pos < len(MD):
-                                            if MD[MD_str_pos].isdigit():
-                                                num = num * 10 + int(MD[MD_str_pos])
-                                                MD_str_pos += 1
-                                            else:
-                                                break
-                                        MD_len += num
-                                # Insertion or full match followed
-                                if MD_len >= length:
-                                    MD_len -= length
-                                    cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
-                                    break
-                                first = False
-                                read_base = read_seq[read_pos + MD_len]
-                                MD_ref_base = MD[MD_str_pos]
-                                MD_str_pos += 1
-                                assert MD_ref_base in "ACGT"
-                                cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])
-                                cmp_list.append(["mismatch", right_pos + MD_len, 1])
-                                MD_len_used = MD_len + 1
-                                MD_len += 1
-                                # Full match
-                                if MD_len == length:
-                                    MD_len = 0
-                                    break
-                        elif cigar_op == 'I':
-                            cmp_list.append(["insertion", right_pos, length])
-                        elif cigar_op == 'D':
-                            if MD[MD_str_pos] == '0':
-                                MD_str_pos += 1
-                            assert MD[MD_str_pos] == '^'
-                            MD_str_pos += 1
-                            while MD_str_pos < len(MD):
-                                if not MD[MD_str_pos] in "ACGT":
-                                    break
-                                MD_str_pos += 1
-                            cmp_list.append(["deletion", right_pos, length])
-                        elif cigar_op == 'S':
-                            cmp_list.append(["soft", right_pos, length])
-                        else:                    
-                            assert cigar_op == 'N'
-                            cmp_list.append(["intron", right_pos, length])
-
-                        if cigar_op in "MND":
-                            right_pos += length
-
-                        if cigar_op in "MIS":
-                            read_pos += length
-
-                    """
-                    exon = False
-                    for exon in ref_exons:
-                        exon_left, exon_right = exon
-                        if right_pos <= exon_left or pos > exon_right:
-                            continue
-                        else:
-                            exon = True
-                            break
-                    """
-
-                    if left_pos < base_locus or \
-                            right_pos - base_locus > len(ref_seq):
-                        continue
-                
-                    def add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, exon = True):
-                        max_count = max(HLA_count_per_read.values())
-                        cur_cmpt = set()
-                        for allele, count in HLA_count_per_read.items():
-                            if count < max_count:
-                                continue
-                            """
-                            if allele in exclude_allele_list:
-                                continue
-                            """
-                            cur_cmpt.add(allele)                    
-                            if not allele in HLA_counts:
-                                HLA_counts[allele] = 1
-                            else:
-                                HLA_counts[allele] += 1
-
-                        if len(cur_cmpt) == 0:
-                            return
-
-                        # daehwan - for debugging purposes                            
-                        alleles = ["", ""]
-                        # alleles = ["B*40:304", "B*40:02:01"]
-                        allele1_found, allele2_found = False, False
-                        for allele, count in HLA_count_per_read.items():
-                            if count < max_count:
-                                continue
-                            if allele == alleles[0]:
-                                allele1_found = True
-                            elif allele == alleles[1]:
-                                allele2_found = True
-                        if allele1_found != allele2_found:
-                            print alleles[0], HLA_count_per_read[alleles[0]]
-                            print alleles[1], HLA_count_per_read[alleles[1]]
-                            if allele1_found:
-                                print ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, HLA_count_per_read[alleles[1]]))
-                            else:
-                                print ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, HLA_count_per_read[alleles[0]]))
-                            print read_seq
-
-                        cur_cmpt = sorted(list(cur_cmpt))
-                        cur_cmpt = '-'.join(cur_cmpt)
-                        add = 1
-                        """
-                        if partial and not exon:
-                            add *= 0.2
-                        """
-                        if not cur_cmpt in HLA_cmpt:
-                            HLA_cmpt[cur_cmpt] = add
-                        else:
-                            HLA_cmpt[cur_cmpt] += add
-
-                    if read_id != prev_read_id:
-                        if prev_read_id != None:
-                            add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, prev_exon)
-
-                        HLA_count_per_read = {}
-                        for HLA_name in allele_names[gene]:
-                            if HLA_name.find("BACKBONE") != -1:
-                                continue
-                            HLA_count_per_read[HLA_name] = 0
-
-                    def add_count(var_id, add):
-                        assert var_id in Links
-                        alleles = Links[var_id]
-                        for allele in alleles:
-                            if allele.find("BACKBONE") != -1:
-                                continue
-                            HLA_count_per_read[allele] += add
-                            # daehwan - for debugging purposes
-                            if debug:
-                                if allele in ["DQA1*05:05:01:01", "DQA1*05:05:01:02"]:
-                                    print allele, add, var_id
-
-                    # Decide which allele(s) a read most likely came from
-                    # also sanity check - read length, cigar string, and MD string
-                    for var_id, data in Vars[gene].items():
-                        var_type, var_pos, var_data = data
-                        if var_type != "deletion":
-                            continue
-                        if left_pos >= var_pos and right_pos <= var_pos + int(var_data):
-                            add_count(var_id, -1)                            
-                    ref_pos, read_pos, cmp_cigar_str, cmp_MD = left_pos, 0, "", ""
-                    cigar_match_len, MD_match_len = 0, 0            
-                    for cmp in cmp_list:
-                        type = cmp[0]
-                        length = cmp[2]
-                        if type == "match":
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos + length <= var_pos:
-                                    break
-                                if ref_pos <= var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "insertion":
-                                        if ref_pos < var_pos and ref_pos + length > var_pos + len(var_data):
-                                            add_count(var_id, -1)
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, Links[var_id]
-                                    elif var_type == "deletion":
-                                        del_len = int(var_data)
-                                        if ref_pos < var_pos and ref_pos + length > var_pos + del_len:
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, Links[var_id], -1, Vars[gene][var_id]
-                                            # Check if this might be one of the two tandem repeats (the same left coordinate)
-                                            cmp_left, cmp_right = cmp[1], cmp[1] + cmp[2]
-                                            test1_seq1 = ref_seq[cmp_left-base_locus:cmp_right-base_locus]
-                                            test1_seq2 = ref_seq[cmp_left-base_locus:var_pos-base_locus] + ref_seq[var_pos + del_len - base_locus:cmp_right + del_len - base_locus]
-                                            # Check if this happens due to small repeats (the same right coordinate - e.g. 19 times of TTTC in DQA1*05:05:01:02)
-                                            cmp_left -= read_pos
-                                            cmp_right += (len(read_seq) - read_pos - cmp[2])
-                                            test2_seq1 = ref_seq[cmp_left+int(var_data)-base_locus:cmp_right-base_locus]
-                                            test2_seq2 = ref_seq[cmp_left-base_locus:var_pos-base_locus] + ref_seq[var_pos+int(var_data)-base_locus:cmp_right-base_locus]
-                                            if test1_seq1 != test1_seq2 and test2_seq1 != test2_seq2:
-                                                add_count(var_id, -1)
-                                    else:
-                                        if debug:
-                                            print cmp, var_id, Links[var_id], -1
-                                        add_count(var_id, -1)
-                                var_idx += 1
-
-                            read_pos += length
-                            ref_pos += length
-                            cigar_match_len += length
-                            MD_match_len += length
-                        elif type == "mismatch":
-                            read_base = read_seq[read_pos]
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos < var_pos:
-                                    break
-                                if ref_pos == var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "single":
-                                        if var_data == read_base:
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, 1, var_data, read_base, Links[var_id]
-
-                                            # daehwan - for debugging purposes
-                                            if False:
-                                                read_qual = ord(qual[read_pos])
-                                                add_count(var_id, (read_qual - 60) / 60.0)
-                                            else:
-                                                add_count(var_id, 1)
-                                        # daehwan - check out if this routine is appropriate
-                                        # else:
-                                        #    add_count(var_id, -1)
-                                var_idx += 1
-                            cmp_MD += ("%d%s" % (MD_match_len, ref_seq[ref_pos-base_locus]))
-                            MD_match_len = 0
-                            cigar_match_len += 1
-                            read_pos += 1
-                            ref_pos += 1
-                        elif type == "insertion":
-                            ins_seq = read_seq[read_pos:read_pos+length]
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            # daehwan - for debugging purposes
-                            if debug:
-                                print left_pos, cigar_str, MD, vars
-                                print ref_pos, ins_seq, Var_list[gene][var_idx], Vars[gene][Var_list[gene][var_idx][1]]
-                                # sys.exit(1)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos < var_pos:
-                                    break
-                                if ref_pos == var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "insertion":                                
-                                        if var_data == ins_seq:
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, 1, Links[var_id]
-                                            add_count(var_id, 1)
-                                var_idx += 1
-
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            read_pos += length
-                            cmp_cigar_str += ("%dI" % length)
-                        elif type == "deletion":
-                            del_len = length
-                            # Deletions can be shifted bidirectionally
-                            temp_ref_pos = ref_pos
-                            while temp_ref_pos > 0:
-                                last_bp = ref_seq[temp_ref_pos + del_len - 1 - base_locus]
-                                prev_bp = ref_seq[temp_ref_pos - 1 - base_locus]
-                                if last_bp != prev_bp:
-                                    break
-                                temp_ref_pos -= 1
-                            var_idx = lower_bound(Var_list[gene], temp_ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if temp_ref_pos < var_pos:
-                                    first_bp = ref_seq[temp_ref_pos - base_locus]
-                                    next_bp = ref_seq[temp_ref_pos + del_len - base_locus]
-                                    if first_bp == next_bp:
-                                        temp_ref_pos += 1
-                                        continue
-                                    else:
-                                        break
-                                if temp_ref_pos == var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "deletion":
-                                        var_len = int(var_data)
-                                        if var_len == length:
-                                            if debug:
-                                                print cmp, var_id, 1, Links[var_id]
-                                                print ref_seq[var_pos - 10-base_locus:var_pos-base_locus], ref_seq[var_pos-base_locus:var_pos+int(var_data)-base_locus], ref_seq[var_pos+int(var_data)-base_locus:var_pos+int(var_data)+10-base_locus]
-                                            add_count(var_id, 1)
-                                var_idx += 1
-
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            cmp_MD += ("%d" % MD_match_len)
-                            MD_match_len = 0
-                            cmp_cigar_str += ("%dD" % length)
-                            cmp_MD += ("^%s" % ref_seq[ref_pos-base_locus:ref_pos+length-base_locus])
-                            ref_pos += length
-                        elif type == "soft":
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            read_pos += length
-                            cmp_cigar_str += ("%dS" % length)
-                        else:
-                            assert type == "intron"
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            cmp_cigar_str += ("%dN" % length)
-                            ref_pos += length                    
-                    if cigar_match_len > 0:
-                        cmp_cigar_str += ("%dM" % cigar_match_len)
-                    cmp_MD += ("%d" % MD_match_len)
-                    if read_pos != len(read_seq) or \
-                            cmp_cigar_str != cigar_str or \
-                            cmp_MD != MD:
-                        print >> sys.stderr, "Error:", cigar_str, MD
-                        print >> sys.stderr, "\tcomputed:", cmp_cigar_str, cmp_MD
-                        print >> sys.stderr, "\tcmp list:", cmp_list
-                        assert False            
-
-                    prev_read_id = read_id
-                    # prev_exon = exon
-
-                if num_reads <= 0:
-                    continue
-
-                if prev_read_id != None:
-                    add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read)
-
-                HLA_counts = [[allele, count] for allele, count in HLA_counts.items()]
-                def HLA_count_cmp(a, b):
-                    if a[1] != b[1]:
-                        return b[1] - a[1]
-                    assert a[0] != b[0]
-                    if a[0] < b[0]:
-                        return -1
-                    else:
-                        return 1
-                HLA_counts = sorted(HLA_counts, cmp=HLA_count_cmp)
-                for count_i in range(len(HLA_counts)):
-                    count = HLA_counts[count_i]
-                    print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                    if count_i >= 9:
-                        break
-                print >> sys.stderr
-
-                def normalize(prob):
-                    total = sum(prob.values())
-                    for allele, mass in prob.items():
-                        prob[allele] = mass / total
-
-                def normalize2(prob, length):
-                    total = 0
-                    for allele, mass in prob.items():
-                        assert allele in length
-                        total += (mass / length[allele])
-                    for allele, mass in prob.items():
-                        assert allele in length
-                        prob[allele] = mass / length[allele] / total
-
-                def prob_diff(prob1, prob2):
-                    diff = 0.0
-                    for allele in prob1.keys():
-                        if allele in prob2:
-                            diff += abs(prob1[allele] - prob2[allele])
-                        else:
-                            diff += prob1[allele]
-                    return diff
-
-                def HLA_prob_cmp(a, b):
-                    if a[1] != b[1]:
-                        if a[1] < b[1]:
-                            return 1
-                        else:
-                            return -1
-                    assert a[0] != b[0]
-                    if a[0] < b[0]:
-                        return -1
-                    else:
-                        return 1
-
-                HLA_prob, HLA_prob_next = {}, {}
-                for cmpt, count in HLA_cmpt.items():
-                    alleles = cmpt.split('-')
-                    for allele in alleles:
-                        if allele not in HLA_prob:
-                            HLA_prob[allele] = 0.0
-                        HLA_prob[allele] += (float(count) / len(alleles))
-
-                """
-                assert gene in HLA_lengths
-                HLA_length = HLA_lengths[gene]
-                """
-                HLA_length = {}
-                
-                # normalize2(HLA_prob, HLA_length)
-                normalize(HLA_prob)
-                def next_prob(HLA_cmpt, HLA_prob, HLA_length):
-                    HLA_prob_next = {}
-                    for cmpt, count in HLA_cmpt.items():
-                        alleles = cmpt.split('-')
-                        alleles_prob = 0.0
-                        for allele in alleles:
-                            assert allele in HLA_prob
-                            alleles_prob += HLA_prob[allele]
-                        for allele in alleles:
-                            if allele not in HLA_prob_next:
-                                HLA_prob_next[allele] = 0.0
-                            HLA_prob_next[allele] += (float(count) * HLA_prob[allele] / alleles_prob)
-                    # normalize2(HLA_prob_next, HLA_length)
-                    normalize(HLA_prob_next)
-                    return HLA_prob_next
-
-                diff, iter = 1.0, 0
-                while diff > 0.0001 and iter < 1000:
-                    HLA_prob_next = next_prob(HLA_cmpt, HLA_prob, HLA_length)
-                    diff = prob_diff(HLA_prob, HLA_prob_next)
-                    HLA_prob = HLA_prob_next
-                    iter += 1
-
-                """
-                for allele, prob in HLA_prob.items():
-                    allele_len = len(HLAs[gene][allele])
-                    HLA_prob[allele] /= float(allele_len)
-                normalize(HLA_prob)
-                """
-                HLA_prob = [[allele, prob] for allele, prob in HLA_prob.items()]
-
-                HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-                success = [False for i in range(len(test_HLA_names))]
-                found_list = [False for i in range(len(test_HLA_names))]
-                for prob_i in range(len(HLA_prob)):
-                    prob = HLA_prob[prob_i]
-                    print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, prob[0], prob[1] * 100.0)
-                    if prob_i >= 9:
-                        break
-                print >> sys.stderr
-
-                """
-                if len(test_HLA_names) == 2:
-                    HLA_prob, HLA_prob_next = {}, {}
-                    for cmpt, count in HLA_cmpt.items():
-                        alleles = cmpt.split('-')
-                        for allele1 in alleles:
-                            for allele2 in HLA_names[gene]:
-                                if allele1 < allele2:
-                                    allele_pair = "%s-%s" % (allele1, allele2)
-                                else:
-                                    allele_pair = "%s-%s" % (allele2, allele1)
-                                if not allele_pair in HLA_prob:
-                                    HLA_prob[allele_pair] = 0.0
-                                HLA_prob[allele_pair] += (float(count) / len(alleles))
-
-                    if len(HLA_prob) <= 0:
-                        continue
-
-                    # Choose top allele pairs
-                    def choose_top_alleles(HLA_prob):
-                        HLA_prob_list = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-                        HLA_prob_list = sorted(HLA_prob_list, cmp=HLA_prob_cmp)
-                        HLA_prob = {}
-                        best_prob = HLA_prob_list[0][1]
-                        for i in range(len(HLA_prob_list)):
-                            allele_pair, prob = HLA_prob_list[i]
-                            if prob * 2 <= best_prob:
-                                break                        
-                            HLA_prob[allele_pair] = prob
-                        normalize(HLA_prob)
-                        return HLA_prob
-                    HLA_prob = choose_top_alleles(HLA_prob)
-
-                    def next_prob(HLA_cmpt, HLA_prob):
-                        HLA_prob_next = {}
-                        for cmpt, count in HLA_cmpt.items():
-                            alleles = cmpt.split('-')
-                            prob = 0.0
-                            for allele in alleles:
-                                for allele_pair in HLA_prob.keys():
-                                    if allele in allele_pair:
-                                        prob += HLA_prob[allele_pair]
-                            for allele in alleles:
-                                for allele_pair in HLA_prob.keys():
-                                    if not allele in allele_pair:
-                                        continue
-                                    if allele_pair not in HLA_prob_next:
-                                        HLA_prob_next[allele_pair] = 0.0
-                                    HLA_prob_next[allele_pair] += (float(count) * HLA_prob[allele_pair] / prob)
-                        normalize(HLA_prob_next)
-                        return HLA_prob_next
-
-                    diff, iter = 1.0, 0
-                    while diff > 0.0001 and iter < 1000:
-                        HLA_prob_next = next_prob(HLA_cmpt, HLA_prob)
-                        diff = prob_diff(HLA_prob, HLA_prob_next)
-                        HLA_prob = HLA_prob_next
-                        HLA_prob = choose_top_alleles(HLA_prob)
-                        iter += 1
-
-                    HLA_prob = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-                    HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-
-                    success = [False]
-                    for prob_i in range(len(HLA_prob)):
-                        allele_pair, prob = HLA_prob[prob_i]
-                        allele1, allele2 = allele_pair.split('-')
-                        if best_alleles and prob_i < 1:
-                            print >> sys.stdout, "PairModel %s (abundance: %.2f%%)" % (allele_pair, prob * 100.0)
-                        if simulation:
-                            if allele1 in test_HLA_names and allele2 in test_HLA_names:
-                                rank_i = prob_i
-                                while rank_i > 0:
-                                    if HLA_prob[rank_i-1][1] == prob:                                        
-                                        rank_i -= 1
-                                    else:
-                                        break
-                                print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, allele_pair, prob * 100.0)
-                                if rank_i == 0:
-                                    success[0] = True
-                                break
-                        print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, allele_pair, prob * 100.0)
-                        if not simulation and prob_i >= 9:
-                            break
-                    print >> sys.stderr
-                """
-
-    # Read variants with clinical significance
-    clnsigs = {}
-    for line in open("%s.clnsig" % base_fname):
-        var_id, var_gene, var_clnsig = line.strip().split('\t')
-        clnsigs[var_id] = [var_gene, var_clnsig]
-
-    vars, Var_list = {}, {}
-    for line in open("%s.snp" % base_fname):
-        var_id, type, chr, left, data = line.strip().split()
-        if var_id not in clnsigs:
-            continue
-        left = int(left)
-        if type == "deletion":
-            data = int(data)
-        vars[var_id] = [chr, left, type, data]
-        if chr not in Var_list:
-            Var_list[chr] = []
-        Var_list[chr].append([left, var_id])
-
-    var_counts = {}
-
-    # Read alignments
-    alignview_cmd = ["samtools",
-                     "view",
-                     "hla_input.bam"]
-    bamview_proc = subprocess.Popen(alignview_cmd,
-                                    stdout=subprocess.PIPE,
-                                    stderr=open("/dev/null", 'w'))
-
-    for line in bamview_proc.stdout:
-        cols = line.strip().split()
-        read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
-        read_seq, qual = cols[9], cols[10]
-        flag, pos = int(flag), int(pos)
-        pos -= 1
-        if pos < 0:
-            continue
-
-        if flag & 0x4 != 0:
-            continue
-
-        if chr not in Var_list:
-            continue
-
-        assert chr in chr_dic
-        chr_seq = chr_dic[chr]
-
-        NM, Zs, MD, NH = "", "", "", ""
-        for i in range(11, len(cols)):
-            col = cols[i]
-            if col.startswith("Zs"):
-                Zs = col[5:]
-            elif col.startswith("MD"):
-                MD = col[5:]
-            elif col.startswith("NM"):
-                NM = int(col[5:])
-            elif col.startswith("NH"):
-                NH = int(col[5:])
-
-        assert NH != ""
-        NH = int(NH)
-        if NH > 1:
-            continue
-
-        if NM > num_mismatch:
-            continue
-
-        read_vars = []
-        if Zs:
-            read_vars = Zs.split(',')
-        for read_var in read_vars:
-            _, _, var_id = read_var.split('|')
-            if var_id not in clnsigs:
-                continue
-            if var_id not in var_counts:
-                var_counts[var_id] = [1, 0]
-            else:
-                var_counts[var_id][0] += 1
-
-        assert MD != ""
-        MD_str_pos, MD_len = 0, 0
-        read_pos, left_pos = 0, pos
-        right_pos = left_pos
-        cigars = cigar_re.findall(cigar_str)
-        cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-        cmp_list = []
-        for i in range(len(cigars)):
-            cigar_op, length = cigars[i]
-            if cigar_op == 'M':
-                chr_var_list = Var_list[chr]
-                var_idx = lower_bound(chr_var_list, right_pos)
-                while var_idx < len(chr_var_list):
-                    var_pos, var_id = chr_var_list[var_idx]
-                    if var_pos >= right_pos + length:
-                        break
-                    if var_pos >= right_pos:
-                        assert var_id in vars
-                        _, _, var_type, var_data = vars[var_id]
-                        contradict = False
-                        if var_type == "single":
-                            contradict = (read_seq[read_pos + var_pos - right_pos] == chr_seq[var_pos])
-                        elif var_type == "insertion":
-                            contradict = (right_pos < var_pos)
-                        else:
-                            contradict = True
-                        if contradict:
-                            if var_id not in var_counts:
-                                var_counts[var_id] = [0, 1]
-                            else:
-                                var_counts[var_id][1] += 1
-                    
-                    var_idx += 1
-                    
-            if cigar_op in "MND":
-                right_pos += length
-
-            if cigar_op in "MIS":
-                read_pos += length
-
-    for var_id, counts in var_counts.items():
-        if counts[0] < 2: # or counts[0] * 3 < counts[1]:
-            continue
-        assert var_id in vars
-        var_chr, var_left, var_type, var_data = vars[var_id]
-        assert var_id in clnsigs
-        var_gene, var_clnsig = clnsigs[var_id]
-        print >> sys.stderr, "\t\t\t%s %s: %s:%d %s %s (%s): %d-%d" % \
-                (var_gene, var_id, var_chr, var_left, var_type, var_data, var_clnsig, counts[0], counts[1])
-
-
-                
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='HISAT2 genotyping')
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
-                        type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome (default: gene)")
-    parser.add_argument("--base-name",
-                        dest="base_fname",
-                        type=str,
-                        default="genotype_genome",
-                        help="base filename for genotype genome")
-    parser.add_argument('-f',
-                        dest='fastq',
-                        action='store_false',
-                        help='FASTA file')    
-    parser.add_argument("-U",
-                        dest="read_fname_U",
-                        type=str,
-                        default="",
-                        help="filename for single-end reads")
-    parser.add_argument("-1",
-                        dest="read_fname_1",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")
-    parser.add_argument("-2",
-                        dest="read_fname_2",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")    
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--simulate-interval",
-                        dest="simulate_interval",
-                        type=int,
-                        default=1,
-                        help="Reads simulated at every these base pairs (default: 1)")
-    parser.add_argument("--num-mismatch",
-                        dest="num_mismatch",
-                        type=int,
-                        default=0,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 0)")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument("--daehwan-debug",
-                        dest="daehwan_debug",
-                        type=str,
-                        default="",
-                        help="e.g., test_id:10,read_id:10000,basic_test")
-
-    args = parser.parse_args()
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
-        sys.exit(1)
-    daehwan_debug = {}
-    if args.daehwan_debug != "":
-        for item in args.daehwan_debug.split(','):
-            if ':' in item:
-                key, value = item.split(':')
-                daehwan_debug[key] = value
-            else:
-                daehwan_debug[item] = 1
-
-    if args.read_fname_U != "":
-        read_fnames = [args.read_fname_U]
-    else:
-        if args.read_fname_1 == "" or args.read_fname_2 == "":
-            print >> sys.stderr, "Error: please specify read file names correctly: -U or -1 and -2"
-            sys.exit(1)
-        read_fnames = [args.read_fname_1, args.read_fname_2] 
-
-    random.seed(1)
-    genotype(args.reference_type,
-             args.base_fname,
-             args.fastq,
-             read_fnames,
-             args.threads,
-             args.simulate_interval,
-             args.num_mismatch,
-             args.verbose,
-             daehwan_debug)
diff --git a/hisat2_inspect.cpp b/hisat2_inspect.cpp
index 88cc284..6475fde 100644
--- a/hisat2_inspect.cpp
+++ b/hisat2_inspect.cpp
@@ -31,6 +31,8 @@
 
 using namespace std;
 
+MemoryTally gMemTally;
+
 static bool showVersion = false; // just print version and quit?
 int verbose             = 0;  // be talkative
 static int names_only   = 0;  // just print the sequence names in the index
@@ -96,7 +98,7 @@ static void printUsage(ostream& out) {
     << "  --ss               Print splice sites" << endl
     << "  --ss-all           Print splice sites including those not in the global index" << endl
     << "  --exon             Print exons" << endl
-	<< "  -e/--bt2-ref       Reconstruct reference from ." << gfm_ext << " (slow, preserves colors)" << endl
+	<< "  -e/--ht2-ref       Reconstruct reference from ." << gfm_ext << " (slow, preserves colors)" << endl
 	<< "  -v/--verbose       Verbose output (for debugging)" << endl
 	<< "  -h/--help          print detailed description of tool and its options" << endl
 	<< "  --help             print this usage message" << endl
@@ -365,7 +367,8 @@ static void print_snps(
                      verbose,              // be talkative?
                      verbose,              // be talkative at startup?
                      false,                // pass up memory exceptions?
-                     false);               // sanity check?
+                     false,                // sanity check?
+                     false);               // use haplotypes?
     gfm.loadIntoMemory(
                        -1,     // need entire reverse
                        true,   // load SA sample
@@ -442,11 +445,12 @@ static void print_splicesites(
                      false,                // load SA sample?
                      false,                // load ftab?
                      false,                // load rstarts?
-                     true,                // load splice sites?
+                     true,                 // load splice sites?
                      verbose,              // be talkative?
                      verbose,              // be talkative at startup?
                      false,                // pass up memory exceptions?
-                     false);               // sanity check?
+                     false,                // sanity check?
+                     false);               // use haplotypes?
     gfm.loadIntoMemory(
                        -1,     // need entire reverse
                        true,   // load SA sample
@@ -517,7 +521,8 @@ static void print_exons(
                      verbose,              // be talkative?
                      verbose,              // be talkative at startup?
                      false,                // pass up memory exceptions?
-                     false);               // sanity check?
+                     false,                // sanity check?
+                     false);               // use haplotypes?
     gfm.loadIntoMemory(
                        -1,     // need entire reverse
                        true,   // load SA sample
@@ -586,11 +591,12 @@ static void print_index_summary(
                      false,                // load SA sample?
                      false,                // load ftab?
                      false,                // load rstarts?
-                     true,                // load splice sites?
+                     true,                 // load splice sites?
                      verbose,              // be talkative?
                      verbose,              // be talkative at startup?
                      false,                // pass up memory exceptions?
-                     false);               // sanity check?
+                     false,                // sanity check?
+                     false);               // use haplotypes?
 	EList<string> p_refnames;
 	readEbwtRefnames<index_t>(fname, p_refnames);
     cout << "Index version" << "\t2." << major << '.' << minor;
@@ -672,7 +678,8 @@ static void driver(
                                        false,                // be talkative?
                                        false,                // be talkative at startup?
                                        false,                // pass up memory exceptions?
-                                       false);               // sanity check?
+                                       false,                // sanity check?
+                                       false);               // use haplotypes?
         
         gfm.loadIntoMemory(
                            -1,     // need entire reverse
diff --git a/hisat2_test_BRCA_genotyping.py b/hisat2_test_BRCA_genotyping.py
deleted file mode 100755
index 68f26bb..0000000
--- a/hisat2_test_BRCA_genotyping.py
+++ /dev/null
@@ -1,827 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2016, Daehwan Kim <infphilo at gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import inspect
-import random
-from argparse import ArgumentParser, FileType
-
-
-"""
-"""
-def test_BRCA_genotyping(reference_type,
-                         brca_list,
-                         aligners,
-                         read_fname,
-                         alignment_fname,
-                         threads,
-                         simulate_interval,
-                         enable_coverage,
-                         num_mismatch,
-                         verbose,
-                         daehwan_debug):
-    # Is it simulation?
-    simulation = (not read_fname and not alignment_fname)
-    
-    # File location for ClinVar
-    clinvar_url_base = "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38"
-    clinvar_fname = "clinvar_20160203.vcf.gz"
-
-    if not os.path.exists(clinvar_fname):
-        os.system("wget %s/%s" % (clinvar_url_base, clinvar_fname))
-        assert os.path.exists(clinvar_fname)
-    
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(test_BRCA_genotyping))
-    ex_path = os.path.dirname(curr_script)
-
-    def check_files(fnames):
-        for fname in fnames:
-            if not os.path.exists(fname):
-                return False
-        return True
-
-    # Download HISAT2 index
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-    if not check_files(HISAT2_fnames):
-        os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz")
-        hisat2_inspect = os.path.join(ex_path, "hisat2-inspect")
-        os.system("%s grch38/genome > genome.fa" % hisat2_inspect)
-        os.system("samtools faidx genome.fa")
-
-    # Check if the pre-existing files (hla*) are compatible with the current parameter setting
-    if os.path.exists("brca.ref"):
-        left = 0
-        BRCA_genes = set()
-        for line in open("brca.ref"):
-            BRCA_name, chr, left, _ = line.strip().split()
-            BRCA_gene = BRCA_name.split('*')[0]
-            BRCA_genes.add(BRCA_gene)
-            left = int(left)
-        delete_brca_files = False
-        if reference_type == "gene":
-            if left > 0:
-                delete_hla_files = True
-        elif reference_type == "chromosome":
-            if left == 0:
-                delete_hla_files = True
-        else:
-            assert reference_type == "genome"
-        if not set(brca_list).issubset(BRCA_genes):
-            delete_brca_files = True
-        if delete_brca_files:
-            os.system("rm brca*")
-
-    # Extract BRCA variants, backbone sequence, and other sequeces
-    BRCA_fnames = ["brca_backbone.fa",
-                   "brca.ref",
-                   "brca.snp",
-                   "brca.haplotype",
-                   "brca.clnsig"]
-
-    if not check_files(BRCA_fnames):
-        extract_brca_script = os.path.join(ex_path, "hisat2_extract_snps_haplotypes_VCF.py")
-        extract_cmd = [extract_brca_script,
-                       "genome.fa",
-                       "--base", "brca",
-                       "--reference-type", "gene",
-                       "--genotype-vcf", clinvar_fname,
-                       "--genotype-gene-list", ','.join(brca_list),
-                       "--extra-files"]
-        extract_cmd += ["--inter-gap", "30",
-                        "--intra-gap", "50"]
-
-        proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()
-        if not check_files(BRCA_fnames):
-            print >> sys.stderr, "Error: extract_BRCA_vars failed!"
-            sys.exit(1)
-
-    # Build HISAT2 graph indexes based on the above information
-    HLA_hisat2_graph_index_fnames = ["brca.graph.%d.ht2" % (i+1) for i in range(8)]
-    if not check_files(HLA_hisat2_graph_index_fnames):
-        hisat2_build = os.path.join(ex_path, "hisat2-build")
-        build_cmd = [hisat2_build,
-                     "-p", str(threads),
-                     "--snp", "brca.snp",
-                     "--haplotype", "brca.haplotype",
-                     "brca_backbone.fa",
-                     "brca.graph"]
-        proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()        
-        if not check_files(HLA_hisat2_graph_index_fnames):
-            print >> sys.stderr, "Error: indexing BRCA genes failed!  Perhaps, you may have forgotten to build hisat2 executables?"
-            sys.exit(1)
-
-    """
-    # Build HISAT2 linear indexes based on the above information
-    HLA_hisat2_linear_index_fnames = ["hla.linear.%d.ht2" % (i+1) for i in range(8)]
-    if reference_type == "gene" and not check_files(HLA_hisat2_linear_index_fnames):
-        hisat2_build = os.path.join(ex_path, "hisat2-build")
-        build_cmd = [hisat2_build,
-                     "hla_backbone.fa,hla_sequences.fa",
-                     "hla.linear"]
-        proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()        
-        if not check_files(HLA_hisat2_graph_index_fnames):
-            print >> sys.stderr, "Error: indexing HLA failed!"
-            sys.exit(1)
-
-    # Build Bowtie2 indexes based on the above information
-    HLA_bowtie2_index_fnames = ["hla.%d.bt2" % (i+1) for i in range(4)]
-    HLA_bowtie2_index_fnames += ["hla.rev.%d.bt2" % (i+1) for i in range(2)]
-    if reference_type == "gene" and not check_files(HLA_bowtie2_index_fnames):
-        build_cmd = ["bowtie2-build",
-                     "hla_backbone.fa,hla_sequences.fa",
-                     "hla"]
-        proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'))
-        proc.communicate()        
-        if not check_files(HLA_bowtie2_index_fnames):
-            print >> sys.stderr, "Error: indexing HLA failed!"
-            sys.exit(1)
-    """
-
-    # Read BRCA variants
-    Vars, Var_list = {}, {}
-    for line in open("brca.snp"):
-        var_id, var_type, allele, pos, data = line.strip().split('\t')
-        pos = int(pos)
-        if reference_type != "gene":
-            allele, dist = None, 0
-            for tmp_gene, values in refHLA_loci.items():
-                allele_name, chr, left, right = values
-                if allele == None:
-                    allele = allele_name
-                    dist = abs(pos - left)
-                else:
-                    if dist > abs(pos - left):
-                        allele = allele_name
-                        dist = abs(pos - left)
-            
-        gene = allele
-        if not gene in Vars:
-            Vars[gene] = {}
-            assert not gene in Var_list
-            Var_list[gene] = []
-            
-        assert not var_id in Vars[gene]
-        left = 0
-        if reference_type != "gene":
-            _, _, left, _ = refHLA_loci[gene]
-        Vars[gene][var_id] = [var_type, pos - left, data]
-        Var_list[gene].append([pos - left, var_id])
-
-    for gene, in_var_list in Var_list.items():
-        Var_list[gene] = sorted(in_var_list)
-    def lower_bound(Var_list, pos):
-        low, high = 0, len(Var_list)
-        while low < high:
-            m = (low + high) / 2
-            m_pos = Var_list[m][0]
-            if m_pos < pos:
-                low = m + 1
-            elif m_pos > pos:
-                high = m
-            else:
-                assert m_pos == pos
-                while m > 0:
-                    if Var_list[m-1][0] < pos:
-                        break
-                    m -= 1
-                return m
-        return low
-
-    # Read BRCA variants' clinical significance
-    Vars_CLNSIG = {}
-    for line in open("brca.clnsig"):
-        var_id, CLNSIG = line.strip().split('\t')
-        assert var_id not in Vars_CLNSIG
-        Vars_CLNSIG[var_id] = CLNSIG
-
-    BRCAs = {}
-    def read_BRCA_alleles(fname, BRCAs):
-        for line in open(fname):
-            if line.startswith(">"):
-                BRCA_gene = line.strip().split()[0][1:]
-                if not BRCA_gene in BRCAs:
-                    BRCAs[BRCA_gene] = ""
-            else:
-                BRCAs[BRCA_gene] += line.strip()
-        return BRCAs
-    if reference_type == "gene":
-        read_BRCA_alleles("brca_backbone.fa", BRCAs)
-    # read_BRCA_alleles("brca_sequences.fa", BRCAs)
-
-    # Test BRCA genotyping
-    test_list = []
-    if simulation:
-        test_passed = {}
-        test_list = []
-        genes = list(set(brca_list) & BRCA_genes)
-        for gene in genes:
-            for var_id in Vars[gene].keys():
-                if var_id not in Vars_CLNSIG:
-                    continue
-                test_list.append([gene, [var_id]])
-    else:
-        test_list = [brca_list]
-
-    for test_i in range(len(test_list)):
-        if "test_id" in daehwan_debug:
-            daehwan_test_ids = daehwan_debug["test_id"].split('-')
-            if str(test_i + 1) not in daehwan_test_ids:
-                continue
-
-        print >> sys.stderr, "Test %d" % (test_i + 1)
-        gene, test_var_ids = test_list[test_i]
-
-        # daehwan - for debugging purposes
-        # test_HLA_list = [["A*11:50Q", "A*11:01:01:01", "A*01:01:01:01"]]
-        print >> sys.stderr, "\t%s" % (gene)
-        for test_var_id in test_var_ids:
-            assert test_var_id in Vars[gene]
-            var_type, var_pos, var_data = Vars[gene][test_var_id]
-            
-            print >> sys.stderr, "\t\t%s:" % (test_var_id), var_type, var_pos, var_data, Vars_CLNSIG[test_var_id]
-
-        var_true_counts = {}
-        if simulation:
-            BRCA_seq = BRCAs[gene]
-            BRCA_reads_1, BRCA_reads_2 = [], []
-            # Simulate reads from two HLA alleles
-            def simulate_reads(seq, test_vars, simulate_interval = 1, frag_len = 250, read_len = 100):
-                comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
-                reads_1, reads_2 = [], []
-                v = 0
-                for i in range(0, len(seq) - frag_len + 1, simulate_interval):
-                    while v < len(test_vars):
-                        var_type, var_pos, var_data = test_vars[v]
-                        if var_type == 'D':
-                            var_pos += (int(var_data) - 1)
-                        if var_pos >= i:
-                            break
-                        v += 1
-                        
-                    def simulate_read(left, right, test_var):
-                        include_var = False
-                        if test_var != None:
-                            var_type, var_pos, var_data = test_var
-                            var_pos2 = var_pos
-                            if var_type == 'deletion':
-                                var_pos2 += int(var_data)
-                            if var_pos > left and var_pos2 < right:
-                                include_var = True
-
-                        if include_var:
-                            if var_type == 'single':
-                                return seq[left:var_pos] + var_data + seq[var_pos+1:right]
-                            elif var_type == 'deletion':
-                                return seq[left:var_pos] + seq[var_pos2:right]
-                            else:
-                                assert var_type == 'insertion'
-                                return seq[left:var_pos] + var_data + seq[var_pos:right]
-                        else:
-                            return seq[left:right]
-
-                    test_var = None
-                    if v < len(test_vars):
-                        test_var = test_vars[v]
-                    reads_1.append(simulate_read(i, i+read_len, test_var))
-                    tmp_read_2 = simulate_read(i+frag_len-read_len,i+frag_len, test_var)
-                    tmp_read_2 = reversed(tmp_read_2)
-                    read_2 = ""
-                    for s in tmp_read_2:
-                        if s in comp_table:
-                            read_2 += comp_table[s]
-                        else:
-                            read_2 += s
-                    reads_2.append(read_2)
-                return reads_1, reads_2
-
-            test_vars = []
-            for test_var_id in test_var_ids:
-                assert test_var_id in Vars[gene]
-                test_vars.append(Vars[gene][test_var_id])
-
-            tmp_reads_1, tmp_reads_2 = simulate_reads(BRCA_seq, test_vars, simulate_interval)
-            BRCA_reads_1 += tmp_reads_1
-            BRCA_reads_2 += tmp_reads_2
-
-            # Write reads into a fasta read file
-            def write_reads(reads, idx):
-                read_file = open('brca_input_%d.fa' % idx, 'w')
-                for read_i in range(len(reads)):
-                    print >> read_file, ">%d" % (read_i + 1)
-                    print >> read_file, reads[read_i]
-                read_file.close()
-            write_reads(BRCA_reads_1, 1)
-            write_reads(BRCA_reads_2, 2)
-
-        for aligner, index_type in aligners:
-            if index_type == "graph":
-                print >> sys.stderr, "\n\t\t%s %s on %s" % (aligner, index_type, reference_type)
-            else:
-                print >> sys.stderr, "\n\t\t%s %s" % (aligner, index_type)
-
-            if alignment_fname == "":
-                # Align reads, and sort the alignments into a BAM file
-                if aligner == "hisat2":
-                    hisat2 = os.path.join(ex_path, "hisat2")
-                    aligner_cmd = [hisat2,
-                                   "--no-unal",
-                                   "--mm"]
-                    if index_type == "linear":
-                        aligner_cmd += ["-k", "10"]
-                    aligner_cmd += ["-x", "brca.%s" % index_type]
-                elif aligner == "bowtie2":
-                    aligner_cmd = [aligner,
-                                   "--no-unal",
-                                   "-k", "10",
-                                   "-x", "brca"]
-                else:
-                    assert False
-                if simulation:
-                    if "test_id" in daehwan_debug:
-                        aligner_cmd += ["-f", "brca_input_1.fa"]
-                    else:
-                        aligner_cmd += ["-f",
-                                        "-1", "brca_input_1.fa",
-                                        "-2", "brca_input_2.fa"]
-                else:
-                    assert len(read_fname) in [1,2]
-                    aligner_cmd += ["-p", str(threads)]
-                    if len(read_fname) == 1:
-                        aligner_cmd += [read_fname[0]]
-                    else:
-                        aligner_cmd += ["-1", "%s" % read_fname[0],
-                                        "-2", "%s" % read_fname[1]]
-
-                align_proc = subprocess.Popen(aligner_cmd,
-                                              stdout=subprocess.PIPE,
-                                              stderr=open("/dev/null", 'w'))
-
-                sambam_cmd = ["samtools",
-                              "view",
-                              "-bS",
-                              "-"]
-                sambam_proc = subprocess.Popen(sambam_cmd,
-                                               stdin=align_proc.stdout,
-                                               stdout=open("brca_input_unsorted.bam", 'w'),
-                                               stderr=open("/dev/null", 'w'))
-                sambam_proc.communicate()
-                if index_type == "graph":
-                    bamsort_cmd = ["samtools",
-                                   "sort",
-                                   "brca_input_unsorted.bam",
-                                   "brca_input"]
-                    bamsort_proc = subprocess.Popen(bamsort_cmd,
-                                                    stderr=open("/dev/null", 'w'))
-                    bamsort_proc.communicate()
-
-                    bamindex_cmd = ["samtools",
-                                    "index",
-                                    "brca_input.bam"]
-                    bamindex_proc = subprocess.Popen(bamindex_cmd,
-                                                     stderr=open("/dev/null", 'w'))
-                    bamindex_proc.communicate()
-
-                    os.system("rm brca_input_unsorted.bam")            
-                else:
-                    os.system("mv brca_input_unsorted.bam brca_input.bam")
-
-            for tt in [0]:
-                ref_seq = BRCAs[gene]
-
-                # Read alignments
-                alignview_cmd = ["samtools",
-                                 "view"]
-                if alignment_fname == "":
-                    alignview_cmd += ["brca_input.bam"]
-                else:
-                    if not os.path.exists(alignment_fname + ".bai"):
-                        os.system("samtools index %s" % alignment_fname)
-                    alignview_cmd += [alignment_fname]
-                base_locus = 0
-                if index_type == "graph":
-                    if reference_type == "gene":
-                        alignview_cmd += ["%s" % gene]
-                    else:
-                        assert reference_type in ["chromosome", "genome"]
-                        _, chr, left, right = refHLA_loci[gene]
-                        base_locus = left
-                        alignview_cmd += ["%s:%d-%d" % (chr, left + 1, right + 1)]
-
-                    bamview_proc = subprocess.Popen(alignview_cmd,
-                                                    stdout=subprocess.PIPE,
-                                                    stderr=open("/dev/null", 'w'))
-
-                    sort_read_cmd = ["sort", "-k", "1", "-n"]
-                    alignview_proc = subprocess.Popen(sort_read_cmd,
-                                                      stdin=bamview_proc.stdout,
-                                                      stdout=subprocess.PIPE,
-                                                      stderr=open("/dev/null", 'w'))
-                else:
-                    alignview_proc = subprocess.Popen(alignview_cmd,
-                                                 stdout=subprocess.PIPE,
-                                                 stderr=open("/dev/null", 'w'))
-
-                # Count alleles
-                var_test_counts = {}
-                num_reads, total_read_len = 0, 0
-                prev_read_id = None
-                if index_type == "graph":
-                    # Cigar regular expression
-                    cigar_re = re.compile('\d+\w')
-                    for line in alignview_proc.stdout:
-                        cols = line.strip().split()
-                        read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
-                        read_seq = cols[9]
-                        num_reads += 1
-                        total_read_len += len(read_seq)
-                        flag, pos = int(flag), int(pos)
-                        pos -= (base_locus + 1)
-                        if pos < 0:
-                            continue
-
-                        if flag & 0x4 != 0:
-                            continue
-
-                        NM, Zs, MD = "", "", ""
-                        for i in range(11, len(cols)):
-                            col = cols[i]
-                            if col.startswith("Zs"):
-                                Zs = col[5:]
-                            elif col.startswith("MD"):
-                                MD = col[5:]
-                            elif col.startswith("NM"):
-                                NM = int(col[5:])
-
-                        if NM > num_mismatch:
-                            continue
-
-                        vars = []
-                        if Zs:
-                            vars = Zs.split(',')
-
-                        assert MD != ""
-                        MD_str_pos, MD_len = 0, 0
-                        read_pos, left_pos = 0, pos
-                        right_pos = left_pos
-                        cigars = cigar_re.findall(cigar_str)
-                        cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-                        cmp_list = []
-                        for i in range(len(cigars)):
-                            cigar_op, length = cigars[i]
-                            if cigar_op == 'M':
-                                # Update coverage
-                                if enable_coverage:
-                                    if right_pos + length < len(coverage):
-                                        coverage[right_pos] += 1
-                                        coverage[right_pos + length] -= 1
-                                    elif right_pos < len(coverage):
-                                        coverage[right_pos] += 1
-                                        coverage[-1] -= 1
-
-                                first = True
-                                MD_len_used = 0
-                                while True:
-                                    if not first or MD_len == 0:
-                                        if MD[MD_str_pos].isdigit():
-                                            num = int(MD[MD_str_pos])
-                                            MD_str_pos += 1
-                                            while MD_str_pos < len(MD):
-                                                if MD[MD_str_pos].isdigit():
-                                                    num = num * 10 + int(MD[MD_str_pos])
-                                                    MD_str_pos += 1
-                                                else:
-                                                    break
-                                            MD_len += num
-                                    # Insertion or full match followed
-                                    if MD_len >= length:
-                                        MD_len -= length
-                                        cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
-                                        break
-                                    first = False
-                                    read_base = read_seq[read_pos + MD_len]
-                                    MD_ref_base = MD[MD_str_pos]
-                                    MD_str_pos += 1
-                                    assert MD_ref_base in "ACGT"
-                                    cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])
-                                    cmp_list.append(["mismatch", right_pos + MD_len, 1])
-                                    MD_len_used = MD_len + 1
-                                    MD_len += 1
-                                    # Full match
-                                    if MD_len == length:
-                                        MD_len = 0
-                                        break
-                            elif cigar_op == 'I':
-                                cmp_list.append(["insertion", right_pos, length])
-                            elif cigar_op == 'D':
-                                if MD[MD_str_pos] == '0':
-                                    MD_str_pos += 1
-                                assert MD[MD_str_pos] == '^'
-                                MD_str_pos += 1
-                                while MD_str_pos < len(MD):
-                                    if not MD[MD_str_pos] in "ACGT":
-                                        break
-                                    MD_str_pos += 1
-                                cmp_list.append(["deletion", right_pos, length])
-                            elif cigar_op == 'S':
-                                cmp_list.append(["soft", right_pos, length])
-                            else:                    
-                                assert cigar_op == 'N'
-                                cmp_list.append(["intron", right_pos, length])
-
-                            if cigar_op in "MND":
-                                right_pos += length
-
-                            if cigar_op in "MIS":
-                                read_pos += length
-
-                        if right_pos > len(ref_seq):
-                            continue
-
-                        # Decide which allele(s) a read most likely came from
-                        # also sanity check - read length, cigar string, and MD string
-                        def add_count(var_id, num):
-                            if var_id not in var_test_counts:
-                                var_test_counts[var_id] = 0
-                            var_test_counts[var_id] += num
-                            
-                        for var_id, data in Vars[gene].items():
-                            var_type, var_pos, var_data = data
-                            if var_type != "deletion":
-                                continue
-                            if left_pos >= var_pos and right_pos <= var_pos + int(var_data):
-                                add_count(var_id, -1)                            
-                        ref_pos, read_pos, cmp_cigar_str, cmp_MD = left_pos, 0, "", ""
-                        cigar_match_len, MD_match_len = 0, 0            
-                        for cmp in cmp_list:
-                            type = cmp[0]
-                            length = cmp[2]
-                            if type == "match":
-                                var_idx = lower_bound(Var_list[gene], ref_pos)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if ref_pos + length <= var_pos:
-                                        break
-                                    if ref_pos <= var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "insertion":
-                                            if ref_pos < var_pos and ref_pos + length > var_pos + len(var_data):
-                                                add_count(var_id, -1)
-                                        elif var_type == "deletion":
-                                            del_len = int(var_data)
-                                            if ref_pos < var_pos and ref_pos + length > var_pos + del_len:
-                                                # Check if this might be one of the two tandem repeats (the same left coordinate)
-                                                cmp_left, cmp_right = cmp[1], cmp[1] + cmp[2]
-                                                test1_seq1 = ref_seq[cmp_left:cmp_right]
-                                                test1_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos + del_len:cmp_right + del_len]
-                                                # Check if this happens due to small repeats (the same right coordinate - e.g. 19 times of TTTC in DQA1*05:05:01:02)
-                                                cmp_left -= read_pos
-                                                cmp_right += (len(read_seq) - read_pos - cmp[2])
-                                                test2_seq1 = ref_seq[cmp_left+int(var_data):cmp_right]
-                                                test2_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos+int(var_data):cmp_right]
-                                                if test1_seq1 != test1_seq2 and test2_seq1 != test2_seq2:
-                                                    add_count(var_id, -1)
-                                        else:
-                                            add_count(var_id, -1)
-                                    var_idx += 1
-
-                                read_pos += length
-                                ref_pos += length
-                                cigar_match_len += length
-                                MD_match_len += length
-                            elif type == "mismatch":
-                                read_base = read_seq[read_pos]
-                                var_idx = lower_bound(Var_list[gene], ref_pos)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if ref_pos < var_pos:
-                                        break
-                                    if ref_pos == var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "single":
-                                            if var_data == read_base:
-                                                add_count(var_id, 1)
-                                    var_idx += 1
-
-                                cmp_MD += ("%d%s" % (MD_match_len, ref_seq[ref_pos]))
-                                MD_match_len = 0
-                                cigar_match_len += 1
-                                read_pos += 1
-                                ref_pos += 1
-                            elif type == "insertion":
-                                ins_seq = read_seq[read_pos:read_pos+length]
-                                var_idx = lower_bound(Var_list[gene], ref_pos)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if ref_pos < var_pos:
-                                        break
-                                    if ref_pos == var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "insertion":                                
-                                            if var_data == ins_seq:
-                                                add_count(var_id, 1)
-                                    var_idx += 1
-
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                read_pos += length
-                                cmp_cigar_str += ("%dI" % length)
-                            elif type == "deletion":
-                                del_len = length
-                                # Deletions can be shifted bidirectionally
-                                temp_ref_pos = ref_pos
-                                while temp_ref_pos > 0:
-                                    last_bp = ref_seq[temp_ref_pos + del_len - 1]
-                                    prev_bp = ref_seq[temp_ref_pos - 1]
-                                    if last_bp != prev_bp:
-                                        break
-                                    temp_ref_pos -= 1
-                                var_idx = lower_bound(Var_list[gene], temp_ref_pos)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if temp_ref_pos < var_pos:
-                                        first_bp = ref_seq[temp_ref_pos]
-                                        next_bp = ref_seq[temp_ref_pos + del_len]
-                                        if first_bp == next_bp:
-                                            temp_ref_pos += 1
-                                            continue
-                                        else:
-                                            break
-                                    if temp_ref_pos == var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "deletion":
-                                            var_len = int(var_data)
-                                            if var_len == length:
-                                                add_count(var_id, 1)
-                                    var_idx += 1
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                cmp_MD += ("%d" % MD_match_len)
-                                MD_match_len = 0
-                                cmp_cigar_str += ("%dD" % length)
-                                cmp_MD += ("^%s" % ref_seq[ref_pos:ref_pos+length])
-                                ref_pos += length
-                            elif type == "soft":
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                read_pos += length
-                                cmp_cigar_str += ("%dS" % length)
-                            else:
-                                assert type == "intron"
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                cmp_cigar_str += ("%dN" % length)
-                                ref_pos += length                    
-                        if cigar_match_len > 0:
-                            cmp_cigar_str += ("%dM" % cigar_match_len)
-                        cmp_MD += ("%d" % MD_match_len)
-                        if read_pos != len(read_seq) or \
-                                cmp_cigar_str != cigar_str or \
-                                cmp_MD != MD:
-                            print >> sys.stderr, "Error:", cigar_str, MD
-                            print >> sys.stderr, "\tcomputed:", cmp_cigar_str, cmp_MD
-                            print >> sys.stderr, "\tcmp list:", cmp_list
-                            assert False
-
-                        prev_read_id = read_id
-
-                    if num_reads <= 0:
-                        continue
-
-                for var_id, count in var_test_counts.items():
-                    if count <= 0:
-                        continue
-                    print >> sys.stderr, "\t\t\t%s: %d" % (var_id, count)
-
-    if simulation:
-        for aligner_type, passed in test_passed.items():
-            print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
-    
-        
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='test HLA genotyping')
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
-                        type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome (default: gene)")
-    parser.add_argument("--brca-list",
-                        dest="brca_list",
-                        type=str,
-                        default="BRCA1,BRCA2",
-                        help="A comma-separated list of BRCA genes (default: BRCA1,BRCA2)")
-    parser.add_argument("--aligner-list",
-                        dest="aligners",
-                        type=str,
-                        default="hisat2.graph",
-                        help="A comma-separated list of aligners (default: hisat2.graph)")
-    parser.add_argument("--reads",
-                        dest="read_fname",
-                        type=str,
-                        default="",
-                        help="Fastq read file name")
-    parser.add_argument("--alignment",
-                        dest="alignment_fname",
-                        type=str,
-                        default="",
-                        help="BAM file name")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--simulate-interval",
-                        dest="simulate_interval",
-                        type=int,
-                        default=1,
-                        help="Reads simulated at every these base pairs (default: 1)")
-    parser.add_argument("--coverage",
-                        dest="coverage",
-                        action='store_true',
-                        help="Experimental purpose (assign reads based on coverage)")
-    parser.add_argument("--num-mismatch",
-                        dest="num_mismatch",
-                        type=int,
-                        default=0,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 0)")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument("--daehwan-debug",
-                        dest="daehwan_debug",
-                        type=str,
-                        default="",
-                        help="e.g., test_id:10,read_id:10000,basic_test")
-
-    args = parser.parse_args()
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
-        sys.exit(1)
-    args.brca_list = args.brca_list.split(',')
-    if args.aligners == "":
-        print >> sys.stderr, "Error: --aligners must be non-empty."
-        sys.exit(1)    
-    args.aligners = args.aligners.split(',')
-    for i in range(len(args.aligners)):
-        args.aligners[i] = args.aligners[i].split('.')
-    if args.read_fname:
-        args.read_fname = args.read_fname.split(',')
-    else:
-        args.read_fname = []
-    if args.alignment_fname != "" and \
-            not os.path.exists(args.alignment_fname):
-        print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname
-        sys.exit(1)
-    daehwan_debug = {}
-    if args.daehwan_debug != "":
-        for item in args.daehwan_debug.split(','):
-            if ':' in item:
-                key, value = item.split(':')
-                daehwan_debug[key] = value
-            else:
-                daehwan_debug[item] = 1
-
-    random.seed(1)
-    test_BRCA_genotyping(args.reference_type,
-                         args.brca_list,
-                         args.aligners,
-                         args.read_fname,
-                         args.alignment_fname,
-                         args.threads,
-                         args.simulate_interval,
-                         args.coverage,
-                         args.num_mismatch,
-                         args.verbose,
-                         daehwan_debug)
diff --git a/hisat2_test_HLA_genotyping.py b/hisat2_test_HLA_genotyping.py
deleted file mode 100755
index dca5f61..0000000
--- a/hisat2_test_HLA_genotyping.py
+++ /dev/null
@@ -1,2699 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2015, Daehwan Kim <infphilo at gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import inspect, random
-import math
-from datetime import datetime, date, time
-from argparse import ArgumentParser, FileType
-from hisat2_modules import assembly_graph
-
-
-"""
-"""
-def simulate_reads(HLAs,
-                   test_HLA_list,
-                   Vars,
-                   Links,
-                   simulate_interval = 1,
-                   perbase_errorrate = 0.0):
-    HLA_reads_1, HLA_reads_2 = [], []
-    num_pairs = []
-    for test_HLA_names in test_HLA_list:
-        gene = test_HLA_names[0].split('*')[0]
-        num_pairs.append([])
-
-        # Simulate reads from two HLA alleles
-        def simulate_reads_impl(seq,
-                                seq_map,
-                                ex_seq,
-                                ex_desc,
-                                simulate_interval = 1,
-                                perbase_errorrate = 0.0,
-                                frag_len = 250,
-                                read_len = 100):
-            # Introduce sequencing errors
-            def introduce_seq_err(read_seq, pos):
-                read_seq = list(read_seq)
-                for i in range(read_len):
-                    map_pos = seq_map[pos + i]
-                    if ex_desc[map_pos] != "":
-                        continue
-                    if random.random() * 100 < perbase_errorrate:
-                        if read_seq[i] == 'A':
-                            alt_bases = ['C', 'G', 'T']
-                        elif read_seq[i] == 'C':
-                            alt_bases = ['A', 'G', 'T']
-                        elif read_seq[i] == 'G':
-                            alt_bases = ['A', 'C', 'T']
-                        else:
-                            assert read_seq[i] == 'T'
-                            alt_bases = ['A', 'C', 'G']
-                        random.shuffle(alt_bases)
-                        alt_base = alt_bases[0]
-                        read_seq[i] = alt_base
-                read_seq = ''.join(read_seq)
-                return read_seq                            
-                            
-            # Get read alignment, e.g., 260|R_483_61M5D38M23D1M_46|S|hv154,3|S|hv162,10|D|hv185,38|D|hv266
-            def get_info(read_seq, pos):
-                info = "%d_" % (seq_map[pos] + 1)
-                total_match, match, sub_match = 0, 0, 0
-                var_str = ""
-                for i in range(pos, pos + read_len):
-                    map_i = seq_map[i]
-                    assert ex_seq[map_i] != 'D'
-                    total_match += 1
-                    match += 1
-                    if ex_desc[map_i] != "" or read_seq[i-pos] != ex_seq[map_i]:
-                        if var_str != "":
-                            var_str += ','
-                        var_str += ("%d|S|%s" % (sub_match, ex_desc[map_i] if ex_desc[map_i] != "" else "unknown"))
-                        sub_match = 0
-                    else:
-                        sub_match += 1
-                    if i + 1 < pos + read_len and ex_seq[map_i+1] == 'D':
-                        assert match > 0
-                        info += ("%dM" % match)
-                        match = 0
-                        del_len = 1
-                        while map_i + 1 + del_len < len(ex_seq):
-                            if ex_seq[map_i + 1 + del_len] != 'D':
-                                break
-                            del_len += 1
-                        info += ("%dD" % del_len)
-                        if var_str != "":
-                            var_str += ','
-                        var_str += ("%s|D|%s" % (sub_match, ex_desc[map_i + 1]))
-                        sub_match = 0
-                assert match > 0
-                info += ("%dM" % match)
-                assert total_match == read_len
-                if var_str:
-                    info += "_"
-                    info += var_str                
-                return info
-                
-            comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
-            reads_1, reads_2 = [], []
-            for i in range(0, len(seq) - frag_len + 1, simulate_interval):
-                pos1 = i
-                seq1 = seq[pos1:pos1+read_len]
-                if perbase_errorrate > 0.0:
-                    seq1 = introduce_seq_err(seq1, pos1)
-                info1 = get_info(seq1, pos1)
-                reads_1.append([seq1, info1])
-                
-                pos2 = i + frag_len - read_len
-                seq2 = seq[pos2:pos2+read_len]
-                if perbase_errorrate > 0.0:
-                    seq2 = introduce_seq_err(seq2, pos2)                
-                info2 = get_info(seq2, pos2)
-                tmp_read_2 = reversed(seq2)
-                read_2 = ""
-                for s in tmp_read_2:
-                    if s in comp_table:
-                        read_2 += comp_table[s]
-                    else:
-                        read_2 += s
-                reads_2.append([read_2, info2])
-            return reads_1, reads_2
-
-        # for each allele in a list of alleles such as ['A*32:29', 'B*07:02:01']
-        for test_HLA_name in test_HLA_names:
-            HLA_seq = HLAs[gene][test_HLA_name]
-            HLA_ex_seq = list(HLAs[gene]["%s*BACKBONE" % gene])
-            HLA_ex_desc = [''] * len(HLA_ex_seq)
-            HLA_seq_map = [i for i in range(len(HLA_seq))]
-
-            # Extract variants included in the allele
-            vars = []
-            for var, allele_list in Links.items():
-                if test_HLA_name in allele_list:
-                    vars.append(var)
-
-            # Build annotated sequence for the allele w.r.t backbone sequence
-            for var in vars:
-                var_type, var_pos, var_data = Vars[gene][var]
-                if var_type == "single":
-                    HLA_ex_seq[var_pos] = var_data
-                    HLA_ex_desc[var_pos] = var
-                else:
-                    assert var_type == "deletion"
-                    del_len = int(var_data)
-                    assert var_pos + del_len <= len(HLA_ex_seq)
-                    HLA_ex_seq[var_pos:var_pos+del_len] = ['D'] * del_len
-                    HLA_ex_desc[var_pos:var_pos+del_len] = [var] * del_len
-            HLA_ex_seq = ''.join(HLA_ex_seq)
-
-            # Build mapping from the allele to the annotated sequence
-            prev_j = 0
-            for i in range(len(HLA_seq)):
-                for j in range(prev_j, len(HLA_ex_seq)):
-                    if HLA_ex_seq[j] != 'D':
-                        break
-                HLA_seq_map[i] = j
-                prev_j = j + 1
-            
-            tmp_reads_1, tmp_reads_2 = simulate_reads_impl(HLA_seq,
-                                                           HLA_seq_map,
-                                                           HLA_ex_seq,
-                                                           HLA_ex_desc,                                                           
-                                                           simulate_interval,
-                                                           perbase_errorrate)
-            HLA_reads_1 += tmp_reads_1
-            HLA_reads_2 += tmp_reads_2
-            num_pairs[-1].append(len(tmp_reads_1))
-
-    # Write reads into a fasta read file
-    def write_reads(reads, idx):
-        read_file = open('hla_input_%d.fa' % idx, 'w')
-        for read_i in range(len(reads)):
-            print >> read_file, ">%d|%s_%s" % (read_i + 1, "LR"[idx-1], reads[read_i][1])
-            print >> read_file, reads[read_i][0]
-        read_file.close()
-    write_reads(HLA_reads_1, 1)
-    write_reads(HLA_reads_2, 2)
-
-    return num_pairs
-
-
-"""
-Align reads, and sort the alignments into a BAM file
-"""
-def align_reads(ex_path,
-                aligner,
-                simulation,
-                index_type,
-                read_fname,
-                fastq,
-                threads,
-                out_fname,
-                verbose):
-    if aligner == "hisat2":
-        hisat2 = os.path.join(ex_path, "hisat2")
-        aligner_cmd = [hisat2, "--mm"]
-        if not simulation:
-            aligner_cmd += ["--no-unal"]            
-        # No detection of novel insertions and deletions
-        aligner_cmd += ["--rdg", "10000,10000"] # deletion
-        aligner_cmd += ["--rfg", "10000,10000"] # insertion
-        DNA = True
-        if DNA:
-            aligner_cmd += ["--no-spliced-alignment"] # no spliced alignment
-            # aligner_cmd += ["--min-intronlen", "100000"]
-        if index_type == "linear":
-            aligner_cmd += ["-k", "10"]
-        else:
-            aligner_cmd += ["--max-altstried", "64"]
-        aligner_cmd += ["-x", "hla.%s" % index_type]
-    elif aligner == "bowtie2":
-        aligner_cmd = [aligner,
-                       "--no-unal",
-                       "-k", "10",
-                       "-x", "hla"]
-    else:
-        assert False
-    assert len(read_fname) in [1,2]
-    aligner_cmd += ["-p", str(threads)]
-    if not fastq:
-        aligner_cmd += ["-f"]
-    if len(read_fname) == 1:
-        aligner_cmd += ["-U", read_fname[0]]
-    else:
-        aligner_cmd += ["-1", "%s" % read_fname[0],
-                        "-2", "%s" % read_fname[1]]
-    if verbose >= 1:
-        print >> sys.stderr, ' '.join(aligner_cmd)
-    align_proc = subprocess.Popen(aligner_cmd,
-                                  stdout=subprocess.PIPE,
-                                  stderr=open("/dev/null", 'w'))
-
-    sambam_cmd = ["samtools",
-                  "view",
-                  "-bS",
-                  "-"]
-    sambam_proc = subprocess.Popen(sambam_cmd,
-                                   stdin=align_proc.stdout,
-                                   stdout=open(out_fname + ".unsorted", 'w'),
-                                   stderr=open("/dev/null", 'w'))
-    sambam_proc.communicate()
-    if index_type == "graph":
-        bamsort_cmd = ["samtools",
-                       "sort",
-                       out_fname + ".unsorted",
-                       "-o", out_fname]
-        bamsort_proc = subprocess.Popen(bamsort_cmd,
-                                        stderr=open("/dev/null", 'w'))
-        bamsort_proc.communicate()
-
-        bamindex_cmd = ["samtools",
-                        "index",
-                        out_fname]
-        bamindex_proc = subprocess.Popen(bamindex_cmd,
-                                         stderr=open("/dev/null", 'w'))
-        bamindex_proc.communicate()
-
-    os.system("rm %s" % (out_fname + ".unsorted"))            
-
-
-"""
-""" 
-def normalize(prob):
-    total = sum(prob.values())
-    for allele, mass in prob.items():
-        prob[allele] = mass / total
-
-        
-"""
-"""
-def prob_diff(prob1, prob2):
-    diff = 0.0
-    for allele in prob1.keys():
-        if allele in prob2:
-            diff += abs(prob1[allele] - prob2[allele])
-        else:
-            diff += prob1[allele]
-    return diff
-
-
-"""
-"""
-def HLA_prob_cmp(a, b):
-    if a[1] != b[1]:
-        if a[1] < b[1]:
-            return 1
-        else:
-            return -1
-    assert a[0] != b[0]
-    if a[0] < b[0]:
-        return -1
-    else:
-        return 1
-
-
-"""
-"""
-def single_abundance(HLA_cmpt,
-                     HLA_length):
-    def normalize2(prob, length):
-        total = 0
-        for allele, mass in prob.items():
-            assert allele in length
-            total += (mass / length[allele])
-        for allele, mass in prob.items():
-            assert allele in length
-            prob[allele] = mass / length[allele] / total
-
-    HLA_prob, HLA_prob_next = {}, {}
-    for cmpt, count in HLA_cmpt.items():
-        alleles = cmpt.split('-')
-        for allele in alleles:
-            if allele not in HLA_prob:
-                HLA_prob[allele] = 0.0
-            HLA_prob[allele] += (float(count) / len(alleles))
-
-    normalize(HLA_prob)
-    def next_prob(HLA_cmpt, HLA_prob, HLA_length):
-        HLA_prob_next = {}
-        for cmpt, count in HLA_cmpt.items():
-            alleles = cmpt.split('-')
-            alleles_prob = 0.0
-            for allele in alleles:
-                if allele not in HLA_prob:
-                    continue
-                alleles_prob += HLA_prob[allele]
-            if alleles_prob <= 0.0:
-                continue
-            for allele in alleles:
-                if allele not in HLA_prob:
-                    continue
-                if allele not in HLA_prob_next:
-                    HLA_prob_next[allele] = 0.0
-                HLA_prob_next[allele] += (float(count) * HLA_prob[allele] / alleles_prob)
-        normalize(HLA_prob_next)
-        return HLA_prob_next
-
-    diff, iter = 1.0, 0
-    while diff > 0.0001 and iter < 1000:
-        HLA_prob_next = next_prob(HLA_cmpt, HLA_prob, HLA_length)
-        diff = prob_diff(HLA_prob, HLA_prob_next)
-        HLA_prob = HLA_prob_next
-
-        if iter >= 10:
-            HLA_prob2 = {}
-            for allele, prob in HLA_prob.items():
-                if prob >= 0.005:
-                    HLA_prob2[allele] = prob
-            HLA_prob = HLA_prob2
-
-        # DK - debugging purposes
-        if iter % 10 == 0 and False:
-            print "iter", iter
-            for allele, prob in HLA_prob.items():
-                if prob >= 0.01:
-                    print >> sys.stderr, "\t", iter, allele, prob, str(datetime.now())
-        
-        iter += 1
-        
-    """
-    for allele, prob in HLA_prob.items():
-        allele_len = HLA_length[allele]
-        HLA_prob[allele] /= float(allele_len)
-    """
-    
-    normalize(HLA_prob)
-    HLA_prob = [[allele, prob] for allele, prob in HLA_prob.items()]
-    HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-    return HLA_prob
-
-    
-"""
-"""
-def joint_abundance(HLA_cmpt,
-                    HLA_length):
-    allele_names = set()
-    for cmpt in HLA_cmpt.keys():
-        allele_names |= set(cmpt.split('-'))
-    
-    HLA_prob, HLA_prob_next = {}, {}
-    for cmpt, count in HLA_cmpt.items():
-        alleles = cmpt.split('-')
-        for allele1 in alleles:
-            for allele2 in allele_names:
-                if allele1 < allele2:
-                    allele_pair = "%s-%s" % (allele1, allele2)
-                else:
-                    allele_pair = "%s-%s" % (allele2, allele1)
-                if not allele_pair in HLA_prob:
-                    HLA_prob[allele_pair] = 0.0
-                HLA_prob[allele_pair] += (float(count) / len(alleles))
-
-    if len(HLA_prob) <= 0:
-        return HLA_prob
-
-    # Choose top allele pairs
-    def choose_top_alleles(HLA_prob):
-        HLA_prob_list = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-        HLA_prob_list = sorted(HLA_prob_list, cmp=HLA_prob_cmp)
-        HLA_prob = {}
-        best_prob = HLA_prob_list[0][1]
-        for i in range(len(HLA_prob_list)):
-            allele_pair, prob = HLA_prob_list[i]
-            if prob * 2 <= best_prob:
-                break                        
-            HLA_prob[allele_pair] = prob
-        normalize(HLA_prob)
-        return HLA_prob
-    HLA_prob = choose_top_alleles(HLA_prob)
-
-    def next_prob(HLA_cmpt, HLA_prob):
-        HLA_prob_next = {}
-        for cmpt, count in HLA_cmpt.items():
-            alleles = cmpt.split('-')
-            prob = 0.0
-            for allele in alleles:
-                for allele_pair in HLA_prob.keys():
-                    if allele in allele_pair:
-                        prob += HLA_prob[allele_pair]
-            for allele in alleles:
-                for allele_pair in HLA_prob.keys():
-                    if not allele in allele_pair:
-                        continue
-                    if allele_pair not in HLA_prob_next:
-                        HLA_prob_next[allele_pair] = 0.0
-                    HLA_prob_next[allele_pair] += (float(count) * HLA_prob[allele_pair] / prob)
-        normalize(HLA_prob_next)
-        return HLA_prob_next
-
-    diff, iter = 1.0, 0
-    while diff > 0.0001 and iter < 1000:
-        HLA_prob_next = next_prob(HLA_cmpt, HLA_prob)
-        diff = prob_diff(HLA_prob, HLA_prob_next)
-        HLA_prob = HLA_prob_next
-        HLA_prob = choose_top_alleles(HLA_prob)
-        iter += 1
-
-    HLA_prob = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-    HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-    return HLA_prob
-
-
-"""
-"""
-def lower_bound(Var_list, pos):
-    low, high = 0, len(Var_list)
-    while low < high:
-        m = (low + high) / 2
-        m_pos = Var_list[m][0]
-        if m_pos < pos:
-            low = m + 1
-        elif m_pos > pos:
-            high = m
-        else:
-            assert m_pos == pos
-            while m > 0:
-                if Var_list[m-1][0] < pos:
-                    break
-                m -= 1
-            return m
-    return low
-
-
-"""
-   var: ['single', 3300, 'G']
-   exons: [[301, 373], [504, 822], [1084, 1417], [2019, 2301], [2404, 2520], [2965, 2997], [3140, 3187], [3357, 3361]]
-"""
-def var_in_exon(var, exons):
-    exonic = False
-    var_type, var_left, var_data = var
-    var_right = var_left
-    if var_type == "deletion":
-        var_right = var_left + int(var_data) - 1
-    for exon_left, exon_right in exons:
-        if var_left >= exon_left and var_right <= exon_right:
-            return True
-    return False
-
-
-"""
-Report variant IDs whose var is within exonic regions
-"""
-def get_exonic_vars(Vars, exons):
-    vars = set()
-    for var_id, var in Vars.items():
-        var_type, var_left, var_data = var
-        var_right = var_left
-        if var_type == "deletion":
-            var_right = var_left + int(var_data) - 1
-        for exon_left, exon_right in exons:
-            if var_left >= exon_left and var_right <= exon_right:
-                vars.add(var_id)
-    return vars
-
-
-"""
-Get representative alleles among those that share the same exonic sequences
-"""
-def get_rep_alleles(Links, exon_vars):
-    allele_vars = {}
-    for var, alleles in Links.items():
-        if var not in exon_vars:
-            continue
-        for allele in alleles:
-            if allele not in allele_vars:
-                allele_vars[allele] = set()
-            allele_vars[allele].add(var)
-
-    allele_groups = {}
-    for allele, vars in allele_vars.items():
-        vars = '-'.join(vars)
-        if vars not in allele_groups:
-            allele_groups[vars] = []
-        allele_groups[vars].append(allele)
-
-    allele_reps = {} # allele representatives
-    allele_rep_groups = {} # allele groups by allele representatives
-    for allele_members in allele_groups.values():
-        assert len(allele_members) > 0
-        allele_rep = allele_members[0]
-        allele_rep_groups[allele_rep] = allele_members
-        for allele_member in allele_members:
-            assert allele_member not in allele_reps
-            allele_reps[allele_member] = allele_rep
-
-    return allele_reps, allele_rep_groups
-    
-
-"""
-Identify alternative alignments
-"""
-def get_alternatives(ref_seq, Vars, Var_list, verbose):
-    # Check deletions' alternatives
-    def get_alternatives_recur(ref_seq,
-                               Vars,
-                               Var_list,
-                               Alts,
-                               var_id,
-                               left,
-                               alt_list,
-                               var_j,
-                               latest_pos,
-                               debug = False):
-        def add_alt(Alts, alt_list, var_id, j_id):
-            if j_id.isdigit():
-                if var_id not in Alts:
-                    Alts[var_id] = [["1"]]
-                else:
-                    if Alts[var_id][-1][-1].isdigit():
-                        Alts[var_id][-1][-1] = str(int(Alts[var_id][-1][-1]) + 1)
-                    else:
-                        Alts[var_id][-1].append("1")
-            else:
-                if var_id not in Alts:
-                    Alts[var_id] = [[j_id]]
-                else:
-                    if Alts[var_id][-1][-1].isdigit():
-                        Alts[var_id][-1][-1] = j_id
-                    else:
-                        Alts[var_id][-1].append(j_id)
-                Alts[var_id][-1].append("0")
-                        
-            if not j_id.isdigit():
-                alt_list.append(j_id)
-                alts = '-'.join(alt_list)
-                if alts not in Alts:
-                    Alts[alts] = [[var_id]]
-                else:
-                    Alts[alts].append([var_id])
-                
-        var_type, var_pos, var_data = Vars[var_id]
-        if left: # Look in left direction
-            if var_j < 0:
-                return
-            j_pos, j_id = Var_list[var_j]
-            alt_del = []
-            if var_id != j_id and j_pos < var_pos + del_len:
-                # Check bases between SNPs
-                while latest_pos > j_pos:
-                    if debug: print latest_pos - 1, ref_seq[latest_pos - 1], latest_pos - 1 - del_len, ref_seq[latest_pos - 1 - del_len]
-                    if ref_seq[latest_pos - 1] != ref_seq[latest_pos - 1 - del_len]:
-                        break
-                    latest_pos -= 1
-                    add_alt(Alts, alt_list, var_id, str(latest_pos))
-                if latest_pos - 1 > j_pos:
-                    return
-                if j_pos == latest_pos - 1:
-                    j_type, _, j_data = Vars[j_id]
-                    if j_type == "single":
-                        if debug: print Vars[j_id]
-                        off = var_pos + del_len - j_pos
-                        if debug: print var_pos - off, ref_seq[var_pos - off]
-                        if debug: print j_pos, ref_seq[j_pos]
-                        if j_data == ref_seq[var_pos - off]:
-                            add_alt(Alts, alt_list, var_id, j_id)
-                            latest_pos = j_pos
-                    elif j_type == "deletion":
-                        j_del_len = int(j_data)
-                        if var_pos < j_pos and var_pos + del_len >= j_pos + j_del_len:
-                            alt_list2 = alt_list[:] + [j_id]
-                            latest_pos2 = j_pos
-                            alt_del = [alt_list2, latest_pos2]
-                
-            get_alternatives_recur(ref_seq,
-                                   Vars,
-                                   Var_list,
-                                   Alts,
-                                   var_id,
-                                   left,
-                                   alt_list,
-                                   var_j - 1,
-                                   latest_pos,
-                                   debug)
-
-            if alt_del:
-                alt_list2, latest_pos2 = alt_del
-                if var_id not in Alts:
-                    Alts[var_id] = [alt_list2[:]]
-                else:
-                    Alts[var_id].append(alt_list2[:])
-                alt_idx = len(Alts[var_id]) - 1
-                get_alternatives_recur(ref_seq,
-                                       Vars,
-                                       Var_list,
-                                       Alts,
-                                       var_id,
-                                       left,
-                                       alt_list2,
-                                       var_j - 1,
-                                       latest_pos2,
-                                       debug)
-                # Remove this Deletion if not supported by additional bases?
-                assert alt_idx < len(Alts[var_id])
-                # DK - for debugging purposes
-                if Alts[var_id][alt_idx][-1] == j_id:
-                    Alts[var_id] = Alts[var_id][:alt_idx] + Alts[var_id][alt_idx+1:]
-              
-        else: # Look in right direction
-            if var_j >= len(Var_list):
-                return
-            j_pos, j_id = Var_list[var_j]
-            alt_del = []
-            if var_id != j_id and j_pos >= var_pos:
-                # Check bases between SNPs
-                while latest_pos < j_pos:
-                    if ref_seq[latest_pos + 1] != ref_seq[var_pos + del_len - 1 - (latest_pos - var_pos)]:
-                        break
-                    latest_pos += 1
-                    add_alt(Alts, alt_list, var_id, str(latest_pos))
-                if latest_pos + 1 < j_pos:
-                    return
-                if j_pos == latest_pos + 1:
-                    j_type, _, j_data = Vars[j_id]
-                    if j_type == "single":
-                        if debug: print Vars[j_id]
-                        off = j_pos - var_pos
-                        if debug: print var_pos + off, ref_seq[var_pos + off]
-                        if debug: print var_pos + del_len + off, ref_seq[var_pos + del_len + off]
-
-                        # DK - for debugging purposes
-                        if var_pos + del_len + off >= len(ref_seq):
-                            print >> sys.stderr, var_id, var
-                            print >> sys.stderr, "var_pos: %d, del_len: %d, off: %d" % (var_pos, del_len, off)
-                            print >> sys.stderr, "ref_seq: %d, %d" % (len(ref_seq), var_pos + del_len + off)
-                            sys.exit(1)
-                        
-                        if j_data == ref_seq[var_pos + del_len + off]:
-                            add_alt(Alts, alt_list, var_id, j_id)
-                            latest_pos = j_pos
-                    elif j_type == "deletion":
-                        j_del_len = int(j_data)
-                        if j_pos + j_del_len < var_pos + del_len:
-                            alt_list2 = alt_list[:] + [j_id]
-                            latest_pos2 = j_pos + j_del_len - 1
-                            alt_del = [alt_list2, latest_pos2]
-
-            get_alternatives_recur(ref_seq,
-                                   Vars,
-                                   Var_list,
-                                   Alts,
-                                   var_id,
-                                   left,
-                                   alt_list,
-                                   var_j + 1,
-                                   latest_pos,
-                                   debug)
-
-            if alt_del:
-                alt_list2, latest_pos2 = alt_del
-                if var_id not in Alts:
-                    Alts[var_id] = [alt_list2[:]]
-                else:
-                    Alts[var_id].append(alt_list2[:])
-                alt_idx = len(Alts[var_id]) - 1
-                get_alternatives_recur(ref_seq,
-                                       Vars,
-                                       Var_list,
-                                       Alts,
-                                       var_id,
-                                       left,
-                                       alt_list2,
-                                       var_j + 1,
-                                       latest_pos2,
-                                       debug)
-                # Remove this Deletion if not supported by additional bases?
-                assert alt_idx < len(Alts[var_id])
-                if Alts[var_id][alt_idx][-1] == j_id:
-                    Alts[var_id] = Alts[var_id][:alt_idx] + Alts[var_id][alt_idx+1:]
-
-    # Check deletions' alternatives
-    Alts_left, Alts_right = {}, {}
-    for var_i, var_id in Var_list:
-        var_type, var_pos, var_data = var = Vars[var_id]
-        if var_type != "deletion" or var_pos == 0:
-            continue
-        del_len = int(var_data)
-        if var_pos + del_len >= len(ref_seq):
-            assert var_pos + del_len == len(ref_seq)
-            continue
-        debug = (var_id == "hv1096a")
-        if debug:
-            print Vars[var_id]
-
-        alt_list = []
-        var_j = lower_bound(Var_list, var_pos + del_len - 1)
-        latest_pos = var_pos + del_len
-        if var_j < len(Var_list):
-            get_alternatives_recur(ref_seq,
-                                   Vars,
-                                   Var_list,
-                                   Alts_left,
-                                   var_id,
-                                   True, # left
-                                   alt_list,
-                                   var_j,
-                                   latest_pos,
-                                   debug)
-        alt_list = []
-        var_j = lower_bound(Var_list, var_pos)
-        latest_pos = var_pos - 1
-        assert var_j >= 0
-        get_alternatives_recur(ref_seq,
-                               Vars,
-                               Var_list,
-                               Alts_right,
-                               var_id,
-                               False, # right
-                               alt_list,
-                               var_j,
-                               latest_pos,
-                               debug)
-
-        if debug:
-            print "DK :-)"
-            sys.exit(1)
-
-    def debug_print_alts(Alts, dir):
-        for alt_list1, alt_list2 in Alts.items():
-            print "\t", dir, ":", alt_list1, alt_list2
-            out_str = "\t\t"
-            alt_list1 = alt_list1.split('-')
-            for i in range(len(alt_list1)):
-                alt = alt_list1[i]
-                var_type, var_pos, var_data = Vars[alt]
-                out_str += ("%s-%d-%s" % (var_type, var_pos, var_data))
-                if i + 1 < len(alt_list1):
-                    out_str += " "
-            for i in range(len(alt_list2)):
-                alt_list3 = alt_list2[i]
-                out_str += "\t["
-                for j in range(len(alt_list3)):
-                    alt = alt_list3[j]
-                    if alt.isdigit():
-                        out_str += alt
-                    else:
-                        var_type, var_pos, var_data = Vars[alt]
-                        out_str += ("%s-%d-%s" % (var_type, var_pos, var_data))
-                    if j + 1 < len(alt_list3):
-                        out_str += ", "
-                out_str += "]"
-            print out_str
-    if verbose >= 2: debug_print_alts(Alts_left, "left")
-    if verbose >= 2: debug_print_alts(Alts_right, "right")
-
-    return Alts_left, Alts_right
-
-
-"""
-Identify ambigious differences that may account for other alleles,
-  given a list of differences (cmp_list) between a read and a potential allele   
-"""
-def identify_ambigious_diffs(Vars, Alts_left, Alts_right, cmp_list, verbose):
-    cmp_left, cmp_right = 0, len(cmp_list) - 1
-    i = 0
-    while i < len(cmp_list):
-        cmp_i = cmp_list[i]
-        type, pos, length = cmp_i[:3]
-        # Check alternative alignments
-        if type in ["mismatch", "deletion"]:
-            var_id = cmp_i[3]
-            if var_id == "unknown":
-                i += 1
-                continue
-            
-            # Left direction
-            id_str = var_id
-            total_del_len = length if type == "deletion" else 0
-            for j in reversed(range(0, i)):
-                cmp_j = cmp_list[j]
-                j_type, j_pos, j_len = cmp_j[:3]
-                if j_type != "match":
-                    if len(cmp_j) < 4:
-                        continue
-                    j_var_id = cmp_j[3]
-                    id_str += ("-%s" % j_var_id)
-                    if j_type == "deletion":
-                        total_del_len += j_len
-            last_type, last_pos, last_len = cmp_list[0][:3]
-            assert last_type in ["match", "mismatch"]
-            left_pos = last_pos + total_del_len
-            if id_str in Alts_left:
-                orig_alts = id_str.split('-')
-                alts_list = Alts_left[id_str]
-                for alts in alts_list:
-                    if alts[-1].isdigit():
-                        assert type == "deletion"
-                        assert len(orig_alts) == 1
-                        alts_id_str = '-'.join(alts[:-1])
-                        alt_left_pos = pos
-                        alt_total_del_len = 0
-                        for alt in alts[:-1]:
-                            assert alt in Vars
-                            alt_type, alt_pos, alt_data = Vars[alt]
-                            alt_left_pos = alt_pos - 1
-                            if alt_type == "deletion":
-                                alt_total_del_len += int(alt_data)
-                        alt_left_pos = alt_left_pos + alt_total_del_len - int(alts[-1]) + 1
-                    else:
-                        alts_id_str = '-'.join(alts)
-                        assert alts_id_str in Alts_left
-                        for back_alts in Alts_left[alts_id_str]:
-                            back_id_str = '-'.join(back_alts)
-                            if back_id_str.find(id_str) != 0:
-                                continue
-                            assert len(orig_alts) < len(back_alts)
-                            assert back_alts[-1].isdigit()
-                            alt_left_pos = pos
-                            alt_total_del_len = 0
-                            for alt in back_alts[:len(orig_alts) + 1]:
-                                if alt.isdigit():
-                                    alt_left_pos = alt_left_pos - int(alt) + 1
-                                else:
-                                    assert alt in Vars
-                                    alt_type, alt_pos, alt_data = Vars[alt]
-                                    alt_left_pos = alt_pos - 1
-                                    if alt_type == "deletion":
-                                        alt_total_del_len += int(alt_data)
-                            alt_left_pos += alt_total_del_len
-                        if left_pos >= alt_left_pos:
-                            if verbose >= 2:
-                                print "LEFT:", cmp_list
-                                print "\t", type, "id_str:", id_str, "=>", alts_id_str, "=>", back_alts, "left_pos:", left_pos, "alt_left_pos:", alt_left_pos
-                            cmp_left = i + 1
-                            break
-    
-            # Right direction
-            if cmp_right + 1 == len(cmp_list):
-                id_str = var_id
-                total_del_len = length if type == "deletion" else 0
-                for j in range(i + 1, len(cmp_list)):
-                    cmp_j = cmp_list[j]
-                    j_type, j_pos, j_len = cmp_j[:3]
-                    if j_type != "match":
-                        if len(cmp_j) < 4:
-                            continue
-                        j_var_id = cmp_j[3]
-                        id_str += ("-%s" % j_var_id)
-                        if j_type == "deletion":
-                            total_del_len += j_len                        
-                last_type, last_pos, last_len = cmp_list[-1][:3]
-                assert last_type in ["match", "mismatch"]
-                right_pos = last_pos + last_len - 1 - total_del_len
-                if id_str in Alts_right:
-                    orig_alts = id_str.split('-')
-                    alts_list = Alts_right[id_str]
-                    for alts in alts_list:
-                        if alts[-1].isdigit():
-                            assert type == "deletion"
-                            assert len(orig_alts) == 1
-                            alts_id_str = '-'.join(alts[:-1])
-                            alt_right_pos = pos
-                            alt_total_del_len = 0
-                            for alt in alts[:-1]:
-                                assert alt in Vars
-                                alt_type, alt_pos, alt_data = Vars[alt]
-                                alt_right_pos = alt_pos
-                                if alt_type == "single":
-                                    alt_right_pos += 1
-                                else:
-                                    assert alt_type == "deletion"
-                                    alt_del_len = int(alt_data)
-                                    alt_right_pos += alt_del_len
-                                    alt_total_del_len += alt_del_len
-                            alt_right_pos = alt_right_pos - alt_total_del_len + int(alts[-1]) - 1
-                        else:
-                            alts_id_str = '-'.join(alts)
-                            assert alts_id_str in Alts_right
-                            for back_alts in Alts_right[alts_id_str]:
-                                back_id_str = '-'.join(back_alts)
-                                if back_id_str.find(id_str) != 0:
-                                    continue
-                                assert len(orig_alts) < len(back_alts)
-                                assert back_alts[-1].isdigit()
-                                alt_right_pos = pos
-                                alt_total_del_len = 0
-                                for alt in back_alts[:len(orig_alts) + 1]:
-                                    if alt.isdigit():
-                                        alt_right_pos = alt_right_pos + int(alt) - 1
-                                    else:
-                                        assert alt in Vars
-                                        alt_type, alt_pos, alt_data = Vars[alt]
-                                        alt_right_pos = alt_pos
-                                        if alt_type == "single":
-                                            alt_right_pos += 1
-                                        else:
-                                            assert alt_type == "deletion"
-                                            alt_del_len = int(alt_data)
-                                            alt_right_pos += alt_del_len
-                                            alt_total_del_len += alt_del_len
-                                alt_right_pos -= alt_total_del_len
-                                    
-                        if right_pos <= alt_right_pos:
-                            if verbose >= 2:
-                                print "RIGHT:", cmp_list
-                                print "\t", type, "id_str:", id_str, "=>", alts_id_str, "right_pos:", right_pos, "alt_right_pos:", alt_right_pos
-                            cmp_right = i - 1
-                            break
-        i += 1
-
-    return cmp_left, cmp_right
-
-
-"""
-Example,
-   gene_name, allele_name (input): A, A*32:01:01
-   allele (output): single-136-G-hv47,deletion-285-1-hv57, ... ,single-3473-T-hv1756,deletion-3495-1-hv1763,single-3613-C-hv1799 
-"""
-def get_allele(gene_name, allele_name, Vars, Var_list, Links):    
-    allele_haplotype = []
-    for _var_pos, _var_id in Var_list[gene_name]:
-        if allele_name in Links[_var_id]:
-            _var = Vars[gene_name][_var_id]
-            allele_haplotype.append("%s-%d-%s-%s" % (_var[0], _var[1], _var[2], _var_id))                                
-    allele_haplotype = ','.join(allele_haplotype)
-    return allele_haplotype
-
-
-"""
-"""
-def calculate_allele_coverage(allele_haplotype,
-                              N_haplotypes,
-                              exons,
-                              partial,
-                              exonic_only,
-                              output):
-    _var_count = {}
-    for read_haplotypes in N_haplotypes.values():
-        for read_haplotype in read_haplotypes:
-            if haplotype_cmp(allele_haplotype, read_haplotype) <= 0:
-                _, assembled = assemble_two_haplotypes(allele_haplotype.split(','), read_haplotype.split(','))
-            else:
-                _, assembled = assemble_two_haplotypes(read_haplotype.split(','), allele_haplotype.split(','))
-            read_vars = read_haplotype.split(',')
-            for read_var in read_vars:
-                _type, _left, _data, _id = read_var.split('-')
-                if _type in ["left", "right", "unknown"]:
-                    continue
-                if _id not in _var_count:
-                    _var_count[_id] = 1
-                else:
-                    _var_count[_id] += 1
-    total_var, covered_var = 0, 0
-    for allele_var in allele_haplotype.split(',')[1:-1]:
-        _type, _left, _data, _id = allele_var.split('-')
-        _left = int(_left)
-        _count = 0
-
-        if partial and \
-                exonic_only and \
-                not var_in_exon([_type, _left, _data], exons):
-            continue
-        
-        total_var += 1
-        if _id in _var_count:
-            _count = _var_count[_id]
-            covered_var += 1
-        if output:
-            print "\t %d %s %s (%s - %d)" % (_left, _type, _data, _id, _count)
-            
-    return covered_var, total_var
-
-
-"""
-"""
-def HLA_typing(ex_path,
-               simulation,
-               reference_type,
-               hla_list,
-               partial,
-               partial_alleles,
-               refHLAs,
-               HLAs,
-               HLA_names,
-               HLA_lengths,
-               refHLA_loci,
-               Vars,
-               Var_list,
-               Links,
-               HLAs_default,
-               Vars_default,
-               Var_list_default,
-               Links_default,
-               exclude_allele_list,
-               aligners,
-               num_mismatch,
-               assembly,
-               concordant_assembly,
-               exonic_only,
-               fastq,
-               read_fname,
-               alignment_fname,
-               num_frag_list,
-               threads,
-               enable_coverage,
-               best_alleles,
-               verbose):    
-    if simulation:
-        test_passed = {}
-    for aligner, index_type in aligners:
-        if index_type == "graph":
-            print >> sys.stderr, "\n\t\t%s %s on %s" % (aligner, index_type, reference_type)
-        else:
-            print >> sys.stderr, "\n\t\t%s %s" % (aligner, index_type)
-
-        remove_alignment_file = False
-        if alignment_fname == "":
-            # Align reads, and sort the alignments into a BAM file
-            remove_alignment_file = True
-            if simulation:
-                alignment_fname = "hla_output.bam"
-            else:
-                alignment_fname = read_fname[0].split('/')[-1]
-                alignment_fname = alignment_fname.split('.')[0] + ".bam"
-                
-            align_reads(ex_path,
-                        aligner,
-                        simulation,
-                        index_type,
-                        read_fname,
-                        fastq,
-                        threads,
-                        alignment_fname,
-                        verbose)
-            
-        for test_HLA_names in hla_list:
-            if simulation:
-                gene = test_HLA_names[0].split('*')[0]
-            else:
-                gene = test_HLA_names
-            ref_allele = refHLAs[gene]
-            ref_seq = HLAs[gene][ref_allele]
-            ref_exons = refHLA_loci[gene][-1]
-
-            if not os.path.exists(alignment_fname + ".bai"):
-                os.system("samtools index %s" % alignment_fname)
-            # Read alignments
-            alignview_cmd = ["samtools",
-                             "view",
-                             alignment_fname]
-            base_locus = 0
-            if index_type == "graph":
-                if reference_type == "gene":
-                    alignview_cmd += ["%s" % ref_allele]
-                else:
-                    assert reference_type in ["chromosome", "genome"]
-                    _, chr, left, right, _ = refHLA_loci[gene]
-                    base_locus = left
-                    alignview_cmd += ["%s:%d-%d" % (chr, left + 1, right + 1)]
-
-                bamview_proc = subprocess.Popen(alignview_cmd,
-                                                stdout=subprocess.PIPE,
-                                                stderr=open("/dev/null", 'w'))
-
-                sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
-                alignview_proc = subprocess.Popen(sort_read_cmd,
-                                                  stdin=bamview_proc.stdout,
-                                                  stdout=subprocess.PIPE,
-                                                  stderr=open("/dev/null", 'w'))
-            else:
-                alignview_proc = subprocess.Popen(alignview_cmd,
-                                             stdout=subprocess.PIPE,
-                                             stderr=open("/dev/null", 'w'))
-
-            # Assembly graph
-            asm_graph = assembly_graph.Graph(ref_seq)
-
-            # List of nodes that represent alleles
-            allele_vars = {}
-            for var_id, allele_list in Links_default.items():
-                for allele_id in allele_list:
-                    if allele_id not in HLAs[gene]:
-                        continue
-                    if allele_id not in allele_vars:
-                        allele_vars[allele_id] = [var_id]
-                    else:
-                        allele_vars[allele_id].append(var_id)
-
-            allele_nodes = {}
-            for allele_id, var_ids in allele_vars.items():
-                seq = list(ref_seq)  # sequence that node represents
-                var = ["" for i in range(len(ref_seq))]  # how sequence is related to backbone
-                for var_id in var_ids:
-                    assert var_id in Vars[gene]
-                    var_type, var_pos, var_data = Vars[gene][var_id]
-                    assert var_pos >= 0 and var_pos < len(ref_seq)
-                    if var_type == "single":
-                        seq[var_pos] = var_data
-                        var[var_pos] = var_id
-                    else:
-                        assert var_type == "deletion"
-                        del_len = int(var_data)
-                        assert var_pos + del_len <= len(ref_seq)
-                        seq[var_pos:var_pos + del_len] = ['D'] * del_len
-                        var[var_pos:var_pos + del_len] = [var_id] * del_len
-
-                seq = ''.join(seq)
-                allele_nodes[allele_id] = assembly_graph.Node(0, seq, var)
-
-            # Extract variants that are within exons
-            exon_vars = get_exonic_vars(Vars[gene], ref_exons)
-
-            # Choose allele representives from those that share the same exonic sequences
-            allele_reps, allele_rep_groups = get_rep_alleles(Links, exon_vars)
-            allele_rep_set = set(allele_reps.values())
-
-            # For checking alternative alignments near the ends of alignments
-            Alts_left, Alts_right = get_alternatives(ref_seq, Vars[gene], Var_list[gene], verbose)
-
-            # Count alleles
-            HLA_counts, HLA_cmpt = {}, {}
-            HLA_gen_counts, HLA_gen_cmpt = {}, {}
-            num_reads, total_read_len = 0, 0
-
-            # For debugging purposes
-            debug_allele_names = set(test_HLA_names) if simulation and verbose >= 2 else set()
-
-            # Read information
-            prev_read_id = None
-            prev_right_pos = 0
-            prev_lines = []
-            if index_type == "graph":
-                # nodes for reads
-                read_nodes = []
-                read_vars_list = []
-                
-                # Cigar regular expression
-                cigar_re = re.compile('\d+\w')
-                for line in alignview_proc.stdout:
-                    line = line.strip()
-                    cols = line.split()
-                    read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
-                    orig_read_id = read_id
-                    if simulation:
-                        read_id = read_id.split('|')[0]
-                    read_seq, qual = cols[9], cols[10]
-                    num_reads += 1
-                    total_read_len += len(read_seq)
-                    flag, pos = int(flag), int(pos)
-                    pos -= (base_locus + 1)
-                    if pos < 0:
-                        continue
-
-                    # Unalined?
-                    if flag & 0x4 != 0:
-                        if simulation and verbose >= 2:
-                            print "Unaligned"
-                            print "\t", line                            
-                        continue
-
-                    # Concordantly mapped?
-                    if flag & 0x2 != 0:
-                        concordant = True
-                    else:
-                        concordant = False
-
-                    NM, Zs, MD, NH = "", "", "", ""
-                    for i in range(11, len(cols)):
-                        col = cols[i]
-                        if col.startswith("Zs"):
-                            Zs = col[5:]
-                        elif col.startswith("MD"):
-                            MD = col[5:]
-                        elif col.startswith("NM"):
-                            NM = int(col[5:])
-                        elif col.startswith("NH"):
-                            NH = int(col[5:])
-
-                    if NM > num_mismatch:
-                        continue
-
-                    # Only consider unique alignment
-                    if NH > 1:
-                        continue
-
-                    if Zs:
-                        Zs = Zs.split(',')
-
-                    assert MD != ""
-                    MD_str_pos, MD_len = 0, 0
-                    Zs_pos, Zs_i = 0, 0
-                    for _i in range(len(Zs)):
-                        Zs[_i] = Zs[_i].split('|')
-                    if Zs_i < len(Zs):
-                        Zs_pos += int(Zs[Zs_i][0])
-                    read_pos, left_pos = 0, pos
-                    right_pos = left_pos
-                    cigars = cigar_re.findall(cigar_str)
-                    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-                    cmp_list = []
-
-                    # Extract variants w.r.t backbone from CIGAR string
-                    softclip = [0, 0]
-                    for i in range(len(cigars)):
-                        cigar_op, length = cigars[i]
-                        if cigar_op == 'M':
-                            # Update coverage
-                            if enable_coverage:
-                                if right_pos + length < len(coverage):
-                                    coverage[right_pos] += 1
-                                    coverage[right_pos + length] -= 1
-                                elif right_pos < len(coverage):
-                                    coverage[right_pos] += 1
-                                    coverage[-1] -= 1
-
-                            first = True
-                            MD_len_used = 0
-                            while True:
-                                if not first or MD_len == 0:
-                                    if MD[MD_str_pos].isdigit():
-                                        num = int(MD[MD_str_pos])
-                                        MD_str_pos += 1
-                                        while MD_str_pos < len(MD):
-                                            if MD[MD_str_pos].isdigit():
-                                                num = num * 10 + int(MD[MD_str_pos])
-                                                MD_str_pos += 1
-                                            else:
-                                                break
-                                        MD_len += num
-                                # Insertion or full match followed
-                                if MD_len >= length:
-                                    MD_len -= length
-                                    cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
-                                    break
-                                first = False
-                                read_base = read_seq[read_pos + MD_len]
-                                MD_ref_base = MD[MD_str_pos]
-                                MD_str_pos += 1
-                                assert MD_ref_base in "ACGT"
-                                cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])
-
-                                _var_id = "unknown"
-                                if read_pos + MD_len == Zs_pos and Zs_i < len(Zs):
-                                    assert Zs[Zs_i][1] == 'S'
-                                    _var_id = Zs[Zs_i][2]
-                                    Zs_i += 1
-                                    Zs_pos += 1
-                                    if Zs_i < len(Zs):
-                                        Zs_pos += int(Zs[Zs_i][0])
-
-                                cmp_list.append(["mismatch", right_pos + MD_len, 1, _var_id])
-                                MD_len_used = MD_len + 1
-                                MD_len += 1
-                                # Full match
-                                if MD_len == length:
-                                    MD_len = 0
-                                    break
-                        elif cigar_op == 'I':
-                            cmp_list.append(["insertion", right_pos, length])
-                        elif cigar_op == 'D':
-                            if MD[MD_str_pos] == '0':
-                                MD_str_pos += 1
-                            assert MD[MD_str_pos] == '^'
-                            MD_str_pos += 1
-                            while MD_str_pos < len(MD):
-                                if not MD[MD_str_pos] in "ACGT":
-                                    break
-                                MD_str_pos += 1
-                            _var_id = "unknown"
-                            if read_pos == Zs_pos and Zs_i < len(Zs):
-                                assert Zs[Zs_i][1] == 'D'
-                                _var_id = Zs[Zs_i][2]
-                                Zs_i += 1
-                                if Zs_i < len(Zs):
-                                    Zs_pos += int(Zs[Zs_i][0])
-
-                            cmp_list.append(["deletion", right_pos, length, _var_id])
-                        elif cigar_op == 'S':
-                            if i == 0:
-                                softclip[0] = length
-                                Zs_pos += length
-                            else:
-                                assert i + 1 == len(cigars)
-                                softclip[1] = length
-                        else:                    
-                            assert cigar_op == 'N'
-                            cmp_list.append(["intron", right_pos, length])
-
-                        if cigar_op in "MND":
-                            right_pos += length
-
-                        if cigar_op in "MIS":
-                            read_pos += length
-
-                    # Remove softclip in cigar and modify read_seq and read_qual accordingly
-                    if sum(softclip) > 0:
-                        if softclip[0] > 0:
-                            cigars = cigars[1:]
-                            read_seq = read_seq[softclip[0]:]
-                            qual = qual[softclip[0]:]
-                        if softclip[1] > 0:
-                            cigars = cigars[:-1]
-                            read_seq = read_seq[:-softclip[1]]
-                            qual = qual[:-softclip[1]]
-
-                        cigar_str = ""
-                        for type, length in cigars:
-                            cigar_str += str(length)
-                            cigar_str += type
-                    
-                    if right_pos > len(ref_seq):
-                        continue
-
-                    def add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, include_alleles = set()):
-                        max_count = max(HLA_count_per_read.values())
-                        cur_cmpt = set()
-                        for allele, count in HLA_count_per_read.items():
-                            if count < max_count:
-                                continue
-
-                            if len(include_alleles) > 0 and allele not in include_alleles:
-                                continue
-                            
-                            cur_cmpt.add(allele)                    
-                            if allele not in HLA_counts:
-                                HLA_counts[allele] = 1
-                            else:
-                                HLA_counts[allele] += 1
-
-                        if len(cur_cmpt) == 0:
-                            return ""
-
-                        # DK - for debugging purposes                            
-                        alleles = ["", ""]
-                        # alleles = ["B*40:304", "B*40:02:01"]
-                        allele1_found, allele2_found = False, False
-                        if alleles[0] != "":
-                            for allele, count in HLA_count_per_read.items():
-                                if count < max_count:
-                                    continue
-                                if allele == alleles[0]:
-                                    allele1_found = True
-                                elif allele == alleles[1]:
-                                    allele2_found = True
-                            if allele1_found != allele2_found:
-                                print alleles[0], HLA_count_per_read[alleles[0]]
-                                print alleles[1], HLA_count_per_read[alleles[1]]
-                                if allele1_found:
-                                    print ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, HLA_count_per_read[alleles[1]]))
-                                else:
-                                    print ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, HLA_count_per_read[alleles[0]]))
-                                print read_seq
-
-                        cur_cmpt = sorted(list(cur_cmpt))
-                        cur_cmpt = '-'.join(cur_cmpt)
-                        if not cur_cmpt in HLA_cmpt:
-                            HLA_cmpt[cur_cmpt] = 1
-                        else:
-                            HLA_cmpt[cur_cmpt] += 1
-
-                        return cur_cmpt
-
-                    if read_id != prev_read_id:
-                        if prev_read_id != None:
-                            cur_cmpt = add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, allele_rep_set)
-                            add_stat(HLA_gen_cmpt, HLA_gen_counts, HLA_gen_count_per_read)
-                            for read_id_, read_node in read_nodes:
-                                asm_graph.add_node(read_id_, read_node)
-                            read_nodes, read_var_list = [], []
-
-                            if verbose >= 2:
-                                cur_cmpt = cur_cmpt.split('-')
-                                if not(set(cur_cmpt) & set(test_HLA_names)):
-                                    print "%s are chosen instead of %s" % ('-'.join(cur_cmpt), '-'.join(test_HLA_names))
-                                    for prev_line in prev_lines:
-                                        print "\t", prev_line
-
-                            prev_lines = []
-
-                        HLA_count_per_read, HLA_gen_count_per_read = {}, {}
-                        for HLA_name in HLA_names[gene]:
-                            if HLA_name.find("BACKBONE") != -1:
-                                continue
-                            HLA_count_per_read[HLA_name] = 0
-                            HLA_gen_count_per_read[HLA_name] = 0
-
-                    prev_lines.append(line)
-
-                    def add_count(count_per_read, var_id, add):
-                        alleles = Links[var_id]
-                        if verbose >= 2:
-                            if add > 0 and not (set(alleles) & debug_allele_names):
-                                print "Add:", add, debug_allele_names, "-", var_id
-                                print "\t", line
-                                print "\t", alleles
-                            if add < 0 and set(alleles) & debug_allele_names:
-                                print "Add:", add, debug_allele_names, "-", var_id
-                                print "\t", line
-
-                        for allele in alleles:
-                            count_per_read[allele] += add
-
-                    # Decide which allele(s) a read most likely came from
-                    for var_id, data in Vars[gene].items():
-                        var_type, var_pos, var_data = data
-                        if var_type != "deletion":
-                            continue
-                        if left_pos >= var_pos and right_pos <= var_pos + int(var_data):
-                            if var_id in exon_vars:
-                                add_count(HLA_count_per_read, var_id, -1)
-                            add_count(HLA_gen_count_per_read, var_id, -1)
-
-                    # Node
-                    read_node_pos, read_node_seq, read_node_var = -1, "", []
-                    read_vars = []
-
-                    # Positive and negative evidence
-                    positive_vars, negative_vars = set(), set()
-
-                    # Sanity check - read length, cigar string, and MD string
-                    ref_pos, read_pos, cmp_cigar_str, cmp_MD = left_pos, 0, "", ""
-                    cigar_match_len, MD_match_len = 0, 0
-                    cmp_list_left, cmp_list_right = identify_ambigious_diffs(Vars[gene],
-                                                                             Alts_left,
-                                                                             Alts_right,
-                                                                             cmp_list,
-                                                                             verbose)
-
-                    cmp_i = 0
-                    while cmp_i < len(cmp_list):
-                        cmp = cmp_list[cmp_i]
-                        type, length = cmp[0], cmp[2]
-                        if num_mismatch == 0 and type in ["mismatch", "deletion", "insertion"]:
-                            assert cmp[3] != "unknown"
-
-                        if type in ["match", "mismatch"]:
-                            if read_node_pos < 0:
-                                read_node_pos = ref_pos
-
-                        if type == "match":
-                            read_node_seq += read_seq[read_pos:read_pos+length]
-                            read_node_var += ([''] * length)
-                            
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos + length <= var_pos:
-                                    break
-                                if ref_pos <= var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "insertion":
-                                        if ref_pos < var_pos and ref_pos + length > var_pos + len(var_data):
-                                            negative_vars.add(var_id)
-                                    elif var_type == "deletion":
-                                        del_len = int(var_data)
-                                        if ref_pos < var_pos and ref_pos + length > var_pos + del_len:
-                                            # Check if this might be one of the two tandem repeats (the same left coordinate)
-                                            cmp_left, cmp_right = cmp[1], cmp[1] + cmp[2]
-                                            test1_seq1 = ref_seq[cmp_left:cmp_right]
-                                            test1_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos + del_len:cmp_right + del_len]
-                                            # Check if this happens due to small repeats (the same right coordinate - e.g. 19 times of TTTC in DQA1*05:05:01:02)
-                                            cmp_left -= read_pos
-                                            cmp_right += (len(read_seq) - read_pos - cmp[2])
-                                            test2_seq1 = ref_seq[cmp_left+int(var_data):cmp_right]
-                                            test2_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos+int(var_data):cmp_right]
-                                            if test1_seq1 != test1_seq2 and test2_seq1 != test2_seq2:
-                                                negative_vars.add(var_id)
-                                    else:
-                                        negative_vars.add(var_id)
-                                var_idx += 1
-                            read_pos += length
-                            ref_pos += length
-                            cigar_match_len += length
-                            MD_match_len += length
-                        elif type == "mismatch":
-                            var_id = cmp[3]
-                            read_base = read_seq[read_pos]
-                            read_node_seq += read_base
-                            read_node_var.append(var_id)
-                            if var_id != "unknown":
-                                if cmp_i >= cmp_list_left and cmp_i <= cmp_list_right:
-                                    positive_vars.add(var_id)
-                            
-                            cmp_MD += ("%d%s" % (MD_match_len, ref_seq[ref_pos]))
-                            MD_match_len = 0
-                            cigar_match_len += 1
-                            read_pos += 1
-                            ref_pos += 1
-                        elif type == "insertion":
-                            assert False
-                            ins_seq = read_seq[read_pos:read_pos+length]
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos < var_pos:
-                                    break
-                                if ref_pos == var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "insertion":                                
-                                        if var_data == ins_seq:
-                                            positive_vars.add(var_id)
-                                var_idx += 1
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            read_pos += length
-                            cmp_cigar_str += ("%dI" % length)
-                        elif type == "deletion":
-                            var_id = cmp[3]
-                            alt_match = False
-                            del_len = length
-                            read_node_seq += ('D' * del_len)
-                            if var_id != "unknown":
-                                if cmp_i >= cmp_list_left and cmp_i <= cmp_list_right:
-                                    # Require at least 5bp match before and after a deletion
-                                    if read_pos >= 5 and read_pos + 5 <= len(read_seq):
-                                        positive_vars.add(var_id)
-
-                            if len(read_node_seq) > len(read_node_var):
-                                assert len(read_node_seq) == len(read_node_var) + del_len
-                                read_node_var += ([var_id] * del_len)
-
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            cmp_MD += ("%d" % MD_match_len)
-                            MD_match_len = 0
-                            cmp_cigar_str += ("%dD" % length)
-                            cmp_MD += ("^%s" % ref_seq[ref_pos:ref_pos+length])
-                            ref_pos += length
-                        else:
-                            assert type == "intron"
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            cmp_cigar_str += ("%dN" % length)
-                            ref_pos += length
-
-                        cmp_i += 1
-                
-                    if cigar_match_len > 0:
-                        cmp_cigar_str += ("%dM" % cigar_match_len)
-                    cmp_MD += ("%d" % MD_match_len)
-                    # Sanity check
-                    if read_pos != len(read_seq) or \
-                            cmp_cigar_str != cigar_str or \
-                            cmp_MD != MD:
-                        print >> sys.stderr, "Error:", cigar_str, MD
-                        print >> sys.stderr, "\tcomputed:", cmp_cigar_str, cmp_MD
-                        print >> sys.stderr, "\tcmp list:", cmp_list
-                        assert False
-
-                    # Node
-                    read_nodes.append([orig_read_id, assembly_graph.Node(read_node_pos, read_node_seq, read_node_var)])
-
-                    for positive_var in positive_vars:
-                        if positive_var in exon_vars:
-                            add_count(HLA_count_per_read, positive_var, 1)
-                        add_count(HLA_gen_count_per_read, positive_var, 1)
-                    for negative_var in negative_vars:
-                        if negative_var in exon_vars:
-                            add_count(HLA_count_per_read, negative_var, -1)
-                        add_count(HLA_gen_count_per_read, negative_var, -1)
-
-                    prev_read_id = read_id
-                    prev_right_pos = right_pos
-
-                if num_reads <= 0:
-                    continue
-
-                if prev_read_id != None:
-                    add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, allele_rep_set)
-                    add_stat(HLA_gen_cmpt, HLA_gen_counts, HLA_gen_count_per_read)
-                    for read_id_, read_node in read_nodes:
-                        asm_graph.add_node(read_id_, read_node)
-                    read_nodes, read_var_list = [], []
-
-                # Generate edges
-                asm_graph.generate_edges()
-
-                # Draw assembly graph
-                if len(num_frag_list) > 0:
-                    asm_graph.draw("assembly_graph1", num_frag_list[0][0])
-                else:
-                    asm_graph.draw("assembly_graph1")                    
-
-                # Reduce graph
-                asm_graph.reduce()
-
-                # Draw assembly graph
-                if len(num_frag_list) > 0:
-                    asm_graph.draw("assembly_graph2", num_frag_list[0][0])
-                else:
-                    asm_graph.draw("assembly_graph2")
-                
-                # Further reduce graph with mate pairs
-                tmp_nodes = asm_graph.assemble_with_mates()
-
-                # Draw assembly graph
-                if len(num_frag_list) > 0:
-                    asm_graph.draw("assembly_graph3", num_frag_list[0][0])
-                else:
-                    asm_graph.draw("assembly_graph3")
-
-                # DK - debugging purposes
-                print >> sys.stderr, "Number of tmp nodes:", len(tmp_nodes)
-                for i in range(min(10, len(tmp_nodes))):
-                    node, node_id, node_id_last = tmp_nodes[i]
-                    node_vars = node.get_vars(Vars[gene])
-                    print >> sys.stderr, node_id, node_id_last, node.merged_nodes; node.print_info()
-                    print >> sys.stderr
-                    if simulation:
-                        allele_name, cmp_vars, max_common = "", [], -1
-                        for test_HLA_name in test_HLA_names:
-                            tmp_vars = allele_nodes[test_HLA_name].get_vars(Vars[gene])
-                            tmp_common = len(set(node_vars) & set(allele_vars[test_HLA_name]))
-                            if max_common < tmp_common:
-                                max_common = tmp_common
-                                allele_name = test_HLA_name
-                                cmp_vars = tmp_vars
-                        print >> sys.stderr, "vs.", allele_name
-                        var_i, var_j = 0, 0
-                        while var_i < len(cmp_vars) and var_j < len(node_vars):
-                            cmp_var_id, node_var_id = cmp_vars[var_i], node_vars[var_j]
-                            if cmp_var_id == node_var_id:
-                                print >> sys.stderr, cmp_var_id, Vars[gene][cmp_var_id]
-                                var_i += 1; var_j += 1
-                                continue
-                            cmp_var, node_var = Vars[gene][cmp_var_id], Vars[gene][node_var_id]
-                            if cmp_var[1] <= node_var[1]:
-                                if (var_i > 0 and var_i + 1 < len(cmp_vars)) or cmp_var[0] != "deletion":
-                                    print >> sys.stderr, "***", cmp_var_id, cmp_var, "=="
-                                var_i += 1
-                            else:
-                                print >> sys.stderr, "*** ==", node_var_id, node_var
-                                var_j += 1
-                                
-                            
-                # asm_graph.assemble()
-                
-                # DK - debugging purposes
-                sys.exit(1)
-
-            else:
-                assert index_type == "linear"
-                def add_alleles(alleles):
-                    if not allele in HLA_counts:
-                        HLA_counts[allele] = 1
-                    else:
-                        HLA_counts[allele] += 1
-
-                    cur_cmpt = sorted(list(alleles))
-                    cur_cmpt = '-'.join(cur_cmpt)
-                    if not cur_cmpt in HLA_cmpt:
-                        HLA_cmpt[cur_cmpt] = 1
-                    else:
-                        HLA_cmpt[cur_cmpt] += 1
-
-                prev_read_id, prev_AS = None, None
-                alleles = set()
-                for line in alignview_proc.stdout:
-                    cols = line[:-1].split()
-                    read_id, flag, allele = cols[:3]
-                    flag = int(flag)
-                    if flag & 0x4 != 0:
-                        continue
-                    if not allele.startswith(gene):
-                        continue
-                    if allele.find("BACKBONE") != -1:
-                        continue
-
-                    AS = None
-                    for i in range(11, len(cols)):
-                        col = cols[i]
-                        if col.startswith("AS"):
-                            AS = int(col[5:])
-                    assert AS != None
-                    if read_id != prev_read_id:
-                        if alleles:
-                            if aligner == "hisat2" or \
-                                    (aligner == "bowtie2" and len(alleles) < 10):
-                                add_alleles(alleles)
-                            alleles = set()
-                        prev_AS = None
-                    if prev_AS != None and AS < prev_AS:
-                        continue
-                    prev_read_id = read_id
-                    prev_AS = AS
-                    alleles.add(allele)
-
-                if alleles:
-                    add_alleles(alleles)
-
-            HLA_counts = [[allele, count] for allele, count in HLA_counts.items()]
-            def HLA_count_cmp(a, b):
-                if a[1] != b[1]:
-                    return b[1] - a[1]
-                assert a[0] != b[0]
-                if a[0] < b[0]:
-                    return -1
-                else:
-                    return 1
-            HLA_counts = sorted(HLA_counts, cmp=HLA_count_cmp)
-            for count_i in range(len(HLA_counts)):
-                count = HLA_counts[count_i]
-                if simulation:
-                    found = False
-                    for test_HLA_name in test_HLA_names:
-                        if count[0] == test_HLA_name:
-                            print >> sys.stderr, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_HLA_name, count[1])
-                            found = True
-                            """
-                            if count_i > 0 and HLA_counts[0][1] > count[1]:
-                                print >> sys.stderr, "Warning: %s ranked first (count: %d)" % (HLA_counts[0][0], HLA_counts[0][1])
-                                assert False
-                            else:
-                                test_passed += 1
-                            """
-                    if count_i < 5 and not found:
-                        print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                else:
-                    print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                    if count_i >= 9:
-                        break
-            print >> sys.stderr
-
-            # Calculate the abundance of representative alleles on exonic sequences
-            HLA_prob = single_abundance(HLA_cmpt, HLA_lengths[gene])
-
-            # Incorporate non representative alleles (full length alleles)
-            gen_alleles = set()
-            gen_prob_sum = 0.0
-            for prob_i in range(len(HLA_prob)):
-                allele, prob = HLA_prob[prob_i][:2]
-                if prob_i >= 10 and prob < 0.03:
-                    break
-                if allele in partial_alleles:
-                    continue
-                
-                gen_prob_sum += prob
-                for allele2 in allele_rep_groups[allele]:
-                    gen_alleles.add(allele2)
-            if len(gen_alleles) > 0:
-                HLA_gen_cmpt2 = {}
-                for cmpt, value in HLA_gen_cmpt.items():
-                    cmpt2 = []
-                    for allele in cmpt.split('-'):
-                        if allele in gen_alleles:
-                            cmpt2.append(allele)
-                    if len(cmpt2) == 0:
-                        continue
-                    cmpt2 = '-'.join(cmpt2)
-                    if cmpt2 not in HLA_gen_cmpt2:
-                        HLA_gen_cmpt2[cmpt2] = value
-                    else:
-                        HLA_gen_cmpt2[cmpt2] += value
-                HLA_gen_cmpt = HLA_gen_cmpt2
-                HLA_gen_prob = single_abundance(HLA_gen_cmpt, HLA_lengths[gene])
-
-                HLA_combined_prob = {}
-                for allele, prob in HLA_prob:
-                    assert allele not in HLA_combined_prob
-                    if allele in gen_alleles:
-                        HLA_combined_prob[allele] = 0.0
-                    else:
-                        HLA_combined_prob[allele] = prob
-                for allele, prob in HLA_gen_prob:
-                    HLA_combined_prob[allele] = prob * gen_prob_sum
-                HLA_prob = [[allele, prob] for allele, prob in HLA_combined_prob.items()]
-                HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-
-            success = [False for i in range(len(test_HLA_names))]
-            found_list = [False for i in range(len(test_HLA_names))]
-            for prob_i in range(len(HLA_prob)):
-                prob = HLA_prob[prob_i]
-                found = False
-                _allele_rep = prob[0]
-                if partial and exonic_only:
-                    _fields = _allele_rep.split(':')
-                    if len(_fields) == 4:
-                        _allele_rep = ':'.join(_fields[:-1])
-
-                if simulation:
-                    for name_i in range(len(test_HLA_names)):
-                        test_HLA_name = test_HLA_names[name_i]
-                        if prob[0] == test_HLA_name:
-                            rank_i = prob_i
-                            while rank_i > 0:
-                                if prob == HLA_prob[rank_i - 1][1]:
-                                    rank_i -= 1
-                                else:
-                                    break
-                            print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_HLA_name, prob[1] * 100.0)
-                            if rank_i < len(success):
-                                success[rank_i] = True
-                            found_list[name_i] = True
-                            found = True
-                    # DK - for debugging purposes
-                    if not False in found_list and prob_i >= 10:
-                        break
-                if not found:
-                    print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, _allele_rep, prob[1] * 100.0)
-                    if best_alleles and prob_i < 2:
-                        print >> sys.stdout, "SingleModel %s (abundance: %.2f%%)" % (_allele_rep, prob[1] * 100.0)
-                if not simulation and prob_i >= 9:
-                    break
-                if prob_i >= 19:
-                    break
-            print >> sys.stderr
-
-            # DK - for debugging purposes
-            if False and (len(test_HLA_names) == 2 or not simulation):
-                HLA_prob = joint_abundance(HLA_cmpt, HLA_lengths[gene])
-                if len(HLA_prob) <= 0:
-                    continue
-                success = [False]
-                for prob_i in range(len(HLA_prob)):
-                    allele_pair, prob = HLA_prob[prob_i]
-                    allele1, allele2 = allele_pair.split('-')
-                    if best_alleles and prob_i < 1:
-                        print >> sys.stdout, "PairModel %s (abundance: %.2f%%)" % (allele_pair, prob * 100.0)
-                    if simulation:
-                        if allele1 in test_HLA_names and allele2 in test_HLA_names:
-                            rank_i = prob_i
-                            while rank_i > 0:
-                                if HLA_prob[rank_i-1][1] == prob:                                        
-                                    rank_i -= 1
-                                else:
-                                    break
-                            print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, allele_pair, prob * 100.0)
-                            if rank_i == 0:
-                                success[0] = True
-                            break
-                    print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, allele_pair, prob * 100.0)
-                    if not simulation and prob_i >= 9:
-                        break
-                print >> sys.stderr
-
-                # Li's method
-                """
-                li_hla = os.path.join(ex_path, "li_hla/hla")
-                if os.path.exists(li_hla):
-                    li_hla_cmd = [li_hla,
-                                  "hla",
-                                  "hla_input.bam",
-                                  "-b", "%s*BACKBONE" % gene]
-                    li_hla_proc = subprocess.Popen(li_hla_cmd,
-                                                   stdout=subprocess.PIPE,
-                                                   stderr=open("/dev/null", 'w'))
-
-                    # read in the result of Li's hla
-                    for line in li_hla_proc.stdout:
-                        allele1, allele2, score = line.strip().split()
-                        score = float(score)
-                        if simulation:
-                            if allele1 in test_HLA_names and allele2 in test_HLA_names:
-                                print >> sys.stderr, "\t\t\t*** 1 ranked %s-%s (score: %.2f)" % (allele1, allele2, score)
-                                success[0] = True
-                            else:
-                                print >> sys.stderr, "\t\t\tLiModel fails"
-                        if best_alleles:
-                            print >> sys.stdout, "LiModel %s-%s (score: %.2f)" % (allele1, allele2, score)
-                    li_hla_proc.communicate()
-                """
-
-            if simulation and not False in success:
-                aligner_type = "%s %s" % (aligner, index_type)
-                if not aligner_type in test_passed:
-                    test_passed[aligner_type] = 1
-                else:
-                    test_passed[aligner_type] += 1
-
-        if remove_alignment_file:
-            os.system("rm %s*" % (alignment_fname))
-
-    if simulation:
-        return test_passed
-
-    
-"""
-"""
-def read_HLA_alleles(fname, HLAs):
-    for line in open(fname):
-        if line.startswith(">"):
-            HLA_name = line.strip().split()[0][1:]
-            HLA_gene = HLA_name.split('*')[0]
-            if not HLA_gene in HLAs:
-                HLAs[HLA_gene] = {}
-            if not HLA_name in HLAs[HLA_gene]:
-                HLAs[HLA_gene][HLA_name] = ""
-        else:
-            HLAs[HLA_gene][HLA_name] += line.strip()
-    return HLAs
-
-
-"""
-"""
-def read_HLA_vars(fname, reference_type):
-    Vars, Var_list = {}, {}
-    for line in open(fname):
-        var_id, var_type, allele, pos, data = line.strip().split('\t')
-        pos = int(pos)
-        if reference_type != "gene":
-            allele, dist = None, 0
-            for tmp_gene, values in refHLA_loci.items():
-                allele_name, chr, left, right, exons = values
-                if allele == None:
-                    allele = allele_name
-                    dist = abs(pos - left)
-                else:
-                    if dist > abs(pos - left):
-                        allele = allele_name
-                        dist = abs(pos - left)
-            
-        gene = allele.split('*')[0]
-        if not gene in Vars:
-            Vars[gene] = {}
-            assert not gene in Var_list
-            Var_list[gene] = []
-            
-        assert not var_id in Vars[gene]
-        left = 0
-        if reference_type != "gene":
-            _, _, left, _, _ = refHLA_loci[gene]
-        Vars[gene][var_id] = [var_type, pos - left, data]
-        Var_list[gene].append([pos - left, var_id])
-        
-    for gene, in_var_list in Var_list.items():
-        Var_list[gene] = sorted(in_var_list)
-
-    return Vars, Var_list
-
-
-"""
-"""
-def read_HLA_links(fname):
-    Links = {}
-    for line in open(fname):
-        var_id, alleles = line.strip().split('\t')
-        alleles = alleles.split()
-        assert not var_id in Links
-        Links[var_id] = alleles
-
-    return Links
-
-
-"""
-"""
-def construct_allele_seq(backbone_seq, var_ids, Vars):
-    allele_seq = list(backbone_seq)
-    for id in var_ids:
-        assert id in Vars
-        type, pos, data = Vars[id]
-        assert pos < len(allele_seq)
-        if type == "single":
-            assert allele_seq[pos] != data
-            allele_seq[pos] = data
-        else:
-            assert type == "deletion"
-            del_len = int(data)
-            assert pos + del_len <= len(allele_seq)
-            for i in range(pos, pos + del_len):
-                allele_seq[i] = '.'
-
-    allele_seq = ''.join(allele_seq)
-    allele_seq = allele_seq.replace('.', '')
-    return allele_seq
-
-
-"""
-"""
-def test_HLA_genotyping(base_fname,
-                        reference_type,
-                        hla_list,
-                        partial,
-                        aligners,
-                        read_fname,
-                        alignment_fname,
-                        threads,
-                        simulate_interval,
-                        enable_coverage,
-                        best_alleles,
-                        exclude_allele_list,
-                        default_allele_list,
-                        num_mismatch,
-                        perbase_errorrate,
-                        assembly,
-                        concordant_assembly,
-                        exonic_only,
-                        verbose,
-                        daehwan_debug):
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(test_HLA_genotyping))
-    ex_path = os.path.dirname(curr_script)
-
-    # Clone a git repository, IMGTHLA
-    if not os.path.exists("IMGTHLA"):
-        os.system("git clone https://github.com/jrob119/IMGTHLA.git")
-
-    simulation = (read_fname == [] and alignment_fname == "")
-
-    def check_files(fnames):
-        for fname in fnames:
-            if not os.path.exists(fname):
-                return False
-        return True
-
-    # Download HISAT2 index
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-    if not check_files(HISAT2_fnames):
-        os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz")
-        hisat2_inspect = os.path.join(ex_path, "hisat2-inspect")
-        os.system("%s grch38/genome > genome.fa" % hisat2_inspect)
-        os.system("samtools faidx genome.fa")
-
-    # Check if the pre-existing files (hla*) are compatible with the current parameter setting
-    if os.path.exists("hla.ref"):
-        left = 0
-        HLA_genes = set()
-        BACKBONE = False
-        for line in open("hla.ref"):
-            HLA_name = line.strip().split()[0]
-            if HLA_name.find("BACKBONE") != -1:
-                BACKBONE = True
-            HLA_gene = HLA_name.split('*')[0]
-            HLA_genes.add(HLA_gene)
-        delete_hla_files = False
-        if reference_type == "gene":
-            if not BACKBONE:
-                delete_hla_files = True
-        elif reference_type in ["chromosome", "genome"]:
-            if BACKBONE:
-                delete_hla_files = True
-        else:
-            assert False
-        if not set(hla_list).issubset(HLA_genes):
-            delete_hla_files = True
-        if delete_hla_files:
-            os.system("rm hla*")
-    
-    # Extract HLA variants, backbone sequence, and other sequeces  
-    if len(base_fname) > 0:
-        base_fname = "_" + base_fname
-    base_fname = "hla" + base_fname
-    
-    HLA_fnames = [base_fname + "_backbone.fa",
-                  base_fname + "_sequences.fa",
-                  base_fname + ".ref",
-                  base_fname + ".snp",
-                  base_fname + ".haplotype",
-                  base_fname + ".link",
-                  base_fname + "_alleles_excluded.txt"]
-    
-    # Check if excluded alleles in current files match
-    excluded_alleles_match = False
-    if(os.path.exists(HLA_fnames[6])):
-        afile = open(HLA_fnames[6],'r')
-        afile.readline()
-        lines = afile.read().split()
-        excluded_alleles_match = (set(exclude_allele_list) == set(lines))
-        afile.close()
-    elif len(exclude_allele_list) == 0:
-        excluded_alleles_match = True
-        try:
-            temp_name = HLA_fnames[6]
-            HLA_fnames.remove(HLA_fnames[6])
-            os.remove(temp_name)
-        except OSError:
-            pass
-        
-    if not excluded_alleles_match:
-        print("Creating Allele Exclusion File.\n")
-        afile = open(HLA_fnames[6],'w')
-        afile.write("Alleles excluded:\n")
-        afile.write("\n".join(exclude_allele_list))
-        afile.close()
-
-    if verbose >= 1:
-        print >> sys.stderr, HLA_fnames
-    
-    if (not check_files(HLA_fnames)) or (not excluded_alleles_match) :
-        extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py")
-        extract_cmd = [extract_hla_script,
-                       "--reference-type", reference_type,
-                       "--hla-list", ','.join(hla_list)]
-
-        if len(exclude_allele_list) > 0:
-            print exclude_allele_list
-            extract_cmd += ["--exclude-allele-list", ",".join(exclude_allele_list)]
-
-        if len(base_fname) > 3:
-            extract_cmd += ["--base", base_fname]
-
-        if not partial:
-            extract_cmd += ["--no-partial"]
-        extract_cmd += ["--inter-gap", "30",
-                        "--intra-gap", "50"]
-        if verbose >= 1:
-            print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-        proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()
-        
-        if not check_files(HLA_fnames):
-            print >> sys.stderr, "Error: extract_HLA_vars failed!"
-            sys.exit(1)
-
-    for aligner, index_type in aligners:
-        if aligner == "hisat2":
-            # Build HISAT2 graph indexes based on the above information
-            if index_type == "graph":
-                HLA_hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base_fname, i+1) for i in range(8)]
-                if not check_files(HLA_hisat2_graph_index_fnames) or (not excluded_alleles_match):
-                    hisat2_build = os.path.join(ex_path, "hisat2-build")
-                    build_cmd = [hisat2_build,
-                                 "-p", str(threads),
-                                 "--snp", "%s.snp" % base_fname,
-                                 "--haplotype", "%s.haplotype" % base_fname,
-                                 "%s_backbone.fa" % base_fname,
-                                 "%s.graph" % base_fname]
-                    if verbose >= 1:
-                        print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
-                    proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                    proc.communicate()        
-                    if not check_files(HLA_hisat2_graph_index_fnames):
-                        print >> sys.stderr, "Error: indexing HLA failed!  Perhaps, you may have forgotten to build hisat2 executables?"
-                        sys.exit(1)
-            # Build HISAT2 linear indexes based on the above information
-            else:
-                assert index_type == "linear"
-                HLA_hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base_fname, i+1) for i in range(8)]
-                if reference_type == "gene" and (not check_files(HLA_hisat2_linear_index_fnames) or (not excluded_alleles_match)):
-                    hisat2_build = os.path.join(ex_path, "hisat2-build")
-                    build_cmd = [hisat2_build,
-                                 "%s_backbone.fa,%s_sequences.fa" % (base_fname, base_fname),
-                                 "%s.linear" % base_fname]
-                    proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                    proc.communicate()        
-                    if not check_files(HLA_hisat2_linear_index_fnames):
-                        print >> sys.stderr, "Error: indexing HLA failed!"
-                        sys.exit(1)
-        else:
-            assert aligner == "bowtie2" and index_type == "linear"
-            # Build Bowtie2 indexes based on the above information
-            HLA_bowtie2_index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)]
-            HLA_bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)]
-            if reference_type == "gene" and (not check_files(HLA_bowtie2_index_fnames) or (not excluded_alleles_match)):
-                build_cmd = ["bowtie2-build",
-                             "%s_backbone.fa,%s_sequences.fa" % (base_fname, base_fname),
-                             base_fname]
-                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'))
-                proc.communicate()        
-                if not check_files(HLA_bowtie2_index_fnames):
-                    print >> sys.stderr, "Error: indexing HLA failed!"
-                    sys.exit(1)
-
-    # Read partial alleles from hla.data (temporary)
-    partial_alleles = set()
-    for line in open("IMGTHLA/hla.dat"):
-        if not line.startswith("DE"):
-            continue
-        allele_name = line.split()[1][4:-1]
-        gene = allele_name.split('*')[0]
-        if line.find("partial") != -1:
-            partial_alleles.add(allele_name)
-
-    if len(default_allele_list) > 0:
-        if not os.path.exists("Default-HLA/hla_backbone.fa"):
-            try:
-                os.mkdir("Default-HLA")
-            except:
-                pass
-            #os.chdir(current_path + "/Default-HLA")
-            
-            extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py")
-            extract_cmd = [extract_hla_script,
-                           "--reference-type", reference_type,
-                           "--hla-list", ','.join(hla_list),
-                           "--base", "Default-HLA/hla"]
-
-            if not partial:
-                extract_cmd += ["--no-partial"]
-            extract_cmd += ["--inter-gap", "30",
-                            "--intra-gap", "50"]
-            if verbose >= 1:
-                print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-            proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-            proc.communicate()
-            
-            if not os.path.exists("Default-HLA/hla_backbone.fa"):
-                print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!"
-                sys.exit(1)
-    
-    # Read HLA alleles (names and sequences)
-    refHLAs, refHLA_loci = {}, {}
-    for line in open("hla.ref"):
-        HLA_name, chr, left, right, length, exon_str = line.strip().split()
-        HLA_gene = HLA_name.split('*')[0]
-        assert not HLA_gene in refHLAs
-        refHLAs[HLA_gene] = HLA_name
-        left, right = int(left), int(right)
-        exons = []
-        for exon in exon_str.split(','):
-            exon_left, exon_right = exon.split('-')
-            exons.append([int(exon_left), int(exon_right)])
-        refHLA_loci[HLA_gene] = [HLA_name, chr, left, right, exons]
-    HLAs = {}
-
-    if reference_type == "gene":
-        read_HLA_alleles(base_fname + "_backbone.fa", HLAs)
-    read_HLA_alleles(base_fname + "_sequences.fa", HLAs)
-    
-    # HLA gene alleles
-    HLA_names = {}
-    for HLA_gene, data in HLAs.items():
-        HLA_names[HLA_gene] = list(data.keys())
-
-    # HLA gene allele lengths
-    HLA_lengths = {}
-    for HLA_gene, HLA_alleles in HLAs.items():
-        HLA_lengths[HLA_gene] = {}
-        for allele_name, seq in HLA_alleles.items():
-            HLA_lengths[HLA_gene][allele_name] = len(seq)
-
-    # Construct excluded alleles (Via default backbone data)
-    custom_allele_check = False
-    if len(default_allele_list) > 0:
-        custom_allele_check = True
-        HLAs_default = {}
-        read_HLA_alleles("Default-HLA/hla_backbone.fa", HLAs_default)
-        read_HLA_alleles("Default-HLA/hla_sequences.fa", HLAs_default)
-        #HLA_lengths_default = {}
-
-        for HLA_gene, HLA_alleles in HLAs_default.items():
-            for allele_name, seq in HLA_alleles.items():
-                if allele_name in default_allele_list:
-                    HLA_lengths[HLA_gene][allele_name] = len(seq)
-        
-        #for allele_name, seq in HLAs_default.items():
-         #   if allele_name in default_allele_list:
-          #      HLA_lengths[allele_name] = len(seq)
-            #if (allele_name in default_allele_list):
-            #    HLA_lengths_default[allele_name] = len(seq)
-    else:
-        HLAs_default = HLAs
-
-    # Read HLA variants, and link information
-    Vars, Var_list = read_HLA_vars("%s.snp" % base_fname, reference_type)
-    Links = read_HLA_links("%s.link" % base_fname)
-    Vars_default, Var_list_default, Links_default = {}, {}, {}
-    if len(default_allele_list) > 0:
-        Vars_default, Var_list_default = read_HLA_vars("Default-HLA/hla.snp", reference_type)
-        Links_default = read_HLA_links("Default-HLA/hla.link")
-    else:
-        Vars_default, Var_list_default = Vars, Var_list
-        Links_default = Links
-
-    # Test HLA typing
-    test_list = []
-    if simulation:
-        basic_test, pair_test = True, False
-        if daehwan_debug:
-            if "basic_test" in daehwan_debug:
-                basic_test, pair_test = True, False
-            else:
-                basic_test, pair_test = False, True
-
-        test_passed = {}
-        test_list = []
-        genes = list(set(hla_list) & set(HLA_names.keys()))
-        if basic_test:
-            if custom_allele_check:
-                for allele in default_allele_list:
-                    test_list.append([[allele]])
-            else:
-                for gene in genes:
-                    HLA_gene_alleles = HLA_names[gene]
-                    for HLA_name in HLA_gene_alleles:
-                        if HLA_name.find("BACKBONE") != -1:
-                            continue
-                        test_list.append([[HLA_name]])
-        if pair_test:
-            test_size = 500
-            allele_count = 2
-            if custom_allele_check:
-                if (default_allele_list) < allele_count:
-                    print >> sys.stderr, "# of default alleles (%d) is at least %d" % (len(defeault_allele_list), allele_count)
-                    sys.exit(1)
-                    
-                for test_i in range(1):
-                    random.shuffle(default_allele_list)
-                    test_pair = [default_allele_list[:allele_count]]
-                    test_list.append(test_pair)
-            else:
-                for test_i in range(test_size):
-                    test_pairs = []
-                    for gene in genes:
-                        HLA_gene_alleles = []
-
-                        for allele in HLA_names[gene]:
-                            if allele.find("BACKBONE") != -1:
-                                continue
-                            HLA_gene_alleles.append(allele)
-                        nums = [i for i in range(len(HLA_gene_alleles))]
-                        random.shuffle(nums)
-                        test_pairs.append(sorted([HLA_gene_alleles[nums[i]] for i in range(allele_count)]))
-                    test_list.append(test_pairs)
-
-        # DK - for debugging purposes
-        # test_list = [[["A*01:01:01:01"]], [["A*32:29"]]]
-        # test_list = [[["A*01:01:01:01", "A*03:01:01:01"]]]
-        # test_list = [[["A*02:01:21"]], [["A*03:01:01:01"]], [["A*03:01:01:04"]], [["A*02:521"]]]
-        for test_i in range(len(test_list)):
-            if "test_id" in daehwan_debug:
-                daehwan_test_ids = daehwan_debug["test_id"].split('-')
-                if str(test_i + 1) not in daehwan_test_ids:
-                    continue
-
-            print >> sys.stderr, "Test %d" % (test_i + 1), str(datetime.now())
-            test_HLA_list = test_list[test_i]
-            num_frag_list = simulate_reads(HLAs_default if custom_allele_check else HLAs,
-                                           test_HLA_list,
-                                           Vars,
-                                           Links,
-                                           simulate_interval,
-                                           perbase_errorrate)
-
-            assert len(num_frag_list) == len(test_HLA_list)
-            for i_ in range(len(test_HLA_list)):
-                test_HLA_names = test_HLA_list[i_]
-                num_frag_list_i = num_frag_list[i_]
-                assert len(num_frag_list_i) == len(test_HLA_names)
-                for j_ in range(len(test_HLA_names)):
-                    test_HLA_name = test_HLA_names[j_]
-                    if custom_allele_check:
-                        gene = test_HLA_name.split('*')[0]
-                        test_HLA_seq = HLAs_default[gene][test_HLA_name]
-                        seq_type = "partial" if test_HLA_name in partial_alleles else "full"
-                        print >> sys.stderr, "\t%s - %d bp (%s sequence, %d pairs)" % (test_HLA_name, len(test_HLA_seq), seq_type, num_frag_list_i[j_])
-                        continue
-                    gene = test_HLA_name.split('*')[0]
-                    test_HLA_seq = HLAs[gene][test_HLA_name]
-                    seq_type = "partial" if test_HLA_name in partial_alleles else "full"
-                    print >> sys.stderr, "\t%s - %d bp (%s sequence, %d pairs)" % (test_HLA_name, len(test_HLA_seq), seq_type, num_frag_list_i[j_])
-
-            if "single-end" in daehwan_debug:
-                read_fname = ["hla_input_1.fa"]
-            else:
-                read_fname = ["hla_input_1.fa", "hla_input_2.fa"]
-
-            fastq = False
-            tmp_test_passed = HLA_typing(ex_path,
-                                         simulation,
-                                         reference_type,
-                                         test_HLA_list,
-                                         partial,
-                                         partial_alleles,
-                                         refHLAs,
-                                         HLAs,                       
-                                         HLA_names,
-                                         HLA_lengths,
-                                         refHLA_loci,
-                                         Vars,
-                                         Var_list,
-                                         Links,
-                                         HLAs_default,
-                                         Vars_default,
-                                         Var_list_default,
-                                         Links_default,
-                                         exclude_allele_list,
-                                         aligners,
-                                         num_mismatch,
-                                         assembly,
-                                         concordant_assembly,
-                                         exonic_only,
-                                         fastq,
-                                         read_fname,
-                                         alignment_fname,
-                                         num_frag_list,
-                                         threads,
-                                         enable_coverage,
-                                         best_alleles,
-                                         verbose)
-
-            for aligner_type, passed in tmp_test_passed.items():
-                if aligner_type in test_passed:
-                    test_passed[aligner_type] += passed
-                else:
-                    test_passed[aligner_type] = passed
-
-                print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1)))
-
-
-        for aligner_type, passed in test_passed.items():
-            print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
-    
-    else: # With real reads or BAMs
-        print >> sys.stderr, "\t", ' '.join(hla_list)
-        fastq = True
-        HLA_typing(ex_path,
-                   simulation,
-                   reference_type,
-                   hla_list,
-                   partial,
-                   partial_alleles,
-                   refHLAs,
-                   HLAs,                       
-                   HLA_names,
-                   HLA_lengths,
-                   refHLA_loci,
-                   Vars,
-                   Var_list,
-                   Links,
-                   HLAs_default,
-                   Vars_default,
-                   Var_list_default,
-                   Links_default,
-                   exclude_allele_list,
-                   aligners,
-                   num_mismatch,
-                   assembly,
-                   concordant_assembly,
-                   exonic_only,
-                   fastq,
-                   read_fname,
-                   alignment_fname,
-                   [],
-                   threads,
-                   enable_coverage,
-                   best_alleles,
-                   verbose)
-
-        
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='test HLA genotyping')
-    parser.add_argument("--base",
-                        dest="base_fname",
-                        type=str,
-                        default="",
-                        help="base filename for backbone HLA sequence, HLA variants, and HLA linking info")
-    parser.add_argument("--default-list",
-                        dest = "default_allele_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of HLA alleles to be tested. Alleles are retrieved from default backbone data (all alleles included in backbone).")
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
-                        type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome (default: gene)")
-    parser.add_argument("--hla-list",
-                        dest="hla_list",
-                        type=str,
-                        default="A,B,C,DQA1,DQB1,DRB1",
-                        help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)")
-    parser.add_argument('--no-partial',
-                        dest='partial',
-                        action='store_false',
-                        help='Include partial alleles (e.g. A_nuc.fasta)')
-    parser.add_argument("--aligner-list",
-                        dest="aligners",
-                        type=str,
-                        default="hisat2.graph,hisat2.linear,bowtie2.linear",
-                        help="A comma-separated list of aligners (default: hisat2.graph,hisat2.linear,bowtie2.linear)")
-    parser.add_argument("--reads",
-                        dest="read_fname",
-                        type=str,
-                        default="",
-                        help="Fastq read file name")
-    parser.add_argument("--alignment",
-                        dest="alignment_fname",
-                        type=str,
-                        default="",
-                        help="BAM file name")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--simulate-interval",
-                        dest="simulate_interval",
-                        type=int,
-                        default=1,
-                        help="Reads simulated at every these base pairs (default: 1)")
-    parser.add_argument("--coverage",
-                        dest="coverage",
-                        action='store_true',
-                        help="Experimental purpose (assign reads based on coverage)")
-    parser.add_argument("--best-alleles",
-                        dest="best_alleles",
-                        action='store_true',
-                        help="")
-    parser.add_argument("--exclude-allele-list",
-                        dest="exclude_allele_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of alleles to be excluded. Enter a number N to randomly select N alleles for exclusion and N non-excluded alleles for testing (2N tested in total).")
-    parser.add_argument("--random-seed",
-                        dest="random_seed",
-                        type=int,
-                        default=0,
-                        help="A seeding number for randomness (default: 0)")
-    parser.add_argument("--num-mismatch",
-                        dest="num_mismatch",
-                        type=int,
-                        default=0,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 0)")
-    parser.add_argument("--perbase-errorrate",
-                        dest="perbase_errorrate",
-                        type=float,
-                        default=0.0,
-                        help="Per basepair error rate when simulating reads (default: 0.0)")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument('--verbose-level',
-                        dest='verbose_level',
-                        type=int,
-                        default=0,
-                        help='also print some statistics to stderr (default: 0)')
-    parser.add_argument("--debug",
-                        dest="debug",
-                        type=str,
-                        default="",
-                        help="e.g., test_id:10,read_id:10000,basic_test")
-    parser.add_argument("--assembly",
-                        dest="assembly",
-                        action="store_true",
-                        help="Perform assembly")
-    parser.add_argument("--no-concordant-assembly",
-                        dest="concordant_assembly",
-                        action="store_false",
-                        help="")
-    parser.add_argument("--exonic-only",
-                        dest="exonic_only",
-                        action="store_true",
-                        help="Consider exonic regions only")
-    parser.add_argument("--novel_allele_detection",
-                        dest="novel_allele_detection",
-                        action='store_true',
-                        help="Change test to detection of new alleles. Report sensitivity and specificity rate at the end.")
-
-
-    args = parser.parse_args()
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
-        sys.exit(1)
-    args.hla_list = args.hla_list.split(',')
-    if args.aligners == "":
-        print >> sys.stderr, "Error: --aligners must be non-empty."
-        sys.exit(1)    
-    args.aligners = args.aligners.split(',')
-    for i in range(len(args.aligners)):
-        args.aligners[i] = args.aligners[i].split('.')
-    if args.read_fname:
-        args.read_fname = args.read_fname.split(',')
-    else:
-        args.read_fname = []
-    if args.alignment_fname != "" and \
-            not os.path.exists(args.alignment_fname):
-        print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname
-        sys.exit(1)
-
-    if args.verbose and args.verbose_level == 0:
-        args.verbose_level = 1
-    
-    if len(args.default_allele_list) > 0:
-        args.default_allele_list = args.default_allele_list.split(',')
-        
-    if len(args.exclude_allele_list) > 0:
-        if args.exclude_allele_list.strip().isdigit():
-            num_alleles = int(args.exclude_allele_list)            
-            
-            if not os.path.exists("Default-HLA/hla_backbone.fa"):
-                curr_script = os.path.realpath(inspect.getsourcefile(test_HLA_genotyping))
-                ex_path = os.path.dirname(curr_script)
-                extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py")
-                extract_cmd = [extract_hla_script,
-                               "--reference-type", args.reference_type,
-                               "--hla-list", ','.join(args.hla_list),
-                               "--base", "Default-HLA/hla"]
-                if not args.partial:
-                    extract_cmd += ["--no-partial"]
-                extract_cmd += ["--inter-gap", "30",
-                                "--intra-gap", "50"]
-                if args.verbose_level >= 1:
-                    print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-                proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                proc.communicate()
-                if not os.path.exists("Default-HLA/hla_backbone.fa"):
-                    print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!"
-                    sys.exit(1)
-       
-            HLAs_default = {}
-            #read_HLA_alleles("Default-HLA/hla_backbone.fa", HLAs_default)
-            read_HLA_alleles("Default-HLA/hla_sequences.fa", HLAs_default)
-            
-            allele_names = list(HLAs_default['A'].keys())
-            random.seed(args.random_seed)
-            random.shuffle(allele_names)
-            args.exclude_allele_list = allele_names[0:num_alleles]
-            args.default_allele_list = allele_names[num_alleles:2*num_alleles]
-            
-            args.default_allele_list = args.default_allele_list + args.exclude_allele_list
-            
-            # DK - for debugging purposes
-            args.default_allele_list = args.exclude_allele_list
-        else:
-            args.exclude_allele_list = args.exclude_allele_list.split(',')
-
-        if args.num_mismatch == 0:
-            args.num_mismatch = 3
-        
-    debug = {}
-    if args.debug != "":
-        for item in args.debug.split(','):
-            if ':' in item:
-                key, value = item.split(':')
-                debug[key] = value
-            else:
-                debug[item] = 1
-
-    if not args.partial:
-        print >> sys.stderr, "Error: --no-partial is not supported!"
-        sys.exit(1)
-
-    random.seed(1)
-    test_HLA_genotyping(args.base_fname,
-                        args.reference_type,
-                        args.hla_list,
-                        args.partial,
-                        args.aligners,
-                        args.read_fname,
-                        args.alignment_fname,
-                        args.threads,
-                        args.simulate_interval,
-                        args.coverage,
-                        args.best_alleles,
-                        args.exclude_allele_list,
-                        args.default_allele_list,
-                        args.num_mismatch,
-                        args.perbase_errorrate,
-                        args.assembly,
-                        args.concordant_assembly,
-                        args.exonic_only,
-                        args.verbose_level,
-                        debug)
-
diff --git a/hisatgenotype.py b/hisatgenotype.py
new file mode 100755
index 0000000..0ca633d
--- /dev/null
+++ b/hisatgenotype.py
@@ -0,0 +1,489 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2017, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT-genotype.
+#
+# HISAT-genotype is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT-genotype is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, re, resource
+import inspect, random
+import math
+from datetime import datetime, date, time
+from argparse import ArgumentParser, FileType
+import hisatgenotype_typing_common as typing_common
+
+
+"""
+Align reads, and sort the alignments into a BAM file
+"""
+def align_reads(base_fname,
+                read_fnames,
+                fastq,
+                threads,
+                verbose):
+    aligner_cmd = ["hisat2",
+                   "--no-unal",
+                   "-p", str(threads),
+                   "--no-spliced-alignment",
+                   "--max-altstried", "64"]
+    # aligner_cmd += ["--mm"]
+    aligner_cmd += ["-x", "%s" % base_fname]
+
+    assert len(read_fnames) > 0
+    if not fastq:
+        aligner_cmd += ["-f"]
+    single = len(read_fnames) == 1
+    if single:
+        aligner_cmd += ["-U", read_fnames[0]]
+    else:
+        aligner_cmd += ["-1", read_fnames[0],
+                        "-2", read_fnames[1]]
+
+    out_base_fname = read_fnames[0].split('/')[-1].split('.')[0]
+
+    print >> sys.stderr, "%s Aligning %s to %s ..." % (str(datetime.now()), ' '.join(read_fnames), base_fname)
+    if verbose:
+        print >> sys.stderr, "\t%s" % (' '.join(aligner_cmd))
+
+    align_proc = subprocess.Popen(aligner_cmd,
+                                  stdout=subprocess.PIPE,
+                                  stderr=open("/dev/null", 'w'))
+
+    unsorted_bam_fname = "%s_unsorted.bam" % out_base_fname
+    sambam_cmd = ["samtools",
+                  "view",
+                  "-bS",
+                  "-"]
+    sambam_proc = subprocess.Popen(sambam_cmd,
+                                   stdin=align_proc.stdout,
+                                   stdout=open(unsorted_bam_fname, 'w'))
+    sambam_proc.communicate()
+
+    # Increase the maximum number of files that can be opened
+    resource.setrlimit(resource.RLIMIT_NOFILE, (10000, 10240))
+    
+    print >> sys.stderr, "%s Sorting %s ..." % (str(datetime.now()), unsorted_bam_fname)
+    bam_fname = "%s.bam" % out_base_fname
+    bamsort_cmd = ["samtools",
+                   "sort",
+                   "--threads", str(threads),
+                   "-m", "1536M",
+                   unsorted_bam_fname,
+                   "-o", bam_fname]    
+    if verbose:
+        print >> sys.stderr, "\t%s" % ' '.join(bamsort_cmd)
+    bamsort_proc = subprocess.call(bamsort_cmd)
+    os.remove(unsorted_bam_fname)
+
+    index_bam(bam_fname,
+              verbose)
+    
+    return bam_fname
+
+
+"""
+"""
+def index_bam(bam_fname,
+              verbose):
+    print >> sys.stderr, "%s Indexing %s ..." % (str(datetime.now()), bam_fname)
+    bamindex_cmd = ["samtools",
+                    "index",
+                    bam_fname]
+    if verbose:
+        print >> sys.stderr, "\t%s" % ' '.join(bamindex_cmd)
+    bamindex_proc = subprocess.call(bamindex_cmd)
+
+
+"""
+"""
+def extract_reads(bam_fname,
+                  chr,
+                  left,
+                  right,
+                  read_base_fname, # sample => sample.1.fq.gz and sample.2.fq.gz
+                  paired,
+                  fastq,
+                  verbose):
+    out_read_dname = "hisatgenotype_out"
+    if not os.path.exists(out_read_dname):
+        os.mkdir(out_read_dname)
+        
+    read_fnames = []
+    if paired:
+        read_fnames = [out_read_dname + "/" + read_base_fname + ".1.fq.gz",
+                       out_read_dname + "/" + read_base_fname + ".2.fq.gz"]
+    else:
+        read_fnames = [out_read_dname + "/" + read_base_fname + ".fq.gz"]
+
+    if paired:
+        gzip1_proc = subprocess.Popen(["gzip"],
+                                      stdin=subprocess.PIPE,
+                                      stdout=open(read_fnames[0], 'w'),
+                                      stderr=open("/dev/null", 'w'))
+
+        gzip2_proc = subprocess.Popen(["gzip"],
+                                      stdin=subprocess.PIPE,
+                                      stdout=open(read_fnames[1], 'w'),
+                                      stderr=open("/dev/null", 'w'))
+    else:
+        gzip1_proc = subprocess.Popen(["gzip"],
+                                      stdin=subprocess.PIPE,
+                                      stdout=open(read_fnames[0], 'w'),
+                                      stderr=open("/dev/null", 'w'))
+
+    def write_read(gzip_proc, read_name, seq, qual):
+        if fastq:
+            gzip_proc.stdin.write("@%s\n" % read_name)
+            gzip_proc.stdin.write("%s\n" % seq)
+            gzip_proc.stdin.write("+\n")
+            gzip_proc.stdin.write("%s\n" % qual)
+        else:
+            gzip_proc.stdin.write(">%s\n" % prev_read_name)
+            gzip_proc.stdin.write("%s\n" % seq)                    
+
+    bamview_cmd = ["samtools", "view", bam_fname, "%s:%d-%d" % (chr, left+1, right+1)]
+    if verbose:
+        print >> sys.stderr, "\t%s" % ' '.join(bamview_cmd)
+    bamview_proc = subprocess.Popen(bamview_cmd,
+                                    stdout=subprocess.PIPE,
+                                    stderr=open("/dev/null", 'w'))
+
+    sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
+    alignview_proc = subprocess.Popen(sort_read_cmd,
+                                      stdin=bamview_proc.stdout,
+                                      stdout=subprocess.PIPE,
+                                      stderr=open("/dev/null", 'w'))
+
+    prev_read_name, extract_read, read1, read2 = "", False, [], []
+    for line in alignview_proc.stdout:
+        if line.startswith('@'):
+            continue
+        line = line.strip()
+        cols = line.split()
+        read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11]
+        flag, pos = int(flag), int(pos)
+        strand = '-' if flag & 0x10 else '+'                   
+        AS, NH = "", ""
+        for i in range(11, len(cols)):
+            col = cols[i]
+            if col.startswith("AS"):
+                AS = int(col[5:])
+            elif col.startswith("NH"):
+                NH = int(col[5:])
+
+        # DK - check this out
+        simulation = True
+        if (not simulation and read_name != prev_read_name) or \
+           (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]):
+            if extract_read:
+                if paired:
+                    if len(read1) == 2 and len(read2) == 2:
+                        write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
+                        write_read(gzip2_proc, prev_read_name, read2[0], read2[1])
+                else:                    
+                    write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
+            prev_read_name, extract_read, read1, read2 = read_name, False, [], []
+
+        if NH == 1:
+            extract_read = True
+
+        if flag & 0x40 or not paired: # left read
+            if not read1:
+                if flag & 0x10: # reverse complement
+                    read1 = [typing_common.reverse_complement(read), qual[::-1]]
+                else:
+                    read1 = [read, qual]
+        else:
+            assert flag & 0x80 # right read
+            if flag & 0x10: # reverse complement
+                read2 = [typing_common.reverse_complement(read), qual[::-1]]
+            else:
+                read2 = [read, qual]
+
+    if extract_read:
+        if paired:
+            if len(read1) == 2 and len(read2) == 2:
+                write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
+                write_read(gzip2_proc, prev_read_name, read2[0], read2[1])
+        else:                    
+            write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
+
+    gzip1_proc.stdin.close()
+    if paired:
+        gzip2_proc.stdin.close()
+
+    return read_fnames
+
+
+"""
+"""
+def perform_genotyping(base_fname,
+                       database,
+                       locus_list,
+                       read_fnames,
+                       fastq,
+                       num_editdist,
+                       assembly,
+                       local_database,
+                       threads,
+                       verbose):
+    genotype_cmd = ["hisatgenotype_locus.py"]
+    if not local_database:
+        genotype_cmd += ["--genotype-genome", base_fname]
+    genotype_cmd += ["--base", database]
+    if len(locus_list) > 0:
+        genotype_cmd += ["--locus-list", ','.join(locus_list)]
+    genotype_cmd += ["-p", str(threads),
+                     "--num-editdist", str(num_editdist)]
+    if not fastq:
+        genotype_cmd += ["-f"]
+
+    if len(read_fnames) == 2: # paired
+        genotype_cmd += ["-1", read_fnames[0],
+                         "-2", read_fnames[1]]
+    elif len(read_fnames) == 1:
+        genotype_cmd += ["-U", read_fnames[0]] 
+    else:
+        assert len(read_fnames) == 0
+
+    if assembly:
+        genotype_cmd += ["--assembly"]
+
+    if verbose:
+        print >> sys.stderr, "\t%s" % ' '.join(genotype_cmd)
+    genotype_proc = subprocess.Popen(genotype_cmd)
+    genotype_proc.communicate()
+        
+
+"""
+"""
+def genotype(base_fname,
+             target_region_list,
+             fastq,
+             read_fnames,
+             alignment_fname,
+             threads,
+             num_editdist,
+             assembly,
+             local_database,
+             verbose,
+             debug):
+    # variants, backbone sequence, and other sequeces
+    genotype_fnames = ["%s.fa" % base_fname,
+                       "%s.locus" % base_fname,
+                       "%s.snp" % base_fname,
+                       "%s.index.snp" % base_fname,
+                       "%s.haplotype" % base_fname,
+                       "%s.link" % base_fname,
+                       "%s.coord" % base_fname,
+                       "%s.clnsig" % base_fname]
+    # hisat2 graph index files
+    genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
+    if not typing_common.check_files(genotype_fnames):
+        print >> sys.stderr, "Error: some of the following files are missing!"
+        for fname in genotype_fnames:
+            print >> sys.stderr, "\t%s" % fname
+        sys.exit(1)
+
+    # Read region alleles (names and sequences)
+    regions, region_loci = {}, {}
+    for line in open("%s.locus" % base_fname):
+        family, allele_name, chr, left, right = line.strip().split()[:5]
+        family = family.lower()
+        if len(target_region_list) > 0 and \
+           family not in target_region_list:
+            continue
+        
+        locus_name = allele_name.split('*')[0]
+        if family in target_region_list and \
+           len(target_region_list[family]) > 0 and \
+           locus_name not in target_region_list[family]:
+            continue
+        
+        left, right = int(left), int(right)
+        if family not in region_loci:
+            region_loci[family] = []
+        region_loci[family].append([locus_name, allele_name, chr, left, right])
+
+    if len(region_loci) <= 0:
+        print >> sys.stderr, "Warning: no region exists!"
+        sys.exit(1)
+
+    # Align reads, and sort the alignments into a BAM file
+    if len(read_fnames) > 0:
+        alignment_fname = align_reads(base_fname,
+                                      read_fnames,
+                                      fastq,
+                                      threads,
+                                      verbose)
+    assert alignment_fname != "" and os.path.exists(alignment_fname)
+    if not os.path.exists(alignment_fname + ".bai"):
+        index_bam(alignment_fname,
+                  verbose)
+    assert os.path.exists(alignment_fname + ".bai")
+
+    # Extract reads and perform genotyping
+    for family, loci in region_loci.items():
+        print >> sys.stderr, "Analyzing %s ..." % family.upper()
+        for locus_name, allele_name, chr, left, right in loci:
+            out_read_fname = "%s.%s" % (family, locus_name)
+            if verbose:
+                print >> sys.stderr, "\tExtracting reads beloning to %s-%s ..." % \
+                    (family, locus_name)
+
+            extracted_read_fnames = extract_reads(alignment_fname,
+                                                  chr,
+                                                  left,
+                                                  right,
+                                                  out_read_fname,
+                                                  len(read_fnames) != 1, # paired?
+                                                  fastq,
+                                                  verbose)
+
+            perform_genotyping(base_fname,
+                               family,
+                               [locus_name],
+                               extracted_read_fnames,
+                               fastq,
+                               num_editdist,
+                               assembly,
+                               local_database,
+                               threads,
+                               verbose)
+        print >> sys.stderr
+
+    
+                
+"""
+"""
+if __name__ == '__main__':
+    parser = ArgumentParser(
+        description='HISAT-genotype')
+    parser.add_argument("--base", "--base-name",
+                        dest="base_fname",
+                        type=str,
+                        default="genotype_genome",
+                        help="base filename for genotype genome")
+    parser.add_argument("--region-list",
+                        dest="region_list",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of regions (default: empty)")
+    parser.add_argument("-f", "--fasta",
+                        dest='fastq',
+                        action='store_false',
+                        help='FASTA file')    
+    parser.add_argument("-U",
+                        dest="read_fname_U",
+                        type=str,
+                        default="",
+                        help="filename for single-end reads")
+    parser.add_argument("-1",
+                        dest="read_fname_1",
+                        type=str,
+                        default="",
+                        help="filename for paired-end reads")
+    parser.add_argument("-2",
+                        dest="read_fname_2",
+                        type=str,
+                        default="",
+                        help="filename for paired-end reads")
+    parser.add_argument("--alignment-file",
+                        dest="alignment_fname",
+                        type=str,
+                        default="",
+                        help="Sorted BAM alignment file name")
+    parser.add_argument("-p", "--threads",
+                        dest="threads",
+                        type=int,
+                        default=1,
+                        help="Number of threads")
+    parser.add_argument("--num-editdist",
+                        dest="num_editdist",
+                        type=int,
+                        default=2,
+                        help="Maximum number of mismatches per read alignment to be considered (default: 2)")
+    parser.add_argument('--assembly',
+                        dest='assembly',
+                        action='store_true',
+                        help='Perform assembly')
+    parser.add_argument('--local-database',
+                        dest='local_database',
+                        action='store_true',
+                        help='Use local database')    
+    parser.add_argument('-v', '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        help='also print some statistics to stderr')
+    parser.add_argument("--debug",
+                        dest="debug",
+                        type=str,
+                        default="",
+                        help="e.g., test_id:10,read_id:10000,basic_test")
+
+    args = parser.parse_args()
+    region_list = {}
+    if args.region_list != "":
+        for region in args.region_list.split(','):
+            region = region.split('.')
+            if len(region) < 1 or len(region) > 2:
+                print >> sys.stderr, "Error: --region-list is incorrectly formatted."
+                sys.exit(1)
+                
+            family = region[0].lower()
+            if len(region) == 2:
+                locus_name = region[1].upper()
+            if family not in region_list:
+                region_list[family] = set()
+            if len(region) == 2:
+                region_list[family].add(locus_name)
+
+    read_fnames = []
+    if args.alignment_fname != "":
+        if not os.path.exists(args.alignment_fname):
+            print >> sys.stderr, "Error: %s does not exist." % args.alignment_fname
+    elif args.read_fname_U != "":
+        read_fnames = [args.read_fname_U]
+    else:
+        if args.read_fname_1 == "" or args.read_fname_2 == "":
+            print >> sys.stderr, "Error: please specify read file names correctly: -U or -1 and -2"
+            sys.exit(1)
+        read_fnames = [args.read_fname_1, args.read_fname_2]
+
+    debug = {}
+    if args.debug != "":
+        for item in args.debug.split(','):
+            if ':' in item:
+                key, value = item.split(':')
+                debug[key] = value
+            else:
+                debug[item] = 1
+
+    genotype(args.base_fname,
+             region_list,
+             args.fastq,
+             read_fnames,
+             args.alignment_fname,
+             args.threads,
+             args.num_editdist,
+             args.assembly,
+             args.local_database,
+             args.verbose,
+             debug)
+
+
diff --git a/hisat2_build_genotype_genome.py b/hisatgenotype_build_genome.py
similarity index 58%
rename from hisat2_build_genotype_genome.py
rename to hisatgenotype_build_genome.py
index 4ca9b27..31a8449 100755
--- a/hisat2_build_genotype_genome.py
+++ b/hisatgenotype_build_genome.py
@@ -23,86 +23,7 @@
 import os, sys, subprocess, re
 import inspect
 from argparse import ArgumentParser, FileType
-
-
-"""
-"""
-def read_genome(genome_file):
-    chr_dic, chr_names, chr_full_names = {}, [], []
-    chr_name, chr_full_name, sequence = "", "", ""
-    for line in genome_file:
-        if line.startswith(">"):
-            if chr_name and sequence:
-                chr_dic[chr_name] = sequence
-                chr_names.append(chr_name)
-            chr_full_name = line.strip()[1:]
-            chr_name = line.strip().split()[0][1:]
-            chr_full_names.append(chr_full_name)
-            sequence = ""
-        else:
-            sequence += line.strip()
-    if chr_name and sequence:
-        chr_dic[chr_name] = sequence
-        chr_names.append(chr_name)
-        chr_full_names.append(chr_full_name)
-    return chr_dic, chr_names, chr_full_names
-
-
-"""
-"""
-def read_sequences(fname):
-    allele_seqs = {}
-    allele_name, sequence = "", ""
-    for line in open(fname):
-        if line.startswith(">"):
-            if allele_name != "" and allele_name not in allele_seqs:
-                allele_seqs[allele_name] = sequence
-            allele_name = line.strip()[1:]
-            sequence = ""
-        else:
-            sequence += line.strip()
-    if allele_name != "" and allele_name not in allele_seqs:
-        allele_seqs[allele_name] = sequence
-    return allele_seqs
-
-
-"""
-"""
-def read_variants(fname):
-    allele_vars = {}
-    for line in open(fname):
-        var_id, type, allele_name, left, data = line.strip().split()
-        left = int(left)
-        if type == "deletion":
-            data = int(data)
-        if allele_name not in allele_vars:
-            allele_vars[allele_name] = []
-        allele_vars[allele_name].append([left, type, data, var_id])
-    return allele_vars
-
-
-"""
-"""
-def read_haplotypes(fname):
-    allele_haplotypes = {}
-    for line in open(fname):
-        haplotype_id, allele_name, left, right, vars = line.strip().split()
-        vars = vars.split(',')
-        left, right = int(left), int(right)
-        if allele_name not in allele_haplotypes:
-            allele_haplotypes[allele_name] = []
-        allele_haplotypes[allele_name].append([left, right, vars])
-    return allele_haplotypes
-
-
-"""
-"""
-def read_links(fname):
-    links = []
-    for line in open(fname):
-        var_id, allele_names = line.strip().split('\t')
-        links.append([var_id, allele_names])
-    return links
+import hisatgenotype_typing_common as typing_common, hisatgenotype_gene_typing as gene_typing
 
 
 """
@@ -116,64 +37,26 @@ def read_clnsig(fname):
 
 
 """
-Compare two variants
-"""
-def compare_vars(a, b):
-    a_pos, a_type, a_data = a[:3]
-    b_pos, b_type, b_data = b[:3]
-
-    if a_pos != b_pos:
-        return a_pos - b_pos
-    if a_type != b_type:
-         if a_type == 'I':
-             return -1
-         elif b_type == 'I':
-             return 1
-         if a_type == 'S':
-             return -1
-         else:
-             return 1
-    if a_data < b_data:
-        return -1
-    elif a_data > b_data:
-        return 1
-    else:
-        return 0
-
-
-"""
 """
-def build_genotype_genome(reference,
-                          base_fname,                          
-                          partial,
+def build_genotype_genome(base_fname,                          
                           inter_gap,
                           intra_gap,
                           threads,
+                          database_list,
                           use_clinvar,
+                          use_commonvar,
                           verbose):    
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(build_genotype_genome))
-    ex_path = os.path.dirname(curr_script)
-
-    def check_files(fnames):
-        for fname in fnames:
-            if not os.path.exists(fname):
-                return False
-        return True
-
     # Download HISAT2 index
     HISAT2_fnames = ["grch38",
                      "genome.fa",
                      "genome.fa.fai"]
-    if not check_files(HISAT2_fnames):
-        os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz")
-        hisat2_inspect = os.path.join(ex_path, "hisat2-inspect")
-        os.system("%s grch38/genome > genome.fa" % hisat2_inspect)
-        os.system("samtools faidx genome.fa")
+    if not typing_common.check_files(HISAT2_fnames):
+        typing_common.download_genome_and_index()
 
     # Load genomic sequences
-    chr_dic, chr_names, chr_full_names = read_genome(open(reference))
+    chr_dic, chr_names, chr_full_names = typing_common.read_genome(open("genome.fa"))
 
+    genotype_vars, genotype_haplotypes, genotype_clnsig = {}, {}, {}
     if use_clinvar:
         # Extract variants from the ClinVar database
         CLINVAR_fnames = ["clinvar.vcf.gz",
@@ -181,86 +64,95 @@ def build_genotype_genome(reference,
                           "clinvar.haplotype",
                           "clinvar.clnsig"]
 
-        if not check_files(CLINVAR_fnames):
+        if not typing_common.check_files(CLINVAR_fnames):
             if not os.path.exists("clinvar.vcf.gz"):
-                os.system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz")
+                os.system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive/2017/clinvar_20170404.vcf.gz")
             assert os.path.exists("clinvar.vcf.gz")
 
-            extract_clinvar_script = os.path.join(ex_path, "hisat2_extract_snps_haplotypes_VCF.py")
-            extract_cmd = [extract_clinvar_script]
+            extract_cmd = ["hisat2_extract_snps_haplotypes_VCF.py"]
             extract_cmd += ["--inter-gap", str(inter_gap),
                             "--intra-gap", str(intra_gap),
                             "--genotype-vcf", "clinvar.vcf.gz",
-                            reference, "/dev/null", "clinvar"]
+                            "genome.fa", "/dev/null", "clinvar"]
             if verbose:
                 print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
             proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
             proc.communicate()
-            if not check_files(CLINVAR_fnames):
+            if not typing_common.check_files(CLINVAR_fnames):
                 print >> sys.stderr, "Error: extract variants from clinvar failed!"
                 sys.exit(1)
 
         # Read variants to be genotyped
-        genotype_vars = read_variants("clinvar.snp")
+        genotype_vars = typing_common.read_variants("clinvar.snp")
 
         # Read haplotypes
-        genotype_haplotypes = read_haplotypes("clinvar.haplotype")
+        genotype_haplotypes = typing_common.read_haplotypes("clinvar.haplotype")
 
         # Read information about clinical significance
-        genotype_clnsig = read_clnsig("clinvar.clnsig")
-    else:
-        genotype_vars, genotype_haplotypes, genotype_clnsig = {}, {}, {}
+        genotype_clnsig = typing_common.read_clnsig("clinvar.clnsig")
+
+    if use_commonvar:
+        # Extract variants from dbSNP database
+        commonvar_fbase = "snp144Common"
+        commonvar_fnames = ["%s.snp" % commonvar_fbase,
+                            "%s.haplotype" % commonvar_fbase]
+        if not typing_common.check_files(commonvar_fnames):
+            if not os.path.exists("%s.txt.gz" % commonvar_fbase):
+                os.system("wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/%s.txt.gz" % commonvar_fbase)
+            assert os.path.exists("%s.txt.gz" % commonvar_fbase)
+            os.system("gzip -cd %s.txt.gz | awk 'BEGIN{OFS=\"\t\"} {if($2 ~ /^chr/) {$2 = substr($2, 4)}; if($2 == \"M\") {$2 = \"MT\"} print}' > %s.txt" % (commonvar_fbase, commonvar_fbase))
+            extract_cmd = ["hisat2_extract_snps_haplotypes_UCSC.py",
+                           "--inter-gap", str(inter_gap),
+                           "--intra-gap", str(intra_gap),
+                           "genome.fa", "%s.txt" % commonvar_fbase, commonvar_fbase]
+            if verbose:
+                print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
+            proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
+            proc.communicate()
+            if not typing_common.check_files(commonvar_fnames):
+                print >> sys.stderr, "Error: extract variants from clinvar failed!"
+                sys.exit(1)
+
+        # Read variants to be genotyped
+        genotype_vars = typing_common.read_variants("%s.snp" % commonvar_fbase)
+
+        # Read haplotypes
+        genotype_haplotypes = typing_common.read_haplotypes("%s.haplotype" % commonvar_fbase)
 
     # Genes to be genotyped
     genotype_genes = {}
 
-    # Clone a git repository, IMGTHLA
-    if not os.path.exists("IMGTHLA"):
-        os.system("git clone https://github.com/jrob119/IMGTHLA.git")
-
-    # Extract HLA variants, backbone sequence, and other sequeces
-    HLA_fnames = ["hla_backbone.fa",
-                  "hla.ref",
-                  "hla.snp",
-                  "hla.haplotype",
-                  "hla.link"]
-
-    if not check_files(HLA_fnames):
-        extract_hla_script = os.path.join(ex_path, "hisat2_extract_HLA_vars.py")
-        extract_cmd = [extract_hla_script]
-        if partial:
-            extract_cmd += ["--partial"]
-        extract_cmd += ["--inter-gap", str(inter_gap),
-                        "--intra-gap", str(intra_gap)]
-        if verbose:
-            print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-        proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()
-        if not check_files(HLA_fnames):
-            print >> sys.stderr, "Error: extract_HLA_vars failed!"
-            sys.exit(1)
-
-    # Read HLA genes
-    if os.path.exists("hla.ref"):
-        for line in open("hla.ref"):
-            HLA_name, chr, left, right, length, exon_str = line.strip().split()
+    # Read genes or genomics regions
+    for database_name in database_list:
+        # Extract HLA variants, backbone sequence, and other sequeces
+        typing_common.extract_database_if_not_exists(database_name,
+                                                     [],            # locus_list
+                                                     inter_gap,
+                                                     intra_gap,
+                                                     True,          # partial?
+                                                     verbose)
+        locus_fname = "%s.locus" % database_name
+        assert os.path.exists(locus_fname)
+        for line in open(locus_fname):
+            HLA_name, chr, left, right, length, exon_str, strand = line.strip().split()
             left, right = int(left), int(right)
             length = int(length)
             if chr not in chr_names:
                 continue
             if chr not in genotype_genes:
                 genotype_genes[chr] = []
-            genotype_genes[chr].append([left, right, length, HLA_name, "hla"])
+            genotype_genes[chr].append([left, right, length, HLA_name, database_name, exon_str, strand])
 
     # Write genotype genome
     var_num, haplotype_num = 0, 0
     genome_out_file = open("%s.fa" % base_fname, 'w')
-    gene_out_file = open("%s.gene" % base_fname, 'w')
+    locus_out_file = open("%s.locus" % base_fname, 'w')
     var_out_file = open("%s.snp" % base_fname, 'w')
+    index_var_out_file = open("%s.index.snp" % base_fname, 'w')
     haplotype_out_file = open("%s.haplotype" % base_fname, 'w')
     link_out_file = open("%s.link" % base_fname, 'w')
     coord_out_file = open("%s.coord" % base_fname, 'w')
-    clnsig_out_file = open("%s.clnsig" % base_fname, 'w')    
+    clnsig_out_file = open("%s.clnsig" % base_fname, 'w')
     for c in range(len(chr_names)):
         chr = chr_names[c]
         chr_full_name = chr_full_names[c]
@@ -298,15 +190,17 @@ def build_genotype_genome(reference,
                 if var_right > right:
                     break
                 if var_right >= left:
+                    chr_genotype_vari += 1
                     continue
 
-                print >> var_out_file, "%s\t%s\t%s\t%d\t%s" % \
-                    (var_id, var_type, chr, var_left + off, var_data)
+                out_str = "%s\t%s\t%s\t%d\t%s" % (var_id, var_type, chr, var_left + off, var_data)
+                print >> var_out_file, out_str
+                print >> index_var_out_file, out_str
 
-                assert var_id in genotype_clnsig
-                var_gene, clnsig = genotype_clnsig[var_id]
-                print >> clnsig_out_file, "%s\t%s\t%s" % \
-                    (var_id, var_gene, clnsig)
+                if var_id in genotype_clnsig:
+                    var_gene, clnsig = genotype_clnsig[var_id]
+                    print >> clnsig_out_file, "%s\t%s\t%s" % \
+                        (var_id, var_gene, clnsig)
                 
                 chr_genotype_vari += 1
 
@@ -316,6 +210,7 @@ def build_genotype_genome(reference,
                 if ht_right > right:
                     break
                 if ht_right >= left:
+                    chr_genotype_hti += 1
                     continue
 
                 print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \
@@ -330,28 +225,33 @@ def build_genotype_genome(reference,
         off = 0
         prev_right = 0
         for gene in chr_genes:
-            left, right, length, name, family = gene
+            left, right, length, name, family, exon_str, strand = gene
 
             chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num)
 
             # Read HLA backbone sequences
-            allele_seqs = read_sequences("%s_backbone.fa" % family)
+            allele_seqs = typing_common.read_allele_sequences("%s_backbone.fa" % family)
 
             # Read HLA variants
-            allele_vars = read_variants("%s.snp" % family)
-
+            allele_vars = typing_common.read_variants("%s.snp" % family)
+            allele_index_vars = typing_common.read_variants("%s.index.snp" % family)
+                
             # Read HLA haplotypes
-            allele_haplotypes = read_haplotypes("%s.haplotype" % family)
+            allele_haplotypes = typing_common.read_haplotypes("%s.haplotype" % family)
 
             # Read HLA link information between haplotypes and variants
-            links = read_links("%s.link" % family)
+            links = typing_common.read_links("%s.link" % family)
 
             if name not in allele_seqs or \
                     name not in allele_vars or \
                     name not in allele_haplotypes:
                 continue
             allele_seq = allele_seqs[name]
-            vars = allele_vars[name]
+            vars, index_vars = allele_vars[name], allele_index_vars[name]
+            index_var_ids = set()
+            for _, _, _, var_id in index_vars:
+                index_var_ids.add(var_id)
+
             haplotypes = allele_haplotypes[name]
             assert length == len(allele_seq)
             assert left < chr_len and right < chr_len
@@ -367,35 +267,36 @@ def build_genotype_genome(reference,
             assert prev_length <= length
 
             if prev_right < left:
-                # print >> coord_out_file, "%d\t%d\t%d" % \
-                #    (len(out_chr_seq), prev_right, left - prev_right)
                 out_chr_seq += chr_seq[prev_right:left]
 
             # Output gene (genotype_genome.gene)
-            print >> gene_out_file, "%s\t%s\t%s\t%d\t%d" % \
-                (family.upper(), name, chr, len(out_chr_seq), len(out_chr_seq) + length - 1)
+            print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \
+                (family.upper(), name, chr, len(out_chr_seq), len(out_chr_seq) + length - 1, exon_str, strand)
 
             # Output coord (genotype_genome.coord)
             print >> coord_out_file, "%s\t%d\t%d\t%d" % \
                 (chr, len(out_chr_seq), left, right - left + 1)
             out_chr_seq += allele_seq
 
-            # Output variants (genotype_genome.snp)
+            # Output variants (genotype_genome.snp and genotype_genome.index.snp)
             for var in vars:
                 var_left, var_type, var_data, var_id = var
                 new_var_id = "hv%d" % var_num
                 varID2htID[var_id] = new_var_id
                 new_var_left = var_left + left + off
-                assert var_type in ["single", "deletion"]
+                assert var_type in ["single", "deletion", "insertion"]
                 assert new_var_left < len(out_chr_seq)
                 if var_type == "single":                    
                     assert out_chr_seq[new_var_left] != var_data
-                else:
-                    assert var_type == "deletion"
+                elif var_type == "deletion":
                     assert new_var_left + var_data <= len(out_chr_seq)
-                    
-                print >> var_out_file, "%s\t%s\t%s\t%d\t%s" % \
-                    (new_var_id, var_type, chr, new_var_left, var_data)
+                else:
+                    assert var_type == "insertion"
+
+                out_str = "%s\t%s\t%s\t%d\t%s" % (new_var_id, var_type, chr, new_var_left, var_data)
+                print >> var_out_file, out_str
+                if var_id in index_var_ids:
+                    print >> index_var_out_file, out_str
                 var_num += 1
                 
             # Output haplotypes (genotype_genome.haplotype)
@@ -427,7 +328,7 @@ def build_genotype_genome(reference,
             prev_right = right + 1
 
         # Write the rest of the Vars
-        chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(10000000000, 10000000000, chr_genotype_vari, chr_genotype_hti, haplotype_num)            
+        chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(sys.maxint, sys.maxint, chr_genotype_vari, chr_genotype_hti, haplotype_num)            
             
         print >> coord_out_file, "%s\t%d\t%d\t%d" % \
             (chr, len(out_chr_seq), prev_right, len(chr_seq) - prev_right)
@@ -442,49 +343,70 @@ def build_genotype_genome(reference,
             print >> genome_out_file, out_chr_seq[s:s+line_width]
 
     genome_out_file.close()
-    gene_out_file.close()
+    locus_out_file.close()
     var_out_file.close()
+    index_var_out_file.close()
     haplotype_out_file.close()
     link_out_file.close()
     coord_out_file.close()
     clnsig_out_file.close()
 
-    # Build HISAT2 graph indexes based on the above information
+    partial_out_file = open("%s.partial" % base_fname, 'w')
+    for database in database_list:
+        for line in open("%s.partial" % database):
+            allele_name = line.strip()
+            print >> partial_out_file, "%s\t%s" % (database.upper(), allele_name)
+    partial_out_file.close()
+
+    # Index genotype_genome.fa
+    index_cmd = ["samtools", "faidx", "%s.fa" % base_fname]
+    subprocess.call(index_cmd)
+
+    # Build HISAT-genotype graph indexes based on the above information
     hisat2_index_fnames = ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
-    hisat2_build = os.path.join(ex_path, "hisat2-build")
-    build_cmd = [hisat2_build,
+    build_cmd = ["hisat2-build",
                  "-p", str(threads),
-                 "--snp", "%s.snp" % base_fname,
+                 "--snp", "%s.index.snp" % base_fname,
                  "--haplotype", "%s.haplotype" % base_fname,
                  "%s.fa" % base_fname,
                  "%s" % base_fname]
     if verbose:
         print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
-    proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-    proc.communicate()        
-    if not check_files(hisat2_index_fnames):
+        
+    subprocess.call(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
+    if not typing_common.check_files(hisat2_index_fnames):
         print >> sys.stderr, "Error: indexing failed!  Perhaps, you may have forgotten to build hisat2 executables?"
         sys.exit(1)
 
-
         
 """
 """
 if __name__ == '__main__':
     parser = ArgumentParser(
-        description="Extract HLA variants from HLA multiple sequence alignments")
-    parser.add_argument("reference",
-                        nargs='?',
+        description="Build genotype genome")
+    parser.add_argument("--base", "--base-fname",
+                        dest="base_fname",
                         type=str,
-                        help="Reference genome")
-    parser.add_argument("base_fname",
-                        nargs='?',
+                        default="genotype_genome",
+                        help="base filename for genotype genome (default: genotype_genome)")
+    parser.add_argument("-p", "--threads",
+                        dest="threads",
+                        type=int,
+                        default=1,
+                        help="Number of threads")
+    parser.add_argument("--database-list",
+                        dest="database_list",
                         type=str,
-                        help="base filename for genotype genome")
-    parser.add_argument('--partial',
-                        dest='partial',
-                        action='store_true',
-                        help='Include partial alleles (e.g. A_nuc.fasta)')
+                        default="",
+                        help="A comma-separated list of databases (default: hla,codis,cyp)")
+    parser.add_argument("--commonvar",
+                        dest="use_commonvar",
+                        action="store_true",
+                        help="Include common variants from dbSNP")
+    parser.add_argument("--clinvar",
+                        dest="use_clinvar",
+                        action="store_true",
+                        help="Include variants from ClinVar database")
     parser.add_argument("--inter-gap",
                         dest="inter_gap",
                         type=int,
@@ -494,33 +416,33 @@ if __name__ == '__main__':
                         dest="intra_gap",
                         type=int,
                         default=50,
-                        help="Break a haplotype into several haplotypes")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads") 
-    parser.add_argument("--no-clinvar",
-                        dest="use_clinvar",
-                        action="store_false",
-                        help="")
+                        help="Break a haplotype into several haplotypes")    
     parser.add_argument("-v", "--verbose",
                         dest="verbose",
                         action="store_true",
                         help="also print some statistics to stderr")
 
     args = parser.parse_args()
-    if not args.reference or not args.base_fname:
-        parser.print_help()
-        sys.exit(1)
     if args.inter_gap > args.intra_gap:
         print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap)
         sys.exit(1)
-    build_genotype_genome(args.reference,
-                          args.base_fname,
-                          args.partial,
+        
+    if args.database_list == "":
+        database_list = []
+    else:
+        database_list = args.database_list.split(',')
+
+    if args.use_clinvar and args.use_commonvar:
+        print >> sys.stderr, "Error: both --clinvar and --commonvar cannot be used together."
+        sys.exit(1)
+        
+        
+    build_genotype_genome(args.base_fname,
                           args.inter_gap,
                           args.intra_gap,
                           args.threads,
+                          database_list,
                           args.use_clinvar,
+                          args.use_commonvar,
                           args.verbose)
+    
diff --git a/hisatgenotype_extract_reads.py b/hisatgenotype_extract_reads.py
new file mode 100755
index 0000000..8a0bc17
--- /dev/null
+++ b/hisatgenotype_extract_reads.py
@@ -0,0 +1,430 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2017, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT-genotype.
+#
+# HISAT-genotype is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT-genotype is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, re
+import inspect
+import random
+import glob
+from argparse import ArgumentParser, FileType
+import hisatgenotype_typing_common as typing_common
+
+
+"""
+"""
+def parallel_work(pids, work, fq_fname_base, fq_fname, fq_fname2,
+                  ranges, simulation, verbose):
+    # Run work(...) in a forked child process and record its PID in pids.
+    # pids holds one entry per allowed concurrent child: 0 marks a free slot,
+    # any other value is the PID of the child occupying it.  When no slot is
+    # free, the call blocks until one of the recorded children has exited.
+    # The remaining arguments are handed to work() unchanged.
+
+    # Look for a slot that is not occupied by a running child
+    slot = -1
+    for idx, pid in enumerate(pids):
+        if pid == 0:
+            slot = idx
+            break
+
+    # Every slot is busy: reap children until one of the tracked ones is done
+    while slot == -1:
+        done_pid, _ = os.waitpid(0, 0)
+        for idx, pid in enumerate(pids):
+            if pid == done_pid:
+                slot = idx
+                pids[idx] = 0
+                break
+
+    new_pid = os.fork()
+    if new_pid != 0:
+        # Parent: remember which child now occupies the slot
+        pids[slot] = new_pid
+        return
+
+    # Child: do the job, then exit right away via os._exit
+    work(fq_fname_base, fq_fname, fq_fname2,
+         ranges, simulation, verbose)
+    os._exit(os.EX_OK)
+
+        
+"""
+"""
+def wait_pids(pids):
+    # Block until every child whose PID (> 0) is recorded in pids has exited
+    for child_pid in [p for p in pids if p > 0]:
+        os.waitpid(child_pid, 0)
+            
+
+"""
+"""
+def extract_reads(base_fname, database_list, read_dir, out_dir, suffix,
+                  read_fname, fastq, paired, simulation, threads,
+                  max_sample, job_range, verbose):
+    # Align reads to the genotype genome with hisat2 and write the reads that
+    # have a unique alignment (NH == 1) starting inside a locus listed in
+    # <base_fname>.locus to gzip-compressed read files, one file (two if
+    # paired) per sample and database.
+    # database_list is a set of lower-case database names.  When it is not
+    # empty, only loci of those databases are used; the database of every
+    # locus in use is added to it.  Samples are the files in read_fname or, if
+    # that is empty, the files in read_dir that end in suffix.  At most
+    # max_sample files are looked at; if job_range[1] > 1, only files whose
+    # index modulo job_range[1] equals job_range[0] are processed.
+    genotype_fnames = ["%s.fa" % base_fname,
+                       "%s.locus" % base_fname,
+                       "%s.snp" % base_fname,
+                       "%s.haplotype" % base_fname,
+                       "%s.link" % base_fname,
+                       "%s.coord" % base_fname,
+                       "%s.clnsig" % base_fname]
+    # hisat2 graph index files
+    genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
+    if not typing_common.check_files(genotype_fnames):
+        print >> sys.stderr, "Error: %s related files do not exist as follows:" % base_fname
+        for fname in genotype_fnames:
+            print >> sys.stderr, "\t%s" % fname
+        sys.exit(1)
+
+    filter_region = len(database_list) > 0
+    ranges = []
+    regions, region_loci = {}, {}
+    for line in open("%s.locus" % base_fname):
+        # Columns: database (family), allele name, chromosome, left, right
+        family, allele_name, chr, left, right = line.strip().split()[:5]
+        if filter_region and family.lower() not in database_list:
+            continue
+        # A region is named "<family>-<allele name up to the first '*'>"
+        region_name = "%s-%s" % (family, allele_name.split('*')[0])
+        assert region_name not in regions
+        regions[region_name] = allele_name
+        left, right = int(left), int(right)
+        if chr not in region_loci:
+            region_loci[chr] = {}
+        region_loci[chr][region_name] = [allele_name, chr, left, right]
+        database_list.add(family.lower())
+
+    if out_dir != "" and not os.path.exists(out_dir):
+        os.mkdir(out_dir)
+
+    # Extract reads
+    if len(read_fname) > 0:
+        if paired:
+            fq_fnames = [read_fname[0]]
+            fq_fnames2 = [read_fname[1]]
+        else:
+            fq_fnames = read_fname
+    else:
+        if paired:
+            fq_fnames = glob.glob("%s/*.1.%s" % (read_dir, suffix))
+        else:
+            fq_fnames = glob.glob("%s/*.%s" % (read_dir, suffix))
+    count = 0
+    pids = [0 for i in range(threads)]
+    for file_i in range(len(fq_fnames)):
+        if file_i >= max_sample:
+            break
+        fq_fname = fq_fnames[file_i]
+        if job_range[1] > 1:
+            if job_range[0] != (file_i % job_range[1]):
+                continue
+
+        fq_fname_base = fq_fname.split('/')[-1]
+        one_suffix = ".1." + suffix
+        if fq_fname_base.find(one_suffix) != -1:
+            fq_fname_base = fq_fname_base[:fq_fname_base.find(one_suffix)]
+        else:
+            fq_fname_base = fq_fname_base.split('.')[0]
+
+        if paired:
+            if read_dir == "":
+                fq_fname2 = fq_fnames2[file_i]
+            else:
+                fq_fname2 = "%s/%s.2.%s" % (read_dir, fq_fname_base, suffix)
+            if not os.path.exists(fq_fname2):
+                print >> sys.stderr, "%s does not exist." % fq_fname2
+                continue
+        else:
+            fq_fname2 = ""
+
+        # NOTE(review): the files tested here carry no database name, unlike
+        # the files written by work() below, so this skip looks like it never
+        # triggers for output produced by this function - confirm the intent
+        if paired:
+            if out_dir != "":
+                if os.path.exists("%s/%s.extracted.1.fq.gz" % (out_dir, fq_fname_base)):
+                    continue
+        else:
+            if out_dir != "":
+                if os.path.exists("%s/%s.extracted.fq.gz" % (out_dir, fq_fname_base)):
+                    continue
+        count += 1
+
+        print >> sys.stderr, "\t%d: Extracting reads from %s" % (count, fq_fname_base)
+        def work(fq_fname_base,
+                 fq_fname,
+                 fq_fname2,
+                 ranges,
+                 simulation,
+                 verbose):
+            aligner_cmd = ["hisat2"]
+            if not fastq:
+                aligner_cmd += ["-f"]
+            aligner_cmd += ["-x", base_fname]
+            aligner_cmd += ["--no-spliced-alignment",
+                            "--max-altstried", "64"]
+            if paired:
+                aligner_cmd += ["-1", fq_fname,
+                                "-2", fq_fname2]
+            else:
+                aligner_cmd += ["-U", fq_fname]
+            if verbose:
+                print >> sys.stderr, "\t\trunning", ' '.join(aligner_cmd)
+            align_proc = subprocess.Popen(aligner_cmd,
+                                          stdout=subprocess.PIPE,
+                                          stderr=open("/dev/null", 'w'))
+
+            # One gzip process (two for paired-end reads) per database
+            gzip_dic = {}
+            out_dir_slash = out_dir
+            if out_dir != "":
+                out_dir_slash += "/"
+            for database in database_list:
+                if paired:
+                    # LP6005041-DNA_A01.extracted.1.fq.gz
+                    gzip1_proc = subprocess.Popen(["gzip"],
+                                                  stdin=subprocess.PIPE,
+                                                  stdout=open("%s%s.%s.extracted.1.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'),
+                                                  stderr=open("/dev/null", 'w'))
+
+                    # LP6005041-DNA_A01.extracted.2.fq.gz
+                    gzip2_proc = subprocess.Popen(["gzip"],
+                                                  stdin=subprocess.PIPE,
+                                                  stdout=open("%s%s.%s.extracted.2.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'),
+                                                  stderr=open("/dev/null", 'w'))
+                else:
+                    # LP6005041-DNA_A01.extracted.fq.gz
+                    gzip1_proc = subprocess.Popen(["gzip"],
+                                                  stdin=subprocess.PIPE,
+                                                  stdout=open("%s%s.%s.extracted.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'),
+                                                  stderr=open("/dev/null", 'w'))
+                gzip_dic[database] = [gzip1_proc, gzip2_proc if paired else None]
+
+            def write_read(gzip_proc, read_name, seq, qual):
+                if fastq:
+                    gzip_proc.stdin.write("@%s\n" % read_name)
+                    gzip_proc.stdin.write("%s\n" % seq)
+                    gzip_proc.stdin.write("+\n")
+                    gzip_proc.stdin.write("%s\n" % qual)
+                else:
+                    gzip_proc.stdin.write(">%s\n" % read_name)
+                    gzip_proc.stdin.write("%s\n" % seq)
+
+            prev_read_name, extract_read, read1, read2 = "", False, [], []
+            for line in align_proc.stdout:
+                if line.startswith('@'):
+                    continue
+                line = line.strip()
+                cols = line.split()
+                read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11]
+                flag, pos = int(flag), int(pos)
+                strand = '-' if flag & 0x10 else '+'
+                AS, NH = "", ""
+                for i in range(11, len(cols)):
+                    col = cols[i]
+                    if col.startswith("AS"):
+                        AS = int(col[5:])
+                    elif col.startswith("NH"):
+                        NH = int(col[5:])
+
+                if (not simulation and read_name != prev_read_name) or \
+                   (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]):
+                    if extract_read:
+                        write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1])
+                        if paired:
+                            write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1])
+                    prev_read_name, extract_read, read1, read2 = read_name, False, [], []
+
+                if flag & 0x4 == 0 and NH == 1 and chr in region_loci:
+                    for locus_name, loci in region_loci[chr].items():
+                        _, _, loci_left, loci_right = loci
+                        if pos >= loci_left and pos < loci_right:
+                            # Set region only on a hit so that a later miss cannot redirect the read
+                            extract_read, region = True, locus_name.split('-')[0].lower()
+                            break
+
+                if flag & 0x40 or not paired: # left read
+                    if not read1:
+                        if flag & 0x10: # reverse complement
+                            read1 = [typing_common.reverse_complement(read), qual[::-1]]
+                        else:
+                            read1 = [read, qual]
+                else:
+                    assert flag & 0x80 # right read
+                    if flag & 0x10: # reverse complement
+                        read2 = [typing_common.reverse_complement(read), qual[::-1]]
+                    else:
+                        read2 = [read, qual]
+
+            if extract_read:
+                write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1])
+                if paired:
+                    write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1])
+
+            for gzip1_proc, gzip2_proc in gzip_dic.values():
+                gzip1_proc.stdin.close()
+                if paired:
+                    gzip2_proc.stdin.close()
+
+        if threads <= 1:
+            work(fq_fname_base,
+                 fq_fname,
+                 fq_fname2,
+                 ranges,
+                 simulation,
+                 verbose)
+        else:
+            parallel_work(pids,
+                          work,
+                          fq_fname_base,
+                          fq_fname,
+                          fq_fname2,
+                          ranges,
+                          simulation,
+                          verbose)
+
+    if threads > 1:
+        wait_pids(pids)
+
+
+"""
+"""
+if __name__ == '__main__':
+    # Command-line entry point: parse the options and run extract_reads()
+    parser = ArgumentParser(
+        description='Extract reads')
+    parser.add_argument("--base", "--base-fname",
+                        dest="base_fname",
+                        type=str,
+                        default="genotype_genome",
+                        help="base filename for genotype genome")
+    parser.add_argument("--read-dir",
+                        dest="read_dir",
+                        type=str,
+                        default="",
+                        help="Directory name for read files")
+    parser.add_argument("--out-dir",
+                        dest="out_dir",
+                        type=str,
+                        default="",
+                        help="Directory name for extracted read files")
+    parser.add_argument("--suffix",
+                        dest="suffix",
+                        type=str,
+                        default="fq.gz",
+                        help="Read file suffix (Default: fq.gz)")
+    parser.add_argument('-f', '--fasta',
+                        dest='fastq',
+                        action='store_false',
+                        help='FASTA format')
+    parser.add_argument("-U",
+                        dest="read_fname_U",
+                        type=str,
+                        default="",
+                        help="filename for single-end reads")
+    parser.add_argument("-1",
+                        dest="read_fname_1",
+                        type=str,
+                        default="",
+                        help="filename for paired-end reads")
+    parser.add_argument("-2",
+                        dest="read_fname_2",
+                        type=str,
+                        default="",
+                        help="filename for paired-end reads")
+    parser.add_argument("--database-list",
+                        dest="database_list",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of database (default: empty)")
+    parser.add_argument('--simulation',
+                        dest='simulation',
+                        action='store_true',
+                        help='Simulated reads (Default: False)')
+    parser.add_argument("-p", "--threads",
+                        dest="threads",
+                        type=int,
+                        default=1,
+                        help="Number of threads")
+    parser.add_argument("--max-sample",
+                        dest="max_sample",
+                        type=int,
+                        default=sys.maxint,
+                        help="Number of samples to be extracted (default: sys.maxint)")
+    parser.add_argument("--job-range",
+                        dest="job_range",
+                        type=str,
+                        default="0,1",
+                        help="two numbers (e.g. 1,3)")
+    parser.add_argument('-v', '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        help='also print some statistics to stderr')
+
+    args = parser.parse_args()
+
+    # Database names are compared against lower-cased names from the .locus
+    # file in extract_reads(), so normalize what the user typed
+    database_list = set()
+    if args.database_list != "":
+        for region in args.database_list.split(','):
+            database_list.add(region.lower())
+    if args.read_fname_U != "":
+        args.read_fname = [args.read_fname_U]
+    elif args.read_fname_1 != "" or args.read_fname_2 != "":
+        if args.read_fname_1 == "" or args.read_fname_2 == "":
+            print >> sys.stderr, "Error: please specify both -1 and -2."
+            sys.exit(1)
+        args.read_fname = [args.read_fname_1, args.read_fname_2]
+    else:
+        args.read_fname = []
+    if len(args.read_fname) == 0:
+        if args.read_dir == "" or not os.path.exists(args.read_dir):
+            print >> sys.stderr, "Error: please specify --read-dir with an existing directory."
+            sys.exit(1)
+        if args.out_dir == "":
+            print >> sys.stderr, "Error: please specify --out-dir with a directory name."
+            sys.exit(1)
+    # extract_reads() indexes job_range[0] and job_range[1]
+    try:
+        job_range = [int(num) for num in args.job_range.split(',')]
+    except ValueError:
+        job_range = []
+    if len(job_range) < 2:
+        print >> sys.stderr, "Error: --job-range requires two numbers (e.g. 1,3)."
+        sys.exit(1)
+
+    # Reads are taken as paired-end unless -U was given
+    extract_reads(args.base_fname, database_list, args.read_dir, args.out_dir,
+                  args.suffix, args.read_fname, args.fastq,
+                  args.read_fname_U == "", args.simulation, args.threads,
+                  args.max_sample, job_range, args.verbose)
+
diff --git a/hisatgenotype_extract_vars.py b/hisatgenotype_extract_vars.py
index 17d2d80..8f9cdf4 100755
--- a/hisatgenotype_extract_vars.py
+++ b/hisatgenotype_extract_vars.py
@@ -24,7 +24,7 @@ import os, sys, subprocess, re
 import inspect
 import glob
 from argparse import ArgumentParser, FileType
-
+import hisatgenotype_typing_common as typing_common, hisatgenotype_gene_typing as gene_typing
 
 
 """
@@ -45,71 +45,253 @@ def create_map(seq):
 
 """
 """
+def create_consensus_seq(seqs,
+                         seq_len,
+                         min_var_freq,
+                         remove_empty = True):
+    """
+    Build a consensus sequence and per-position base frequencies (in percent)
+    from aligned sequences over the alphabet "ACGT.E".
+
+    seqs          list of sequences; those whose length differs from seq_len
+                  add no counts, but percentages are still taken over
+                  len(seqs).  Edited in place when empty positions are removed.
+    seq_len       number of alignment positions
+    min_var_freq  a position becomes '.' in the consensus once its empty
+                  ('.' or 'E') frequency reaches 100 - min_var_freq percent
+    remove_empty  drop positions that are empty in all sequences from the
+                  consensus and the frequencies, and delete every 'E' from seqs
+
+    Returns (consensus_seq, consensus_freq), where consensus_freq holds one
+    {nucleotide: percent} dictionary per consensus position.
+    """
+    # Count A, C, G, T and empty ('.' or 'E') occurrences per position
+    column_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '.': 4, 'E': 4}
+    counts = [[0] * 5 for _ in range(seq_len)]
+    for seq in seqs:
+        if len(seq) != seq_len:
+            continue
+        for pos, nt in enumerate(seq):
+            assert nt in "ACGT.E"
+            counts[pos][column_of[nt]] += 1
+
+    # Turn the counts into percentages of all sequences
+    total = float(len(seqs))
+    percents = []
+    for row in counts:
+        percents.append([count / total * 100.0 for count in row])
+
+    # Pick one letter per position
+    letters = []
+    has_empty = False
+    for percent in percents:
+        empty = percent[4]
+        if empty >= 100.0:
+            # No alleles have bases at this particular location
+            has_empty = True
+            letters.append('E')
+        elif empty >= 100.0 - min_var_freq:
+            # Mostly empty: report a deletion
+            letters.append('.')
+        else:
+            # Most frequent nucleotide; the first one wins a tie
+            best = percent.index(max(percent[:4]))
+            letters.append("ACGT"[best])
+    consensus_seq = ''.join(letters)
+
+    # Remove the positions where no sequence has a base
+    skip_pos = set()
+    if has_empty and remove_empty:
+        for pos, nt in enumerate(consensus_seq):
+            if nt == 'E':
+                skip_pos.add(pos)
+        for seq_i, seq in enumerate(seqs):
+            # Mark the dropped positions, then delete every 'E' at once
+            chars = list(seq)
+            for pos in skip_pos:
+                if pos < len(chars):
+                    chars[pos] = 'E'
+            seqs[seq_i] = ''.join(chars).replace('E', '')
+        consensus_seq = consensus_seq.replace('E', '')
+
+    # Convert a list form of the frequencies to a dictionary form
+    consensus_freq = []
+    for pos, percent in enumerate(percents):
+        if pos in skip_pos:
+            continue
+        freq_dic = {}
+        for nt, value in zip("ACGT.", percent):
+            if value > 0.0:
+                freq_dic[nt] = value
+        consensus_freq.append(freq_dic)
+
+    assert len(consensus_seq) == len(consensus_freq)
+    return consensus_seq, consensus_freq
+
+
+
+"""
+Left-shift deletions if possible
+"""
+def leftshift_deletions(backbone_seq, seq, debug = False):
+    # Moves each run of '.' in seq (a deletion relative to backbone_seq) to the
+    # left for as long as the base in front of the run equals the backbone
+    # base under the run's last position, and returns the shifted sequence.
+    # '.' characters before the first nucleotide and a run that reaches the
+    # end of seq are not moved.  If the two sequences differ in length, seq
+    # is returned unchanged.  With debug set, each run is traced on stderr.
+    if len(seq) != len(backbone_seq):
+        return seq
+    seq = list(seq)
+    seq_len = len(seq)
+
+    def next_nt(pos):
+        # First position >= pos that holds a nucleotide (seq_len if none)
+        while pos < seq_len and seq[pos] not in "ACGT":
+            pos += 1
+        return pos
+
+    def trace(text, start, end):
+        # Show text[start:end] with up to 10 characters of context per side;
+        # max() keeps a start below 10 from wrapping to the end of text
+        print >> sys.stderr, start, end, text[max(0, start-10):start], text[start:end], text[end:end+10]
+
+    # Skip the first deletion
+    bp_i = next_nt(0)
+    while bp_i < seq_len:
+        if seq[bp_i] != '.':
+            bp_i += 1
+            continue
+        # seq[bp_i:bp_j] is the run of '.' starting at bp_i
+        bp_j = bp_i + 1
+        while bp_j < seq_len and seq[bp_j] == '.':
+            bp_j += 1
+        if bp_j >= seq_len:
+            break
+
+        if debug:
+            trace(backbone_seq, bp_i, bp_j)
+            trace(''.join(seq), bp_i, bp_j)
+        prev_i, prev_j = bp_i, bp_j
+
+        # Shift the run one base at a time
+        while bp_i > 0 and seq[bp_i-1] in "ACGT" and seq[bp_i-1] == backbone_seq[bp_j-1]:
+            seq[bp_j-1], seq[bp_i-1] = seq[bp_i-1], '.'
+            bp_i -= 1
+            bp_j -= 1
+        bp_i = next_nt(bp_j)
+
+        # Same stream as the trace above, so both lines can be read together
+        if debug:
+            trace(''.join(seq), prev_i, prev_j)
+    return ''.join(seq)
+
+
+"""
+"""
 def extract_vars(base_fname,
                  base_dname,
-                 reference_type,
-                 hla_list,
-                 partial,
+                 locus_list,
                  inter_gap,
                  intra_gap,
-                 DRB1_REF,
-                 exclude_allele_list,
+                 whole_haplotype,
+                 min_var_freq,
+                 ext_seq_len,
                  leftshift,
+                 partial,
                  verbose):
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(extract_vars))
-    ex_path = os.path.dirname(curr_script)
-
     base_fullpath_name = base_fname
     if base_dname != "" and not os.path.exists(base_dname):
         os.mkdir(base_dname)
         base_fullpath_name = "%s/%s" % (base_dname, base_fname)
 
-    # Samples of HLA_MSA_file are found in
-    #    ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/msf/
-    #    git clone https://github.com/jrob119/IMGTHLA.git
+    # Download human genome and HISAT2 index
+    HISAT2_fnames = ["grch38",
+                     "genome.fa",
+                     "genome.fa.fai"]
 
+    if not typing_common.check_files(HISAT2_fnames):
+        typing_common.download_genome_and_index()
+    
     # Corresponding genomic loci found by HISAT2 (reference is GRCh38)
-    #   e.g. hisat2 --no-unal --score-min C,0 -x grch38/genome -f IMGTHLA/fasta/A_gen.fasta
-    hla_ref_file = open(base_fullpath_name + ".ref", 'w')
-    if base_fname in ["hla"]:
-        HLA_genes, HLA_gene_strand = {}, {}
-        for gene in hla_list:
-            hisat2 = os.path.join(ex_path, "hisat2")
-            aligner_cmd = [hisat2,
-                           "--score-min", "C,0",
-                           "--no-unal",
-                           "-x", "grch38/genome",
-                           "-f", "IMGTHLA/fasta/%s_gen.fasta" % gene]
-            align_proc = subprocess.Popen(aligner_cmd,
-                                          stdout=subprocess.PIPE,
-                                          stderr=open("/dev/null", 'w'))
-            print aligner_cmd
-            allele_id, strand = "", ''
-            for line in align_proc.stdout:
-                if line.startswith('@'):
-                    continue
-                line = line.strip()
-                cols = line.split()
-                t_allele_id, flag = cols[:2]
-                # Avoid selection of excluded allele as backbone
-                if t_allele_id in exclude_allele_list:
-                    continue
-                allele_id = t_allele_id
-
-                flag = int(flag)
-                strand = '-' if flag & 0x10 else '+'
-                AS = ""
-                for i in range(11, len(cols)):
-                    col = cols[i]
-                    if col.startswith("AS"):
-                        AS = col[5:]
-                assert int(AS) == 0
-
-            align_proc.communicate()
-            assert allele_id != ""
+    #   e.g. hisat2 --no-unal --score-min C,0 -x grch38/genome -f hisatgenotype_db/HLA/fasta/A_gen.fasta
+    locus_file = open(base_fullpath_name + ".locus", 'w')
+    left_ext_seq_dic, right_ext_seq_dic = {}, {}
+    genes, gene_strand = {}, {}
+
+    # Clone a git repository, hisatgenotype_db
+    if not os.path.exists("hisatgenotype_db"):
+        typing_common.clone_hisatgenotype_database()
+    fasta_dname = "hisatgenotype_db/%s/fasta" % base_fname.upper()
+
+    # Check HLA genes
+    gene_names = []
+    if base_fname == "hla":
+        fasta_fnames = glob.glob("%s/*_gen.fasta" % fasta_dname)
+    else:
+        assert base_fname in ["codis", "cyp"]
+        fasta_fnames = glob.glob("%s/*.fasta" % fasta_dname)
+    for gen_fname in fasta_fnames:
+        gene_name = gen_fname.split('/')[-1].split('_')[0]
+        if gene_name == "hla":
+            continue
+        gene_names.append(gene_name)
+
+    if locus_list == []:
+        locus_list = gene_names
+
+    cigar_re = re.compile('\d+\w')
+    remove_locus_list = []
+    for gene in locus_list:
+        aligner_cmd = ["hisat2"]
+        if base_fname in ["hla", "coids"]:
+            aligner_cmd += ["--score-min", "C,0"]
+        aligner_cmd += ["--no-unal",
+                        "-x", "grch38/genome",
+                        "-f", "%s/%s_gen.fasta" % (fasta_dname, gene)]
+        align_proc = subprocess.Popen(aligner_cmd,
+                                      stdout=subprocess.PIPE,
+                                      stderr=open("/dev/null", 'w'))
+        allele_id = ""
+        best_chr, best_left, best_right, best_AS, best_strand = "", -1, -1, -sys.maxint, ''
+        for line in align_proc.stdout:
+            if line.startswith('@'):
+                continue
+            line = line.strip()
+            cols = line.split()
+            temp_allele_id, flag, chr, left, _, cigar_str = cols[:6]
+            left = int(left) - 1
+            right = left
+            cigars = cigar_re.findall(cigar_str)
+            cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
+            if len(cigars) > 1 or cigars[0][0] != 'M':
+                continue
+            for i in range(len(cigars)):
+                cigar_op, length = cigars[i]
+                if cigar_op in "MND":
+                    right += length
+
+            flag = int(flag)
+            strand = '-' if flag & 0x10 else '+'
+            AS = ""
+            for i in range(11, len(cols)):
+                col = cols[i]
+                if col.startswith("AS"):
+                    AS = col[5:]
+            assert AS != ""
+            AS = int(AS)
+            if AS > best_AS:
+                allele_id = temp_allele_id
+                best_chr, best_left, best_right, best_AS, best_strand = chr, left, right, AS, strand
+
+        chr, left, right, strand = best_chr, best_left, best_right, best_strand
+        align_proc.communicate()
+        if allele_id == "":
+            remove_locus_list.append(gene)
+            continue
+        if base_fname == "hla":
             allele_name = ""
-            for line in open("IMGTHLA/fasta/%s_gen.fasta" % gene):
+            for line in open("%s/%s_gen.fasta" % (fasta_dname, gene)):
                 line = line.strip()
                 if not line.startswith('>'):
                     continue
@@ -117,63 +299,96 @@ def extract_vars(base_fname,
                 if allele_id == tmp_allele_id:
                     allele_name = tmp_allele_name
                     break
-            assert allele_name != "" and strand != ''
-            HLA_genes[gene] = allele_name
-            HLA_gene_strand[gene] = strand
-            print "HLA-%s's backbone allele is %s on '%s' strand" % (gene, allele_name, strand)
+        else:
+            allele_name = allele_id
+        assert allele_name != "" and strand != ''
+        genes[gene] = allele_name
+        gene_strand[gene] = strand
+        print >> sys.stderr, "%s-%s's reference allele is %s on '%s' strand of chromosome %s" % \
+            (base_fname.upper(), gene, allele_name, strand, chr)
+
+        assert chr != "" and left >= 0 and right > left
+        if ext_seq_len > 0:
+            left_ext_seq, right_ext_seq = "", ""
+            left1, left2 = max(1, left - ext_seq_len), max(1, left - 1)
+            if left2 > 0:
+                extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, left1, left2)]
+                extract_seq_proc = subprocess.Popen(extract_seq_cmd,
+                                                    stdout=subprocess.PIPE,
+                                                    stderr=open("/dev/null", 'w'))
+                for line in extract_seq_proc.stdout:
+                    if line.startswith('>'):
+                        continue
+                    line = line.strip()
+                    left_ext_seq += line
+            extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + ext_seq_len - 1)]
+            extract_seq_proc = subprocess.Popen(extract_seq_cmd,
+                                                stdout=subprocess.PIPE,
+                                                stderr=open("/dev/null", 'w'))
+            for line in extract_seq_proc.stdout:
+                if line.startswith('>'):
+                    continue
+                line = line.strip()
+                right_ext_seq += line
+
+            if strand == '-':
+                left_ext_seq, right_ext_seq = typing_common.reverse_complement(right_ext_seq), typing_common.reverse_complement(left_ext_seq)
+            left_ext_seq_dic[gene], right_ext_seq_dic[gene] = left_ext_seq, right_ext_seq
+            
 
-        # Extract exon information from hla.data
-        HLA_gene_exons = {}
+    # Extract exon information from hla.data
+    gene_exons = {}
+    if base_fname == "hla":        
         skip = False
-        for line in open("IMGTHLA/hla.dat"):
+        for line in open("hisatgenotype_db/%s/hla.dat" % base_fname.upper()):
             if line.startswith("DE"):
-                allele_name = line.split()[1][4:-1]
+                allele_name = line.split()[1][:-1]
+                if allele_name.startswith("HLA-"):
+                    allele_name = allele_name[4:]
                 gene = allele_name.split('*')[0]
                 if line.find("partial") != -1 or \
-                        not gene in HLA_genes or \
-                        allele_name != HLA_genes[gene] or \
-                        allele_name in exclude_allele_list :
+                        not gene in genes or \
+                        allele_name != genes[gene]:
                     skip = True
                     continue
                 skip = False
             elif not skip:
-                if not line.startswith("FT") or \
-                        line.find("exon") == -1:
+                if not line.startswith("FT"):
                     continue
-                exon_range = line.split()[2].split("..")
-                if not gene in HLA_gene_exons:
-                    HLA_gene_exons[gene] = []
-                HLA_gene_exons[gene].append([int(exon_range[0]) - 1, int(exon_range[1]) - 1])
-    else:
-        assert base_fname == "cyp"
-        
-        HLA_genes, HLA_gene_strand = {}, {}        
-        fasta_dirname = "hisat_genotype_db/%s/fasta" % base_fname.upper()
-        assert os.path.exists(fasta_dirname)
-        fasta_fnames = glob.glob("%s/*.fasta" % fasta_dirname)
-        for fasta_fname in fasta_fnames:
-            gene_name = fasta_fname.split('/')[-1]
-            gene_name = gene_name.split('_')[0]
-            ref_allele_name = ""
-            for line in open(fasta_fname):
-                assert line[0] == '>'
-                ref_allele_name = line.split(' ')[0][1:]
-                break
+                if line.find("exon") != -1:
+                    exon_range = line.split()[2].split("..")
+                    exon_left, exon_right = int(exon_range[0]) - 1, int(exon_range[1]) - 1
+                    assert exon_left >= 0
+                    assert exon_left < exon_right
+                    if not gene in gene_exons:
+                        gene_exons[gene] = []
+                    if gene in left_ext_seq_dic:
+                        left_ext_seq_len = len(left_ext_seq_dic[gene])
+                    else:
+                        left_ext_seq_len = 0
+                    gene_exons[gene].append([exon_left + left_ext_seq_len, exon_right + left_ext_seq_len])
 
-            assert ref_allele_name != ""
-            assert gene_name not in HLA_genes
-            HLA_genes[gene_name] = ref_allele_name
-            # DK - temporary solution
-            HLA_gene_strand[gene_name] = '+'
+    tmp_locus_list = []
+    for gene in locus_list:
+        if gene in remove_locus_list:
+            continue
+        if base_fname == "hla" and gene not in gene_exons:
+            continue
+        tmp_locus_list.append(gene)
+    locus_list = tmp_locus_list
+    for key in genes.keys():
+        if key in locus_list:
+            continue
+        del genes[key]
+        del gene_strand[key]
 
-        HLA_gene_exons = {}
-        assert reference_type == "gene"
-        
     # Write the backbone sequences into a fasta file
-    if reference_type == "gene":
-        backbone_file = open(base_fullpath_name + "_backbone.fa", 'w')        
+    backbone_file = open(base_fullpath_name + "_backbone.fa", 'w')        
     # variants w.r.t the backbone sequences into a SNP file
     var_file = open(base_fullpath_name + ".snp", 'w')
+    var_index_file = open(base_fullpath_name + ".index.snp", 'w')
+    # variant frequency
+    var_freq_file = open(base_fullpath_name + ".snp.freq", 'w')
     # haplotypes
     haplotype_file = open(base_fullpath_name + ".haplotype", 'w')
     # pairs of a variant and the corresponding HLA allels into a LINK file    
@@ -181,12 +396,16 @@ def extract_vars(base_fname,
     # Write all the sequences with dots removed into a file
     input_file = open(base_fullpath_name + "_sequences.fa", 'w')
     num_vars, num_haplotypes = 0, 0
-    HLA_full_alleles = {}
-    for HLA_gene, HLA_ref_gene in HLA_genes.items():
-        strand = HLA_gene_strand[HLA_gene]        
-        def read_MSF_file(fname):
-            HLA_names = {} # HLA allele names to numeric IDs
-            HLA_seqs = []  # HLA multiple alignment sequences
+    full_alleles = {}
+    for gene, ref_gene in genes.items():
+        strand = gene_strand[gene]
+        left_ext_seq, right_ext_seq = "", ""
+        if gene in left_ext_seq_dic:
+            left_ext_seq, right_ext_seq = left_ext_seq_dic[gene], right_ext_seq_dic[gene]
+
+        def read_MSF_file(fname, left_ext_seq = "", right_ext_seq = ""):
+            names = {} # HLA allele names to numeric IDs
+            seqs = []  # HLA multiple alignment sequences
             for line in open(fname):
                 line = line.strip()
                 if not line or \
@@ -200,20 +419,17 @@ def extract_vars(base_fname,
                     try:
                         name = line.split('\t')[0]
                         name = name.split()[1]
-                        if name in exclude_allele_list:
-                            continue
-                        
                     except ValueError:
                         continue
 
-                    if name in HLA_names:
+                    if name in names:
                         print >> sys.stderr, "Warning: %s is found more than once in Names" % (name)
                         continue
 
-                    HLA_names[name] = len(HLA_names)
+                    names[name] = len(names)
                 else:
-                    if len(HLA_seqs) == 0:
-                        HLA_seqs = ["" for i in range(len(HLA_names))]
+                    if len(seqs) == 0:
+                        seqs = [left_ext_seq for i in range(len(names))]
                     try:
                         cols = line.split()
                         name = cols[0]
@@ -222,15 +438,15 @@ def extract_vars(base_fname,
                     except ValueError:
                         continue
 
-                    if name not in HLA_names:
-                        HLA_names[name] = len(HLA_names)
+                    if name not in names:
+                        names[name] = len(names)
 
-                    id = HLA_names[name]
-                    if id >= len(HLA_seqs):
-                        assert id == len(HLA_seqs)
-                        HLA_seqs.append("")
+                    id = names[name]
+                    if id >= len(seqs):
+                        assert id == len(seqs)
+                        seqs.append(left_ext_seq)
                         
-                    HLA_seqs[id] += ''.join(fives)
+                    seqs[id] += ''.join(fives)
 
                     # Add sub-names of the allele
                     sub_name = ""
@@ -238,26 +454,30 @@ def extract_vars(base_fname,
                         if sub_name != "":
                             sub_name += ":"
                         sub_name += group
-                        if sub_name not in HLA_full_alleles:
-                            HLA_full_alleles[sub_name] = [name]
+                        if sub_name not in full_alleles:
+                            full_alleles[sub_name] = [name]
                         else:
-                            HLA_full_alleles[sub_name].append(name)                        
-                    
-            return HLA_names, HLA_seqs
+                            full_alleles[sub_name].append(name)
+
+            if len(right_ext_seq) > 0:
+                for i_ in range(len(seqs)):
+                    seqs[i_] += right_ext_seq
+
+            return names, seqs
 
         if base_fname == "hla":
-            HLA_MSA_fname = "IMGTHLA/msf/%s_gen.msf" % HLA_gene
+            MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene)
         else:
-            HLA_MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), HLA_gene)
+            MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene)
             
-        if not os.path.exists(HLA_MSA_fname):
-            print >> sys.stderr, "Warning: %s does not exist" % HLA_MSA_fname
+        if not os.path.exists(MSA_fname):
+            print >> sys.stderr, "Warning: %s does not exist" % MSA_fname
             continue
-        
-        HLA_names, HLA_seqs = read_MSF_file(HLA_MSA_fname)
+
+        names, seqs = read_MSF_file(MSA_fname, left_ext_seq, right_ext_seq)
 
         # Identify a consensus sequence
-        assert len(HLA_seqs) > 0
+        assert len(seqs) > 0
 
         # Check sequences are of equal length
         def find_seq_len(seqs):
@@ -276,82 +496,29 @@ def extract_vars(base_fname,
                     max_seq_count = tmp_seq_count
             return seq_len
 
-        def create_consensus_seq(seqs, seq_len, partial):
-            consensus_count = [[0, 0, 0, 0] for i in range(seq_len)]
-            for i in range(len(seqs)):                
-                seq = seqs[i]
-                if len(seq) != seq_len:
-                    continue                    
-                for j in range(seq_len):
-                    nt = seq[j]
-                    if not nt in "ACGT":
-                        continue
-                    if nt == 'A':
-                        consensus_count[j][0] += 1
-                    elif nt == 'C':
-                        consensus_count[j][1] += 1
-                    elif nt == 'G':
-                        consensus_count[j][2] += 1
-                    else:
-                        assert nt == 'T'
-                        consensus_count[j][3] += 1
-            consensus_seq = ""
-            has_empty = False
-            for count in consensus_count:
-                # No alleles have bases at this particular location
-                if sum(count) <= 0:
-                    has_empty = True
-                    consensus_seq += 'E'
-                    continue
-                idx = count.index(max(count))
-                assert idx < 4
-                consensus_seq += "ACGT"[idx]
-            consensus_seq = ''.join(consensus_seq)
-
-            # Remove dots (deletions)
-            if has_empty and not partial:
-                for seq_i in range(len(seqs)):
-                    seqs[seq_i] = list(seqs[seq_i])
-                for i in range(len(consensus_seq)):
-                    if consensus_seq[i] != 'E':
-                        continue
-                    for seq_i in range(len(seqs)):
-                        if i >= len(seqs[seq_i]):
-                            continue
-                        seqs[seq_i][i] = 'E'
-                for seq_i in range(len(seqs)):
-                    seqs[seq_i] = ''.join(seqs[seq_i])
-                    seqs[seq_i] = seqs[seq_i].replace('E', '')
-                consensus_seq = consensus_seq.replace('E', '')
-                
-            return consensus_seq
-
-        seq_len = find_seq_len(HLA_seqs)        
-        if reference_type == "gene" and \
-                (not DRB1_REF or HLA_gene != "DRB1"):
-            backbone_name = "%s*BACKBONE" % HLA_gene
-            backbone_seq = create_consensus_seq(HLA_seqs, seq_len, partial)
-            # Allele sequences can shrink, so readjust the sequence length
-            if not partial:
-                seq_len = find_seq_len(HLA_seqs)
-        else:
-            backbone_name = HLA_ref_gene
-            backbone_id = HLA_names[backbone_name]
-            backbone_seq = HLA_seqs[backbone_id]
-
-        if partial:
-            HLA_partial_MSA_fname = "IMGTHLA/msf/%s_nuc.msf" % HLA_gene
-            if not os.path.exists(HLA_partial_MSA_fname):
-                print >> sys.stderr, "Warning: %s does not exist" % HLA_partial_MSA_fname
+        seq_len = find_seq_len(seqs)        
+        backbone_name = "%s*BACKBONE" % gene
+        backbone_seq, backbone_freq = create_consensus_seq(seqs,
+                                                           seq_len,
+                                                           min_var_freq,
+                                                           not partial) # Remove empty sequences?
+        # Allele sequences can shrink, so readjust the sequence length
+        if not partial:
+            seq_len = find_seq_len(seqs)
+
+        if partial and base_fname == "hla":
+            partial_MSA_fname = "hisatgenotype_db/HLA/msf/%s_nuc.msf" % gene
+            if not os.path.exists(partial_MSA_fname):
+                print >> sys.stderr, "Warning: %s does not exist" % partial_MSA_fname
                 continue
-            HLA_partial_names, HLA_partial_seqs = read_MSF_file(HLA_partial_MSA_fname)
+            partial_names, partial_seqs = read_MSF_file(partial_MSA_fname)
 
-            # DK - for debugging purposes
+            # DK - debugging purposes
             # Partial alleles vs. Full alleles
             """
             counts = [0, 0, 0, 0]
-            for partial_name in HLA_partial_names.keys():
-                if partial_name in HLA_names:
+            for partial_name in partial_names.keys():
+                if partial_name in names:
                     continue
                 name_group = partial_name.split(':')
                 for group_i in [3, 2, 1, 0]:
@@ -360,160 +527,158 @@ def extract_vars(base_fname,
                     if group_i > len(name_group):
                         continue
                     sub_name = ':'.join(name_group[:group_i])
-                    if sub_name in HLA_full_alleles:
-                        print partial_name, sub_name, HLA_full_alleles[sub_name][:5]
+                    if sub_name in full_alleles:
+                        print partial_name, sub_name, full_alleles[sub_name][:5]
                         counts[group_i] += 1
                         break
             print "DK: counts:", counts
             sys.exit(1)
             """
                 
-            ref_seq = HLA_seqs[HLA_names[HLA_ref_gene]]
+            ref_seq = seqs[names[ref_gene]]
             ref_seq_map = create_map(ref_seq)
-            ref_partial_seq = HLA_partial_seqs[HLA_partial_names[HLA_ref_gene]]
+            ref_partial_seq = partial_seqs[partial_names[ref_gene]]
             ref_partial_seq_map = create_map(ref_partial_seq)
-            exons = HLA_gene_exons[HLA_gene]
+            exons = gene_exons[gene]
             exon_len = 0
             ref_exons = [] # converted exons to MSF file (e.g. A_gen.msf)
             ref_partial_exons = [] # converted exons to MSF file (e.g. A_nuc.msf)
+
+            complete = True
             for exon in exons:
                 left, right = exon
                 ref_exons.append([ref_seq_map[left], ref_seq_map[right]])
-                ref_partial_exons.append([ref_partial_seq_map[exon_len], ref_partial_seq_map[right - left + exon_len]])
+                next_exon_len = right - left + exon_len
+                if next_exon_len >= len(ref_partial_seq_map):
+                    print >> sys.stderr, "Warning: partial sequences (%s) seem to be incomplete" % gene
+                    complete = False
+                    break
+                ref_partial_exons.append([ref_partial_seq_map[exon_len], ref_partial_seq_map[next_exon_len]])
                 exon_len += (right - left + 1)
                 # Make sure two MSF files (e.g. A_gen.msf and A_nuc.msf) share the same MSF lengths in the exonic sequences
                 ref_exon_len = ref_exons[-1][1] - ref_exons[-1][0] + 1
-                ref_partial_exon_len =  ref_partial_exons[-1][1] - ref_partial_exons[-1][0] + 1
+                ref_partial_exon_len = ref_partial_exons[-1][1] - ref_partial_exons[-1][0] + 1
                 assert ref_exon_len == ref_partial_exon_len
-                
-            for HLA_name, seq_id in HLA_partial_names.items():
-                if HLA_name in HLA_names:
-                    continue
-                seq = HLA_partial_seqs[seq_id]
-                new_seq = ""
-                right = 0
-                for e in range(len(exons)):
-                    ref_exon = ref_exons[e]
-                    ref_partial_exon = ref_partial_exons[e]
-                    new_seq += backbone_seq[right:ref_exon[0]]
-                    new_seq += seq[ref_partial_exon[0]:ref_partial_exon[1] + 1]
-                    right = ref_exon[1] + 1
-                new_seq += backbone_seq[right:]
-                HLA_names[HLA_name] = len(HLA_seqs)
-                HLA_seqs.append(new_seq)
-
-            backbone_seq = create_consensus_seq(HLA_seqs, seq_len, partial)
-
-        # Left-shift deletions if poissble
-        def leftshift_deletions(backbone_seq, seq, debug = False):
-            if len(seq) != len(backbone_seq):
-                return seq
-            seq = list(seq)
-            seq_len = len(seq)
-            bp_i = 0
-            # Skip the first deletion
-            while bp_i < seq_len:
-                if seq[bp_i] in "ACGT":
-                    break
-                bp_i += 1
 
-            while bp_i < seq_len:
-                bp = seq[bp_i]
-                if bp != '.':
-                    bp_i += 1
-                    continue
-                bp_j = bp_i + 1
-                while bp_j < seq_len:
-                    bp2 = seq[bp_j]
-                    if bp2 != '.':
-                        break
-                    else:
-                        bp_j += 1
-
-                if bp_j >= seq_len:
-                    bp_i = bp_j
-                    break
+            if complete:
+                partial_seq_len = find_seq_len(partial_seqs)
+                partial_backbone_seq, partial_backbone_freq = create_consensus_seq(partial_seqs,
+                                                                                   partial_seq_len,
+                                                                                   min_var_freq,
+                                                                                   False) # Remove empty sequences?
+                for name, seq_id in partial_names.items():
+                    if name in names:
+                        continue
+                    seq = partial_seqs[seq_id]
+                    new_seq = ""
+                    right = 0
+                    for e in range(len(exons)):
+                        ref_exon = ref_exons[e]
+                        ref_partial_exon = ref_partial_exons[e]
+                        new_seq += backbone_seq[right:ref_exon[0]]
+                        exon_seq = seq[ref_partial_exon[0]:ref_partial_exon[1] + 1]
+                        nt_exon_seq = exon_seq.replace('.', '')
+                        if len(nt_exon_seq) == 0:
+                            exon_seq = partial_backbone_seq[ref_partial_exon[0]:ref_partial_exon[1] + 1]
+                        new_seq += exon_seq
+                        right = ref_exon[1] + 1
+                    new_seq += backbone_seq[right:]
+                    names[name] = len(seqs)
+                    seqs.append(new_seq)
+
+                backbone_seq, backbone_freq = create_consensus_seq(seqs,
+                                                                   seq_len,
+                                                                   min_var_freq,
+                                                                   True) # Remove empty sequences?
+                seq_len = find_seq_len(seqs)
                 
-                # DK - for debugging purposes
-                if debug:
-                    print bp_i, bp_j, backbone_seq[bp_i-10:bp_i], backbone_seq[bp_i:bp_j], backbone_seq[bp_j:bp_j+10]
-                    print bp_i, bp_j, ''.join(seq[bp_i-10:bp_i]), ''.join(seq[bp_i:bp_j]), ''.join(seq[bp_j:bp_j+10])
-                prev_i, prev_j = bp_i, bp_j
-
-                while bp_i > 0 and seq[bp_i-1] in "ACGT":
-                    assert backbone_seq[bp_j-1] in "ACGT"
-                    if seq[bp_i-1] != backbone_seq[bp_j-1]:
-                        break
-                    seq[bp_j-1] = seq[bp_i-1]
-                    seq[bp_i-1] = '.'
-                    bp_i -= 1
-                    bp_j -= 1
-                bp_i = bp_j
-                while bp_i < seq_len:
-                    if seq[bp_i] in "ACGT":
-                        break
-                    bp_i += 1
-
-                # DK - for debugging purposes
-                if debug:
-                    print prev_i, prev_j, ''.join(seq[prev_i-10:prev_i]), ''.join(seq[prev_i:prev_j]), ''.join(seq[prev_j:prev_j+10])
-                  
-            return ''.join(seq)
-
-        if leftshift:
-            for seq_i in range(len(HLA_seqs)):
-                HLA_seqs[seq_i] = leftshift_deletions(backbone_seq, HLA_seqs[seq_i], seq_i == HLA_names["A*68:02:02"] and False)
-
+        if min_var_freq <= 0.0:
+            assert '.' not in backbone_seq and 'E' not in backbone_seq
+        
         # Reverse complement MSF if this gene is on '-' strand
         if strand == '-':
-            def reverse_complement(seq):
-                comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
-                rc_seq = ""
-                for s in reversed(seq):
-                    if s in comp_table:
-                        rc_seq += comp_table[s]
-                    else:
-                        rc_seq += s
-                return rc_seq
-            for i in range(len(HLA_seqs)):
-                HLA_seqs[i] = reverse_complement(HLA_seqs[i])
-            backbone_seq = reverse_complement(backbone_seq)
+            # Reverse exons
+            ref_seq = seqs[names[ref_gene]]
+            ref_seq = ref_seq.replace('.', '')
+            ref_seq_len = len(ref_seq)
+            if base_fname == "hla":
+                exons = []
+                for left, right in reversed(gene_exons[gene]):
+                    left, right = ref_seq_len - right - 1, ref_seq_len - left - 1
+                    exons.append([left, right])
+                gene_exons[gene] = exons
+
+            for i in range(len(seqs)):
+                seqs[i] = typing_common.reverse_complement(seqs[i])
+            backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True)
+
+        if leftshift:
+            for seq_i in range(len(seqs)):
+                seqs[seq_i] = leftshift_deletions(backbone_seq, seqs[seq_i])
+            backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True)
+            seq_len = find_seq_len(seqs)
 
-        print >> sys.stderr, "%s: number of HLA genes is %d." % (HLA_gene, len(HLA_names))
+        print >> sys.stderr, "%s: number of HLA alleles is %d." % (gene, len(names))
 
         Vars = {}
-        for cmp_name, id in HLA_names.items():
+        for cmp_name, id in names.items():
             if cmp_name == backbone_name:
                 continue
-            assert id < len(HLA_seqs)
-            cmp_seq = HLA_seqs[id]
-
+            assert id < len(seqs)
+            cmp_seq = seqs[id]
             if len(cmp_seq) != seq_len:
                 print >> sys.stderr, "Warning: the length of %s (%d) is different from %d" % \
                     (cmp_name, len(cmp_seq), seq_len)
                 continue
 
-            # DK - for debugging purposes
+            # DK - debugging purposes
             """
             if cmp_name == "A*03:01:07":
                 print cmp_name
-                cmp_seq2 = HLA_seqs[HLA_names["A*32:29"]]
+                cmp_seq2 = seqs[names["A*32:29"]]
                 for s in range(0, seq_len, 100):
                     print s, backbone_seq[s:s+100]
                     print s, cmp_seq2[s:s+100]
                     print s, cmp_seq[s:s+100]
                 # sys.exit(1)
             """
-
-            def insertVar(indel, type):
+            def insertVar(type, info):  # record variant `type` ('M', 'I' or 'D') for the allele cmp_name
+                pos, backbone_pos, data = info  # dot-free backbone position, alignment column, base(s) or deletion length
                 if type in "MI":
-                    varKey = "%d-%s-%s" % (indel[0], type, indel[1])
+                    varKey = "%d-%s-%s" % (pos, type, data)
                 else:
-                    varKey = "%d-%s-%d" % (indel[0], type, indel[1])
+                    varKey = "%d-%s-%d" % (pos, type, data)
+
                 if varKey not in Vars:
-                    Vars[varKey] = [cmp_name]
+                    if type == 'M':
+                        assert backbone_pos < len(backbone_freq)  # compare with the length: int < list is always True in Python 2
+                        assert data in backbone_freq[backbone_pos]
+                        freq = backbone_freq[backbone_pos][data]
+                    elif type == 'D':
+                        del_len = int(data)
+                        freq = 100.0  # start high, then keep the minimum '.' frequency over the deleted columns
+                        assert backbone_pos + del_len <= len(backbone_freq)  # every deleted column must exist
+                        for d in range(del_len):
+                            assert '.' in backbone_freq[backbone_pos + d]
+                            freq2 = backbone_freq[backbone_pos + d]['.']
+                            if freq2 < freq:
+                                freq = freq2
+                    else:
+                        assert type == 'I'
+                        ins_len = len(data)
+                        freq = 100.0  # start high, then keep the minimum frequency over the inserted bases
+                        assert backbone_pos + ins_len <= len(backbone_freq)  # every inserted column must exist
+                        for i in range(ins_len):
+                            nt = data[i]
+                            assert nt in backbone_freq[backbone_pos + i]
+                            freq2 = backbone_freq[backbone_pos + i][nt]
+                            if freq2 < freq:
+                                freq = freq2
+                        assert freq <= min_var_freq
+                    
+                    Vars[varKey] = [freq, [cmp_name]]  # first sighting: [frequency, [allele names]]
                 else:
-                    Vars[varKey].append(cmp_name)
+                    Vars[varKey][1].append(cmp_name)
 
             insertion, deletion = [], []
             ndots = 0
@@ -523,35 +688,34 @@ def extract_vars(base_fname,
                 cc = cmp_seq[s]
                 if bc != '.' and cc != '.':
                     if insertion:
-                        insertVar(insertion, 'I')
+                        insertVar('I', insertion)
                         insertion = []
                     elif deletion:
-                        insertVar(deletion, 'D')
+                        insertVar('D', deletion)
                         deletion = []
                     if bc != cc:
-                        mismatch = [s - ndots, cc]
-                        insertVar(mismatch, 'M')
+                        mismatch = [s - ndots, s, cc]
+                        insertVar('M', mismatch)
                 elif bc == '.' and cc != '.':
                     if deletion:
-                        insertVar(deletion, 'D')
+                        insertVar('D', deletion)
                         deletion = []
                     if insertion:
-                        insertion[1] += cc
+                        insertion[2] += cc
                     else:
-                        insertion = [s - ndots, cc]
+                        insertion = [s - ndots, s, cc]
                 elif bc != '.' and cc == '.':
                     if insertion:
-                        insertVar(insertion, 'I')
+                        insertVar('I', insertion)
                         insertion = []
                     if deletion:
-                        deletion[1] += 1
+                        deletion[2] += 1
                     else:
-                        deletion = [s - ndots, 1]
+                        deletion = [s - ndots, s, 1]
 
                 if bc == '.':
                     ndots += 1
 
-
                 """
                 if backbone_seq[s] != cmp_seq[s]:
                     print "%s is different %s at %d: %s vs. %s" % \
@@ -559,9 +723,9 @@ def extract_vars(base_fname,
                 """
 
             if insertion:
-                insertVar(insertion, 'I')
+                insertVar('I', insertion)
             elif deletion:
-                insertVar(deletion, 'D')
+                insertVar('D', deletion)
 
 
         print >> sys.stderr, "Number of variants is %d." % (len(Vars.keys()))
@@ -593,20 +757,21 @@ def extract_vars(base_fname,
                 assert a_type == 'D'
                 return int(a_data) - int(b_data)            
 
-        HLA_Vars = {}
-        for key, names in Vars.items():
-            for name in names:
-                if not name in HLA_Vars:
-                    HLA_Vars[name] = [key]
+        Vars_ = {}
+        for key, values in Vars.items():
+            freq, names_ = values
+            for name in names_:
+                if not name in Vars_:
+                    Vars_[name] = [key]
                 else:
-                    HLA_Vars[name].append(key)
-        for name, vars in HLA_Vars.items():
-            HLA_Vars[name] = sorted(vars, cmp=cmp_varKey)
+                    Vars_[name].append(key)
+        for name, vars in Vars_.items():
+            Vars_[name] = sorted(vars, cmp=cmp_varKey)
 
         # Sanity check -
         #    (1) Reconstruct the other sequences from the backbone sequence and variants and
         #    (2) Confirm these constructed sequences are the same as those input sequences.
-        for cmp_name, id in HLA_names.items():
+        for cmp_name, id in names.items():
             if cmp_name == backbone_name:
                 continue
 
@@ -614,10 +779,10 @@ def extract_vars(base_fname,
             constr_seq = list(constr_seq)
             locus_diff = 0
 
-            if cmp_name not in HLA_Vars:
+            if cmp_name not in Vars_:
                 continue
             
-            for var in HLA_Vars[cmp_name]:
+            for var in Vars_[cmp_name]:
                 try:
                     locus, type, data = var.split('-')
                     locus = int(locus)
@@ -641,8 +806,8 @@ def extract_vars(base_fname,
                     locus_diff -= del_len
 
             constr_seq = "".join(constr_seq)
-            assert id < len(HLA_seqs)
-            cmp_seq = HLA_seqs[id].replace('.', '')
+            assert id < len(seqs)
+            cmp_seq = seqs[id].replace('.', '')
             if len(constr_seq) != len(cmp_seq):
                 print >> sys.stderr, "Error: reconstruction fails (%s)! Lengths different: %d vs. %d" % \
                     (cmp_name, len(constr_seq), len(cmp_seq))
@@ -661,81 +826,145 @@ def extract_vars(base_fname,
                 assert False
 
         # Write the backbone sequences into a fasta file
-        if reference_type == "gene":
-            print >> backbone_file, ">%s" % (backbone_name)
-            backbone_seq_ = backbone_seq.replace('.', '')
-            for s in range(0, len(backbone_seq_), 60):
-                print >> backbone_file, backbone_seq_[s:s+60]
+        print >> backbone_file, ">%s" % (backbone_name)
+        backbone_seq_ = backbone_seq.replace('.', '')
+        for s in range(0, len(backbone_seq_), 60):
+            print >> backbone_file, backbone_seq_[s:s+60]
 
         # Remap the backbone allele, which is sometimes slighly different from
-        #   IMGTHLA/fasta version
+        #   fasta version
+        ref_backbone_id = names[ref_gene]
+        ref_backbone_seq = seqs[ref_backbone_id]
+        aligner_cmd = ["hisat2"]
         if base_fname == "hla":
-            ref_backbone_id = HLA_names[HLA_ref_gene]
-            ref_backbone_seq = HLA_seqs[ref_backbone_id]
-            hisat2 = os.path.join(ex_path, "hisat2")
-            aligner_cmd = [hisat2,
-                           "--score-min", "C,0",
-                           "--no-unal",
-                           "-x", "grch38/genome",
-                           "-f", 
-                           "-c", "%s" % ref_backbone_seq.replace('.', '')]
-            align_proc = subprocess.Popen(aligner_cmd,
-                                          stdout=subprocess.PIPE,
-                                          stderr=open("/dev/null", 'w'))
-            left, right = 0, 0
-            for line in align_proc.stdout:
-                if line.startswith('@'):
-                    continue
-                line = line.strip()
-                cols = line.split()
-                allele_id, flag, chr, left, mapQ, cigar_str = cols[:6]
-                flag = int(flag)
-                assert flag & 0x10 == 0
-                left = int(left) - 1
-                AS = ""
-                for i in range(11, len(cols)):
-                    col = cols[i]
-                    if col.startswith("AS"):
-                        AS = col[5:]
-                assert int(AS) == 0
-                cigar_re = re.compile('\d+\w')
-                right = left
-                cigars = cigar_re.findall(cigar_str)
-                cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-                assert len(cigars) == 1
-                for cigar_op, length in cigars:
-                    assert cigar_op == 'M'
-                    right += (length - 1)
-                break            
-            align_proc.communicate()
-            assert left < right
-
-            if reference_type == "gene":
-                base_locus = 0
-                backbone_seq_ = backbone_seq.replace('.', '')
-
-                ref_seq = HLA_seqs[HLA_names[HLA_ref_gene]]
-                ref_seq_map = create_map(ref_seq)
-                exons = HLA_gene_exons[HLA_gene]
-                exon_str = ""
-                for exon in exons:
-                    if exon_str != "":
-                        exon_str += ','
-                    exon_str += ("%d-%d" % (ref_seq_map[exon[0]], ref_seq_map[exon[1]]))
-
-                print >> hla_ref_file, "%s\t6\t%d\t%d\t%d\t%s" % (backbone_name, left, right, len(backbone_seq_), exon_str)
+            aligner_cmd += ["--score-min", "C,0"]
+        aligner_cmd += ["--no-unal",
+                        "-x", "grch38/genome",
+                        "-f", 
+                        "-c", "%s" % ref_backbone_seq.replace('.', '')]
+        align_proc = subprocess.Popen(aligner_cmd,
+                                      stdout=subprocess.PIPE,
+                                      stderr=open("/dev/null", 'w'))
+        best_chr, best_left, best_right, best_AS = "", 0, 0, -sys.maxint
+        for line in align_proc.stdout:
+            if line.startswith('@'):
+                continue
+            line = line.strip()
+            cols = line.split()
+            allele_id, flag, chr, left, mapQ, cigar_str = cols[:6]
+            flag = int(flag)
+            assert flag & 0x10 == 0
+            left = int(left) - 1
+            right = left
+            AS = ""
+            for i in range(11, len(cols)):
+                col = cols[i]
+                if col.startswith("AS"):
+                    AS = col[5:]
+            AS = int(AS)
+            cigars = cigar_re.findall(cigar_str)
+            cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
+            for i in range(len(cigars)):
+                cigar_op, length = cigars[i]
+                if cigar_op in "MND":
+                    right += length
+            if AS > best_AS:
+                best_chr, best_left, best_right, best_AS = chr, left, right, AS
+
+        chr, left, right = best_chr, best_left, best_right
+        align_proc.communicate()
+        if left == right:
+            print >> sys.stderr, "Warning: %s (%s) is not remapped" % (gene, ref_gene)
+            continue
+        assert left < right
+
+        base_locus = 0                
+        ref_seq = seqs[names[ref_gene]]
+        ref_seq_map = create_map(ref_seq)
+
+        del_count = []
+        for nt in backbone_seq:
+            assert nt in "ACGT."
+            add = 1 if nt == '.' else 0
+            if len(del_count) == 0:
+                del_count.append(add)
             else:
-                exons = HLA_gene_exons[HLA_gene]
-                exon_str = ""
-                for exon in exons:
-                    if exon_str != "":
-                        exon_str += ','
-                    exon_str += ("%d-%d" % (left + exon[0], left + exon[1]))
-
-                print >> hla_ref_file, "%s\t6\t%d\t%d\t%d\t%s" % (backbone_name, left, right, right - left + 1, exon_str)
-                base_locus = left
+                del_count.append(del_count[-1] + add)
+        
+        if base_fname == "hla":
+            exon_str = ""
+            for exon_left, exon_right in gene_exons[gene]:
+                exon_left, exon_right = ref_seq_map[exon_left], ref_seq_map[exon_right]
+                exon_left -= del_count[exon_left]
+                exon_right -= del_count[exon_right]
+                if exon_str != "":
+                    exon_str += ','
+                exon_str += ("%d-%d" % (exon_left, exon_right))
+
+            # Sanity check for exonic sequence
+            sanity_check = True
+            if sanity_check and \
+               os.path.exists("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene):
+                exons_ = []
+                for exon in exon_str.split(','):
+                    exon_left, exon_right = exon.split('-')
+                    exon_left, exon_right = int(exon_left), int(exon_right)
+                    exons_.append([exon_left, exon_right])
+
+                backbone_seq_ = backbone_seq.replace('.', '')
+                if ref_gene in Vars_:
+                    vars_ = Vars_[ref_gene]
+                else:
+                    vars_ = []
+                seq_ = list(backbone_seq_)
+                has_insertion = False
+                for var_ in vars_:
+                    var_pos, var_type, var_data = var_.split('-')
+                    var_pos = int(var_pos)
+                    assert var_pos >= 0 and var_pos < len(backbone_seq_)
+                    if var_type == 'M':
+                        seq_[var_pos] = var_data
+                    elif var_type == 'D':
+                        del_len = int(var_data)
+                        assert var_pos + del_len <= len(ref_seq)
+                        seq_[var_pos:var_pos + del_len] = ['.'] * del_len
+                    else:
+                        assert var_type == 'I'
+                        has_insertion = True
+
+                seq_ = ''.join(seq_)
+                exon_seq_ = ""
+                for exon_left, exon_right in exons_:
+                    exon_seq_ += seq_[exon_left:exon_right+1]
+                exon_seq_ = exon_seq_.replace('.', '')
+                if gene_strand[gene] == '-':
+                    exon_seq_ = typing_common.reverse_complement(exon_seq_)
+
+                cmp_exon_seq_, allele_name_ = "", ""
+                for line in open("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene):
+                    if line.startswith(">"):
+                        if allele_name_ == ref_gene:
+                            break
+                        allele_name_ = line.strip().split()[1]
+                        cmp_exon_seq_ = ""
+                    else:
+                        cmp_exon_seq_ += line.strip()
+                """
+                print "Has insertions:", has_insertion
+                print "constructed:", len(exon_seq_)
+                for p in range(0, len(exon_seq_), 60):
+                    print exon_seq_[p:p+60]
+                print "true:", len(cmp_exon_seq_)
+                for p in range(0, len(cmp_exon_seq_), 60):
+                    print cmp_exon_seq_[p:p+60]
+                """
+                if exon_seq_ != cmp_exon_seq_:
+                    print >> sys.stderr, "Waring: exonic sequences do not match (%s)" % gene
         else:
-            base_locus = 0
+            exon_str = "%d-%d" % (left, right - 1)
+
+        print >> locus_file, "%s\t%s\t%d\t%d\t%d\t%s\t%s" % \
+            (backbone_name, chr, left, right - 1, len(backbone_seq.replace('.', '')), exon_str, gene_strand[gene])
 
         # Write
         #       (1) variants w.r.t the backbone sequences into a SNP file
@@ -753,49 +982,75 @@ def extract_vars(base_fname,
                 assert type == 'D'
                 type_str = "deletion"
 
+            freq, names_ = Vars[keys[k]]
+            names_ = sorted(names_)            
             varID = "hv%d" % (num_vars)
             tmp_backbone_name = backbone_name
-            if reference_type != "gene":
-                tmp_backbone_name = "6"
             print >> var_file, "%s\t%s\t%s\t%d\t%s" % \
                 (varID, type_str, tmp_backbone_name, base_locus + locus, data)
-            names = sorted(Vars[keys[k]])
-            print >> link_file, "%s\t%s" % (varID, ' '.join(names))
+            if freq >= min_var_freq:
+                print >> var_index_file, "%s\t%s\t%s\t%d\t%s" % \
+                    (varID, type_str, tmp_backbone_name, base_locus + locus, data)
+            print >> var_freq_file, "%s\t%.2f" % (varID, freq)
+            print >> link_file, "%s\t%s" % (varID, ' '.join(names_))
             var2ID[keys[k]] = num_vars
             num_vars += 1
 
         add_seq_len = 0
         # Write haplotypes
+        excluded_vars = set()
+        var_leftmost, var_rightmost = sys.maxint, -1
+        for k in range(len(keys)):
+            key = keys[k]
+            if Vars[key][0] < min_var_freq:
+                excluded_vars.add(key)
+
+            # Update leftmost and rightmost of Vars
+            locus, type, data = key.split('-')
+            left = right = int(locus)
+            if type == 'D':
+                right = left + int(data) - 1
+            if k == 0:
+                var_leftmost = left
+            if var_rightmost < right:
+                var_rightmost = right
+
         i = 0
         while i < len(keys):
             key_i = keys[i]
             locus, type, data = key_i.split('-')
             locus = int(locus)
             if type == 'D':
-                locus += (int(data)- 1)
+                locus += (int(data) - 1)
             prev_locus = locus
-            j = i + 1
-            while j < len(keys):
-                key_j = keys[j]
-                locus2, type2, data2 = key_j.split('-')
-                locus2 = int(locus2)
-                if prev_locus + inter_gap < locus2:
-                    break
-                prev_locus = locus2
-                if type == 'D':
-                    prev_locus += (int(data)- 1)                
-                j += 1
+            if whole_haplotype:
+                j = len(keys)
+            else:
+                j = i + 1
+                while j < len(keys):
+                    key_j = keys[j]
+                    locus2, type2, data2 = key_j.split('-')
+                    locus2 = int(locus2)
+                    if prev_locus + inter_gap < locus2:
+                        break
+                    prev_locus = locus2
+                    if type == 'D':
+                        prev_locus += (int(data) - 1)
+                    j += 1
 
             alleles = set()
             for k in range(i, j):
                 key_k = keys[k]
-                add_alleles = set(Vars[key_k])
+                freq, names_ = Vars[key_k]
+                if freq < min_var_freq:
+                    continue
+                add_alleles = set(names_)
                 alleles |= add_alleles
 
             haplotypes = set()
-            cur_vars = set(keys[i:j])
+            cur_vars = set(keys[i:j]) - excluded_vars
             for allele in alleles:
-                allele_vars = set(HLA_Vars[allele])
+                allele_vars = set(Vars_[allele]) - excluded_vars
                 allele_cur_vars = '#'.join(sorted(list(cur_vars & allele_vars), cmp=cmp_varKey))
                 haplotypes.add(allele_cur_vars)
 
@@ -823,7 +1078,8 @@ def extract_vars(base_fname,
                             split_haplotypes.add('#'.join(haplotype[prev_s:s]))
                 return split_haplotypes
 
-            haplotypes2 = split_haplotypes(haplotypes)
+            if not whole_haplotype:
+                haplotypes = split_haplotypes(haplotypes)
 
             def cmp_haplotype(a, b):
                 a = a.split('#')
@@ -843,7 +1099,6 @@ def extract_vars(base_fname,
                 return a_end - b_end
 
             haplotypes = sorted(list(haplotypes), cmp=cmp_haplotype)
-            haplotypes2 = sorted(list(haplotypes2), cmp=cmp_haplotype)
             
             # DK - for debugging purposes
             """
@@ -861,40 +1116,42 @@ def extract_vars(base_fname,
 
             # Write haplotypes
             sanity_vars = set()
-            for h_i in range(len(haplotypes2)):
-                h = haplotypes2[h_i].split('#')
-                h1_locus, _, _ = h[0].split('-')
-                h2_locus, h2_type, h2_data = h[-1].split('-')
-                h_begin, h_end = int(h1_locus), int(h2_locus)
-                if h2_type == 'D':
-                    h_end += (int(h2_data) - 1)
-                assert h_begin <= h_end
+            for h_i in range(len(haplotypes)):
+                h = haplotypes[h_i].split('#')
                 varIDs = []
                 for var in h:
                     varIDs.append("hv%s" % var2ID[var])
                     # DK - for debugging purposes
                     # varIDs.append(var)
                     sanity_vars.add(var2ID[var])
-                h_new_begin = h_begin
-                for h_j in reversed(range(0, h_i)):
-                    hc = haplotypes2[h_j].split('#')
-                    hc_begin, hc_type, hc_data = hc[-1].split('-')
-                    hc_begin = int(hc_begin)
-                    hc_end = hc_begin
-                    if hc_type == 'D':
-                        hc_end += (int(hc_data) - 1)
-                    if hc_end + inter_gap < h_begin:
-                        break
-                    if h_new_begin > hc_end:
-                        h_new_begin = hc_end
-                assert h_new_begin <= h_begin
+                if whole_haplotype:
+                    h_begin, h_end = var_leftmost, var_rightmost
+                else:
+                    h1_locus, _, _ = h[0].split('-')
+                    h2_locus, h2_type, h2_data = h[-1].split('-')
+                    h_begin, h_end = int(h1_locus), int(h2_locus)
+                    if h2_type == 'D':
+                        h_end += (int(h2_data) - 1)
+                    assert h_begin <= h_end
+                    h_new_begin = h_begin
+                    for h_j in reversed(range(0, h_i)):
+                        hc = haplotypes[h_j].split('#')
+                        hc_begin, hc_type, hc_data = hc[-1].split('-')
+                        hc_begin = int(hc_begin)
+                        hc_end = hc_begin
+                        if hc_type == 'D':
+                            hc_end += (int(hc_data) - 1)
+                        if hc_end + inter_gap < h_begin:
+                            break
+                        if h_new_begin > hc_end:
+                            h_new_begin = hc_end
+                    assert h_new_begin <= h_begin
+                    h_begin = h_new_begin
                 tmp_backbone_name = backbone_name
-                if reference_type != "gene":
-                    tmp_backbone_name = "6"
                 print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \
-                    (num_haplotypes, tmp_backbone_name, base_locus + h_new_begin, base_locus + h_end, ','.join(varIDs))
+                    (num_haplotypes, tmp_backbone_name, base_locus + h_begin, base_locus + h_end, ','.join(varIDs))
                 num_haplotypes += 1
-                add_seq_len += (h_end - h_new_begin + 1)
+                add_seq_len += (h_end - h_begin + 1)
             assert len(sanity_vars) == len(cur_vars)
                     
             i = j
@@ -902,26 +1159,40 @@ def extract_vars(base_fname,
         print >> sys.stderr, "Length of additional sequences for haplotypes:", add_seq_len
                     
         # Write all the sequences with dots removed into a file
-        for name, ID in HLA_names.items():
+        for name, ID in names.items():
             print >> input_file, ">%s" % (name)
-            assert ID < len(HLA_seqs)
-            seq = HLA_seqs[ID].replace('.', '')
+            assert ID < len(seqs)
+            seq = seqs[ID].replace('.', '')
             for s in range(0, len(seq), 60):
                 print >> input_file, seq[s:s+60]
 
-    if reference_type == "gene":
-        backbone_file.close()
-    elif reference_type == "chromosome":
-        os.system("samtools faidx genome.fa 6 > hla_backbone.fa")
-    else:
-        assert reference_type == "genome"
-        os.system("cp genome.fa hla_backbone.fa")
-
-    hla_ref_file.close()
+    backbone_file.close()
+    locus_file.close()
     var_file.close()
+    var_index_file.close()
+    var_freq_file.close()
     haplotype_file.close()
     link_file.close()
     input_file.close()
+
+    # Read partial alleles from hla.dat, and write them into a file
+    partial_allele_list = []
+    if base_fname == "hla":
+        for line in open("hisatgenotype_db/HLA/hla.dat"):
+            if not line.startswith("DE"):
+                continue
+            allele_name = line.split()[1][:-1]
+            if allele_name.startswith("HLA-"):
+                allele_name = allele_name[4:]
+            gene = allele_name.split('*')[0]
+            if line.find("partial") != -1:
+                partial_allele_list.append(allele_name)
+
+    partial_file = open("%s.partial" % base_fullpath_name, 'w')
+    for partial_allele in partial_allele_list:
+        print >> partial_file, partial_allele
+    partial_file.close()
+   
     
         
 """
@@ -933,63 +1204,58 @@ if __name__ == '__main__':
                         dest="base_fname",
                         type=str,
                         default="hla",
-                        help="base filename for backbone HLA sequence, HLA variants, and HLA linking info")
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
+                        help="base filename for backbone sequence, variants, and linking info (Default: hla)")
+    parser.add_argument("--locus-list",
+                        dest="locus_list",
                         type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome")
-    parser.add_argument("--hla-list",
-                        dest="hla_list",
-                        type=str,
-                        default="A,B,C,DQA1,DQB1,DRB1",
-                        help="A comma-separated list of HLA genes")
-    parser.add_argument("--no-partial",
-                        dest="partial",
-                        action="store_false",
-                        help="Include partial alleles (e.g. A_nuc.fasta)")
+                        default="",
+                        help="A comma-separated list of gene names (default: empty, all genes)")
     parser.add_argument("--inter-gap",
                         dest="inter_gap",
                         type=int,
                         default=30,
-                        help="Maximum distance for variants to be in the same haplotype")
+                        help="Maximum distance for variants to be in the same haplotype (default: 30)")
     parser.add_argument("--intra-gap",
                         dest="intra_gap",
                         type=int,
                         default=50,
-                        help="Break a haplotype into several haplotypes")
-    parser.add_argument("--DRB1-REF",
-                        dest="DRB1_REF",
+                        help="Break a haplotype into several haplotypes (default: 50)")
+    parser.add_argument("--whole-haplotype",
+                        dest="whole_haplotype",
                         action="store_true",
-                        help="Some DRB1 alleles seem to include vector sequences, so use this option to avoid including them")
-    parser.add_argument("--exclude-allele-list",
-                        dest="exclude_allele_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of alleles to be excluded")
+                        help="Include partial alleles (e.g. A_nuc.fasta)")
+    parser.add_argument("--min-var-freq",
+                        dest="min_var_freq",
+                        type=float,
+                        default=0.0,
+                        help="Exclude variants whose freq is below than this value in percentage (Default: 0.0)")    
+    parser.add_argument("--ext-seq",
+                        dest="ext_seq_len",
+                        type=int,
+                        default=0,
+                        help="Length of extra sequences flanking backbone sequences (Default: 0)")
     parser.add_argument("--leftshift",
                         dest="leftshift",
                         action="store_true",
                         help="Shift deletions to the leftmost")
+    parser.add_argument("--no-partial",
+                        dest="partial",
+                        action="store_false",
+                        help="Exclude partial alleles, exon-only sequences in HLA")
     parser.add_argument("-v", "--verbose",
                         dest="verbose",
                         action="store_true",
                         help="also print some statistics to stderr")
 
     args = parser.parse_args()
-    args.hla_list = args.hla_list.split(',')
+    if args.locus_list == "":
+        locus_list = []
+    else:
+        locus_list = args.locus_list.split(',')
     if args.inter_gap > args.intra_gap:
         print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap)
         sys.exit(1)
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome" % (args.reference_type)
-        sys.exit(1)
              
-    if len(args.exclude_allele_list) > 0:
-        args.exclude_allele_list = args.exclude_allele_list.split(',')
-    else:
-        args.exclude_allele_list = []
-
     if args.base_fname.find('/') != -1:
         elems = args.base_fname.split('/')
         base_fname = elems[-1]
@@ -998,15 +1264,15 @@ if __name__ == '__main__':
         base_fname = args.base_fname
         base_dname = ""
         
-    # print args.exclude_allele_list
     extract_vars(base_fname,
                  base_dname,
-                 args.reference_type,
-                 args.hla_list,
-                 args.partial,
+                 locus_list,
                  args.inter_gap,
                  args.intra_gap,
-                 args.DRB1_REF,
-                 args.exclude_allele_list,
+                 args.whole_haplotype,
+                 args.min_var_freq,
+                 args.ext_seq_len,
                  args.leftshift,
+                 args.partial,
                  args.verbose)
+
diff --git a/hisatgenotype_typing.py b/hisatgenotype_hla_cyp.py
similarity index 93%
rename from hisatgenotype_typing.py
rename to hisatgenotype_hla_cyp.py
index b4603ff..cd97eea 100755
--- a/hisatgenotype_typing.py
+++ b/hisatgenotype_hla_cyp.py
@@ -73,6 +73,7 @@ def simulate_reads(HLAs,
 Align reads, and sort the alignments into a BAM file
 """
 def align_reads(ex_path,
+                base_fname,
                 aligner,
                 index_type,
                 read_fname,
@@ -86,12 +87,12 @@ def align_reads(ex_path,
                        "--mm"]
         if index_type == "linear":
             aligner_cmd += ["-k", "10"]
-        aligner_cmd += ["-x", "hla.%s" % index_type]
+        aligner_cmd += ["-x", "%s.%s" % (base_fname, index_type)]
     elif aligner == "bowtie2":
         aligner_cmd = [aligner,
                        "--no-unal",
                        "-k", "10",
-                       "-x", "hla"]
+                       "-x", base_fname]
     else:
         assert False
     assert len(read_fname) in [1,2]
@@ -303,6 +304,7 @@ def joint_abundance(HLA_cmpt,
 """
 """
 def HLA_typing(ex_path,
+               base_fname,
                simulation,
                reference_type,
                hla_list,
@@ -355,6 +357,7 @@ def HLA_typing(ex_path,
         if alignment_fname == "":
             # Align reads, and sort the alignments into a BAM file
             align_reads(ex_path,
+                        base_fname,
                         aligner,
                         index_type,
                         read_fname,
@@ -367,6 +370,7 @@ def HLA_typing(ex_path,
                 gene = test_HLA_names[0].split('*')[0]
             else:
                 gene = test_HLA_names
+            
             ref_allele = refHLAs[gene]
             ref_seq = HLAs[gene][ref_allele]
             ref_exons = refHLA_loci[gene][-1]
@@ -1053,8 +1057,10 @@ def genotyping(base_fname,
         os.system("git clone https://github.com/jrob119/IMGTHLA.git")
 
     # Clone hisat2 genotype database, hisat_genotype_db
+    """
     if not os.path.exists("hisat_genotype_db"):
         os.system("git clone https://github.com/infphilo/hisat_genotype_db.git")
+    """
 
     simulation = (read_fname == [] and alignment_fname == "")
 
@@ -1075,11 +1081,11 @@ def genotyping(base_fname,
         os.system("samtools faidx genome.fa")
 
     # Check if the pre-existing files (hla*) are compatible with the current parameter setting
-    if os.path.exists("hla.ref"):
+    if os.path.exists("%s.ref" % base_fname):
         left = 0
         HLA_genes = set()
         BACKBONE = False
-        for line in open("hla.ref"):
+        for line in open("%s.ref" % base_fname):
             HLA_name = line.strip().split()[0]
             if HLA_name.find("BACKBONE") != -1:
                 BACKBONE = True
@@ -1096,14 +1102,11 @@ def genotyping(base_fname,
             assert False
         if not set(hla_list).issubset(HLA_genes):
             delete_hla_files = True
-        if delete_hla_files:
-            os.system("rm hla*")
+        if base_fname == "hla":
+            if delete_hla_files:
+                os.system("rm %s*" % base_fname)
     
     # Extract HLA variants, backbone sequence, and other sequeces  
-    if len(base_fname) > 0:
-        base_fname = "_" + base_fname
-    base_fname = "hla" + base_fname
-    
     HLA_fnames = [base_fname+"_backbone.fa",
                   base_fname+"_sequences.fa",
                   base_fname+".ref",
@@ -1111,7 +1114,6 @@ def genotyping(base_fname,
                   base_fname+".haplotype",
                   base_fname+".link",
                   base_fname+"_alleles_excluded.txt"]
-
     
     # Check if excluded alleles in current files match
     excluded_alleles_match = False
@@ -1137,13 +1139,14 @@ def genotyping(base_fname,
         afile.write("\n".join(exclude_allele_list))
         afile.close()
         
-    print HLA_fnames
-    
     if (not check_files(HLA_fnames)) or (not excluded_alleles_match) :
         extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py")
         extract_cmd = [extract_hla_script,
-                       "--reference-type", reference_type,
-                       "--hla-list", ','.join(hla_list)]
+                       "--base", base_fname,
+                       "--reference-type", reference_type]
+
+        if base_fname == "hla":
+            extract_cmd += ["--hla-list", ','.join(hla_list)]
 
         if len(exclude_allele_list) > 0:
             print exclude_allele_list
@@ -1165,63 +1168,65 @@ def genotyping(base_fname,
             print >> sys.stderr, "Error: extract_HLA_vars failed!"
             sys.exit(1)
             
-    print "Base files built\n"
-
-    # Build HISAT2 graph indexes based on the above information
-    HLA_hisat2_graph_index_fnames = ["hla.graph.%d.ht2" % (i+1) for i in range(8)]
-    if not check_files(HLA_hisat2_graph_index_fnames) or (not excluded_alleles_match):
-        hisat2_build = os.path.join(ex_path, "hisat2-build")
-        build_cmd = [hisat2_build,
-                     "-p", str(threads),
-                     "--snp", HLA_fnames[3],
-                     "--haplotype", HLA_fnames[4] ,
-                     HLA_fnames[0],
-                     "hla.graph"]
-        if verbose:
-            print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
-        proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()        
-        if not check_files(HLA_hisat2_graph_index_fnames):
-            print >> sys.stderr, "Error: indexing HLA failed!  Perhaps, you may have forgotten to build hisat2 executables?"
-            sys.exit(1)
-    print "Step 1 Complete\n"
-    # Build HISAT2 linear indexes based on the above information
-    HLA_hisat2_linear_index_fnames = ["hla.linear.%d.ht2" % (i+1) for i in range(8)]
-    if reference_type == "gene" and (not check_files(HLA_hisat2_linear_index_fnames) or (not excluded_alleles_match)):
-        hisat2_build = os.path.join(ex_path, "hisat2-build")
-        build_cmd = [hisat2_build,
-                     "%s,%s"%(HLA_fnames[0],HLA_fnames[1]),
-                     "hla.linear"]
-        proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()        
-        if not check_files(HLA_hisat2_linear_index_fnames):
-            print >> sys.stderr, "Error: indexing HLA failed!"
-            sys.exit(1)
-            
-    print "Step 2 Complete\n"
-    # Build Bowtie2 indexes based on the above information
-    HLA_bowtie2_index_fnames = ["hla.%d.bt2" % (i+1) for i in range(4)]
-    HLA_bowtie2_index_fnames += ["hla.rev.%d.bt2" % (i+1) for i in range(2)]
-    if reference_type == "gene" and (not check_files(HLA_bowtie2_index_fnames) or (not excluded_alleles_match)):
-        build_cmd = ["bowtie2-build",
-                     "%s,%s"%(HLA_fnames[0],HLA_fnames[1]),
-                     "hla"]
-        proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'))
-        proc.communicate()        
-        if not check_files(HLA_bowtie2_index_fnames):
-            print >> sys.stderr, "Error: indexing HLA failed!"
-            sys.exit(1)
+    for aligner, index_type in aligners:
+        # Build HISAT2 graph indexes based on the above information
+        if aligner == "hisat2" and index_type == "graph":
+            HLA_hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base_fname, i+1) for i in range(8)]
+            if not check_files(HLA_hisat2_graph_index_fnames) or (not excluded_alleles_match):
+                hisat2_build = os.path.join(ex_path, "hisat2-build")
+                build_cmd = [hisat2_build,
+                             "-p", str(threads),
+                             "--snp", HLA_fnames[3],
+                             "--haplotype", HLA_fnames[4] ,
+                             HLA_fnames[0],
+                             "%s.graph" % base_fname]
+                if verbose:
+                    print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
+                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
+                proc.communicate()        
+                if not check_files(HLA_hisat2_graph_index_fnames):
+                    print >> sys.stderr, "Error: indexing HLA failed!  Perhaps, you may have forgotten to build hisat2 executables?"
+                    sys.exit(1)
 
-    print "Step 3 Complete\n"
+        # Build HISAT2 linear indexes based on the above information
+        elif aligner == "hisat2" and index_type == "linear":
+            HLA_hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base_fname, i+1) for i in range(8)]
+            if reference_type == "gene" and (not check_files(HLA_hisat2_linear_index_fnames) or (not excluded_alleles_match)):
+                hisat2_build = os.path.join(ex_path, "hisat2-build")
+                build_cmd = [hisat2_build,
+                             "%s,%s"%(HLA_fnames[0],HLA_fnames[1]),
+                             "%s.linear" % base_fname]
+                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
+                proc.communicate()        
+                if not check_files(HLA_hisat2_linear_index_fnames):
+                    print >> sys.stderr, "Error: indexing HLA failed!"
+                    sys.exit(1)
+
+        # Build Bowtie2 indexes based on the above information
+        else:
+            assert aligner == "bowtie2" and index_type == "linear"
+            HLA_bowtie2_index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)]
+            HLA_bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)]
+            if reference_type == "gene" and (not check_files(HLA_bowtie2_index_fnames) or (not excluded_alleles_match)):
+                build_cmd = ["bowtie2-build",
+                             "%s,%s"%(HLA_fnames[0],HLA_fnames[1]),
+                             base_fname]
+                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'))
+                proc.communicate()        
+                if not check_files(HLA_bowtie2_index_fnames):
+                    print >> sys.stderr, "Error: indexing HLA failed!"
+                    sys.exit(1)
+        
     # Read partial alleles from hla.data (temporary)
     partial_alleles = set()
-    for line in open("IMGTHLA/hla.dat"):
-        if not line.startswith("DE"):
-            continue
-        allele_name = line.split()[1][4:-1]
-        gene = allele_name.split('*')[0]
-        if line.find("partial") != -1:
-            partial_alleles.add(allele_name)
+    if base_fname == "hla":
+        for line in open("IMGTHLA/hla.dat"):
+            if not line.startswith("DE"):
+                continue
+            allele_name = line.split()[1][4:-1]
+            gene = allele_name.split('*')[0]
+            if line.find("partial") != -1:
+                partial_alleles.add(allele_name)
 
     if len(default_allele_list) != 0:
         #print os.getcwd()
@@ -1254,7 +1259,7 @@ def genotyping(base_fname,
     
     # Read HLA alleles (names and sequences)
     refHLAs, refHLA_loci = {}, {}
-    for line in open("hla.ref"):
+    for line in open("%s.ref" % base_fname):
         HLA_name, chr, left, right, length, exon_str = line.strip().split()
         HLA_gene = HLA_name.split('*')[0]
         assert not HLA_gene in refHLAs
@@ -1265,10 +1270,8 @@ def genotyping(base_fname,
             exon_left, exon_right = exon.split('-')
             exons.append([int(exon_left), int(exon_right)])
         refHLA_loci[HLA_gene] = [HLA_name, chr, left, right, exons]
+        
     HLAs = {}
-
-
-
     if reference_type == "gene":
         read_HLA_alleles(HLA_fnames[0], HLAs)
     read_HLA_alleles(HLA_fnames[1], HLAs)
@@ -1292,24 +1295,15 @@ def genotyping(base_fname,
         HLAs_default = {}
         read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default)
         read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default)
-        #HLA_lengths_default = {}
-        
         
         for HLA_gene, HLA_alleles in HLAs_default.items():
             for allele_name, seq in HLA_alleles.items():
                 if allele_name in default_allele_list:
                     HLA_lengths[HLA_gene][allele_name] = len(seq)
-        
-        #for allele_name, seq in HLAs_default.items():
-         #   if allele_name in default_allele_list:
-          #      HLA_lengths[allele_name] = len(seq)
-            #if (allele_name in default_allele_list):
-            #    HLA_lengths_default[allele_name] = len(seq)
-
 
     # Read HLA variants, and link information
     Vars, Var_list = {}, {}
-    for line in open(HLA_fnames[3]):
+    for line in open("%s.snp" % base_fname):
         var_id, var_type, allele, pos, data = line.strip().split('\t')
         pos = int(pos)
         if reference_type != "gene":
@@ -1341,22 +1335,12 @@ def genotyping(base_fname,
         Var_list[gene] = sorted(in_var_list)
         
     Links = {}
-    for line in open(HLA_fnames[5]):
+    for line in open("%s.link" % base_fname):
         var_id, alleles = line.strip().split('\t')
         alleles = alleles.split()
         assert not var_id in Links
         Links[var_id] = alleles
 
-    # Scoring schemes from Sangtae Kim (Illumina)'s implementation
-    # Currently not used.
-    """
-    max_qual_value = 100
-    match_score, mismatch_score = [0] * max_qual_value, [0] * max_qual_value
-    for qual in range(max_qual_value):
-        error_rate = 0.1 ** (qual / 10.0)
-        match_score[qual] = math.log(1.000000000001 - error_rate);
-        mismatch_score[qual] = math.log(error_rate / 3.0);
-    """
     # Test HLA typing
     test_list = []
     if simulation:
@@ -1369,7 +1353,11 @@ def genotyping(base_fname,
 
         test_passed = {}
         test_list = []
-        genes = list(set(hla_list) & set(HLA_names.keys()))
+        if base_fname == "hla":
+            genes = list(set(hla_list) & set(HLA_names.keys()))
+        else:
+            genes = HLA_names.keys()
+            
         if basic_test:
             for gene in genes:
                 HLA_gene_alleles = HLA_names[gene]
@@ -1383,33 +1371,21 @@ def genotyping(base_fname,
             for test_i in range(test_size):
                 test_pairs = []
                 for gene in genes:
-                    HLA_gene_alleles = []
-                    
+                    HLA_gene_alleles = []                    
                     for allele in HLA_names[gene]:
                         if allele.find("BACKBONE") != -1:
                             continue
                         HLA_gene_alleles.append(allele)
+
+                    # DK - temporary
+                    if len(HLA_gene_alleles) < 2:
+                        continue
+                        
                     nums = [i for i in range(len(HLA_gene_alleles))]
                     random.shuffle(nums)
                     test_pairs.append(sorted([HLA_gene_alleles[nums[i]] for i in range(allele_count)]))
                 test_list.append(test_pairs)
 
-        print test_list
-        if custom_allele_check:
-        
-            test_list = []
-            if basic_test:
-            #test_pairs = []
-            #allele_count = 2
-            #for allele in default_allele_list:
-            #nums = [i for i in range(len(default_allele_list))]
-            #random.shuffle(nums)
-            #test_pairs.append(sorted([default_allele_list[nums[i]] for i in range(allele_count)]))
-            #test_list.append(test_pairs)
-                for allele in default_allele_list:
-                    test_list.append([[allele]])
-        print test_list
-        
         for test_i in range(len(test_list)):
             if "test_id" in daehwan_debug:
                 daehwan_test_ids = daehwan_debug["test_id"].split('-')
@@ -1444,7 +1420,9 @@ def genotyping(base_fname,
                 read_fname = ["hla_input_1.fa", "hla_input_2.fa"]
 
             fastq = False
+            
             tmp_test_passed = HLA_typing(ex_path,
+                                         base_fname,
                                          simulation,
                                          reference_type,
                                          test_HLA_list,
@@ -1481,12 +1459,18 @@ def genotyping(base_fname,
             print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
     
     else: # With real reads or BAMs
-        print >> sys.stderr, "\t", ' '.join(hla_list)
+        if base_fname == "hla":
+            gene_list = hla_list
+        else:
+            gene_list = Vars.keys()
+        print >> sys.stderr, "\t", ' '.join(gene_list)
+
         fastq = True
         HLA_typing(ex_path,
+                   base_fname,
                    simulation,
                    reference_type,
-                   hla_list,
+                   gene_list,
                    partial,
                    refHLAs,
                    HLAs,                       
@@ -1516,7 +1500,7 @@ if __name__ == '__main__':
     parser.add_argument("--base",
                         dest="base_fname",
                         type=str,
-                        default="",
+                        default="hla",
                         help="base filename for backbone HLA sequence, HLA variants, and HLA linking info")
     parser.add_argument("--default-list",
                         dest = "default_allele_list",
@@ -1621,8 +1605,6 @@ if __name__ == '__main__':
     if len(args.exclude_allele_list) > 0:
         if args.exclude_allele_list.strip().isdigit():
             num_alleles = int(args.exclude_allele_list)
-            
-            
             if not os.path.exists("./Default-HLA/hla_backbone.fa"):
                 try:
                     os.mkdir("./Default-HLA")
@@ -1649,8 +1631,7 @@ if __name__ == '__main__':
             HLAs_default = {}
             #read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default)
             read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default)
-            
-
+    
             allele_names = list(HLAs_default['A'].keys())
             random.shuffle(allele_names)
             args.exclude_allele_list = allele_names[0:num_alleles]
@@ -1686,3 +1667,5 @@ if __name__ == '__main__':
                args.num_mismatch,
                args.verbose,
                debug)
+
+    
diff --git a/hisatgenotype_locus.py b/hisatgenotype_locus.py
new file mode 100755
index 0000000..f36ae48
--- /dev/null
+++ b/hisatgenotype_locus.py
@@ -0,0 +1,2335 @@
+#!/usr/bin/env python
+#
+# Copyright 2015, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, re
+import inspect, random
+import math
+from datetime import datetime, date, time
+from argparse import ArgumentParser, FileType
+from copy import deepcopy
+import hisatgenotype_typing_common as typing_common, hisatgenotype_gene_typing as gene_typing, hisatgenotype_assembly_graph as assembly_graph
+
+
+"""
+   var: ['single', 3300, 'G']
+   exons: [[301, 373], [504, 822], [1084, 1417], [2019, 2301], [2404, 2520], [2965, 2997], [3140, 3187], [3357, 3361]]
+"""
+def var_in_exon(var, exons):
+    exonic = False
+    var_type, var_left, var_data = var
+    var_right = var_left
+    if var_type == "deletion":
+        var_right = var_left + int(var_data) - 1
+    for exon_left, exon_right in exons:
+        if var_left >= exon_left and var_right <= exon_right:
+            return True
+    return False
+
+
+"""
+Report variant IDs whose var is within exonic regions
+"""
+def get_exonic_vars(Vars, exons):
+    vars = set()
+    for var_id, var in Vars.items():
+        var_type, var_left, var_data = var
+        var_right = var_left
+        if var_type == "deletion":
+            var_right = var_left + int(var_data) - 1
+        for exon_left, exon_right in exons:
+            if var_left >= exon_left and var_right <= exon_right:
+                vars.add(var_id)
+                
+    return vars
+
+
+"""
+Get representative alleles among those that share the same exonic sequences
+"""
+def get_rep_alleles(Links, exon_vars):
+    allele_vars = {}
+    for var, alleles in Links.items():
+        if var not in exon_vars:
+            continue
+        for allele in alleles:
+            if allele not in allele_vars:
+                allele_vars[allele] = set()
+            allele_vars[allele].add(var)
+
+    allele_groups = {}
+    for allele, vars in allele_vars.items():
+        vars = '-'.join(vars)
+        if vars not in allele_groups:
+            allele_groups[vars] = []
+        allele_groups[vars].append(allele)
+
+    allele_reps = {} # allele representatives
+    allele_rep_groups = {} # allele groups by allele representatives
+    for allele_members in allele_groups.values():
+        assert len(allele_members) > 0
+        allele_rep = allele_members[0]
+        allele_rep_groups[allele_rep] = allele_members
+        for allele_member in allele_members:
+            assert allele_member not in allele_reps
+            allele_reps[allele_member] = allele_rep
+
+    return allele_reps, allele_rep_groups
+    
+
+"""
+"""
+def error_correct(ref_seq,
+                  read_seq,
+                  read_pos,
+                  mpileup,
+                  Vars,
+                  Var_list,
+                  cmp_list,
+                  debug = False):
+    if debug:
+        print >> sys.stderr, cmp_list
+        print >> sys.stderr, read_seq
+
+    num_correction = 0
+    i = 0
+    while i < len(cmp_list):
+        type, left, length = cmp_list[i][:3]
+        assert length > 0
+        if left >= len(ref_seq):
+            break
+        if type == "match":
+            middle_cmp_list = []
+            last_j = 0
+            for j in range(length):
+                if read_pos + j >= len(read_seq) or \
+                   left + j >= len(ref_seq):
+                    continue
+                
+                read_bp, ref_bp = read_seq[read_pos + j], ref_seq[left + j]
+                assert left + j < len(mpileup)
+                nt_set = mpileup[left + j][0]
+                if len(nt_set) > 0 and read_bp not in nt_set:
+                    read_bp = 'N' if len(nt_set) > 1 else nt_set[0]                    
+                    read_seq = read_seq[:read_pos + j] + read_bp + read_seq[read_pos + j + 1:]
+                    assert read_bp != ref_bp
+                    new_cmp = ["mismatch", left + j, 1, "unknown"]
+                    num_correction += 1
+                    if read_bp != 'N':
+                        var_idx = typing_common.lower_bound(Var_list, left + j)
+                        while var_idx < len(Var_list):
+                            var_pos, var_id = Var_list[var_idx]
+                            if var_pos > left + j:
+                                break
+                            if var_pos == left + j:
+                                var_type, _, var_data = Vars[var_id]
+                                if var_type == "single" and read_bp == var_data:
+                                    new_cmp[3] = var_id
+                                    break                                                        
+                            var_idx += 1
+                    if j > last_j:
+                        middle_cmp_list.append(["match", left + last_j, j- last_j])
+                    middle_cmp_list.append(new_cmp)
+                    last_j = j + 1
+            if last_j < length:
+                middle_cmp_list.append(["match", left + last_j, length - last_j])
+
+            assert len(middle_cmp_list) > 0
+            cmp_list = cmp_list[:i] + middle_cmp_list + cmp_list[i+1:]
+            i += (len(middle_cmp_list) - 1)
+        else:
+            assert type == "mismatch"
+            read_bp, ref_bp = read_seq[read_pos], ref_seq[left]
+            assert left < len(mpileup)
+            nt_set = mpileup[left][0]
+
+            if debug:
+                print >> sys.stderr, left, read_bp, ref_bp, mpileup[left]
+
+            if len(nt_set) > 0 and read_bp not in nt_set:
+                read_bp = 'N' if len(nt_set) > 1 else nt_set[0]
+                read_seq = read_seq[:read_pos] + read_bp + read_seq[read_pos+1:]
+                if read_bp == 'N':
+                    cmp_list[i][3] = "unknown"
+                elif read_bp == ref_bp:
+                    cmp_list[i] = ["match", left, 1]
+                    num_correction += 1
+                else:
+                    cmp_list[i][3] = "unknown"
+                    var_idx = typing_common.lower_bound(Var_list, left)
+                    while var_idx < len(Var_list):
+                        var_pos, var_id = Var_list[var_idx]
+                        if var_pos > left:
+                            break
+                        if var_pos == left:
+                            var_type, _, var_data = Vars[var_id]
+                            if var_type == "single" and read_bp == var_data:
+                                cmp_list[i][3] = var_id
+                                break                                                        
+                        var_idx += 1
+
+                if debug:
+                    print >> sys.stderr, left, read_bp, ref_bp, mpileup[left]
+                    print >> sys.stderr, cmp_list[i]
+
+        read_pos += length
+        i += 1
+
+    # Combine matches
+    i = 0
+    while i < len(cmp_list):
+        type, left, length = cmp_list[i][:3]
+        if type == "match" and i + 1 < len(cmp_list):
+            type2, left2, length2 = cmp_list[i+1][:3]
+            if type2 == "match":
+                cmp_list[i] = [type, left, length + length2]
+                cmp_list = cmp_list[:i+1] + cmp_list[i+2:]
+                continue
+        i += 1
+
+    if debug:
+        print >> sys.stderr, cmp_list
+        print >> sys.stderr, read_seq
+                            
+    return cmp_list, read_seq, num_correction
+
+
+"""
+"""
+def typing(simulation,
+           base_fname,
+           locus_list,
+           genotype_genome,
+           partial,
+           partial_alleles,
+           refGenes,
+           Genes,
+           Gene_names,
+           Gene_lengths,
+           refGene_loci,
+           Vars,
+           Var_list,
+           Links,
+           aligners,
+           num_editdist,
+           assembly,
+           output_base,
+           error_correction,
+           allow_discordant,
+           display_alleles,
+           fastq,
+           read_fname,
+           alignment_fname,
+           num_frag_list,
+           read_len,
+           fragment_len,
+           threads,
+           best_alleles,
+           verbose):
+    if simulation:
+        test_passed = {}
+    report_file = open(output_base + ".report", 'w')
+    for aligner, index_type in aligners:
+        for f_ in [sys.stderr, report_file]:
+            if index_type == "graph":
+                print >> f_, "\n\t\t%s %s" % (aligner, index_type)
+            else:
+                print >> f_, "\n\t\t%s %s" % (aligner, index_type)
+
+        remove_alignment_file = False
+        if alignment_fname == "":
+            # Align reads, and sort the alignments into a BAM file
+            remove_alignment_file = True
+            if simulation:
+                alignment_fname = "%s_output.bam" % base_fname
+            else:
+                alignment_fname = read_fname[0].split('/')[-1]
+                alignment_fname = alignment_fname.split('.')[0] + ".bam"
+                
+            typing_common.align_reads(aligner,
+                                      simulation,
+                                      genotype_genome if genotype_genome != "" else (base_fname + "." + index_type),
+                                      index_type,
+                                      base_fname,
+                                      read_fname,
+                                      fastq,
+                                      threads,
+                                      alignment_fname,
+                                      verbose)
+            
+        for test_Gene_names in locus_list:
+            if simulation:
+                gene = test_Gene_names[0].split('*')[0]
+            else:
+                gene = test_Gene_names
+            ref_allele = refGenes[gene]
+            ref_seq = Genes[gene][ref_allele]
+            ref_locus = refGene_loci[gene]
+            ref_exons = ref_locus[-1]
+            
+            novel_var_count = 0        
+            gene_vars, gene_var_list = deepcopy(Vars[gene]), deepcopy(Var_list[gene])
+            cur_maxright = -1
+            gene_var_maxrights = {}
+            for var_pos, var_id in gene_var_list:
+                var_type, var_pos, var_data = gene_vars[var_id]
+                if var_type == "deletion":
+                    var_pos = var_pos + int(var_data) - 1
+                cur_maxright = max(cur_maxright, var_pos)
+                gene_var_maxrights[var_id] = cur_maxright
+                    
+            var_count = {}
+            def add_novel_var(gene_vars,
+                              gene_var_list,
+                              novel_var_count,
+                              var_type,
+                              var_pos,
+                              var_data):
+                var_idx = typing_common.lower_bound(gene_var_list, var_pos)
+                while var_idx < len(gene_var_list):
+                    pos_, id_ = gene_var_list[var_idx]
+                    if pos_ > var_pos:
+                        break
+                    if pos_ == var_pos:
+                        type_, _, data_ = gene_vars[id_]
+                        assert type_ != var_type or data_ != var_data
+                        if type_ != var_type:
+                            if var_type == "insertion":
+                                break
+                            elif var_type == "single" and type_ == "deletion":
+                                break
+                        else:
+                            if var_data < data_:
+                                break
+                    var_idx += 1
+                var_id = "nv%d" % novel_var_count
+                assert var_id not in gene_vars
+                gene_vars[var_id] = [var_type, var_pos, var_data]
+                gene_var_list.insert(var_idx, [var_pos, var_id])                
+                return var_id, novel_var_count + 1
+
+            if not os.path.exists(alignment_fname + ".bai"):
+                os.system("samtools index %s" % alignment_fname)
+            # Read alignments
+            alignview_cmd = ["samtools",
+                             "view",
+                             alignment_fname]
+            base_locus = 0
+            if genotype_genome != "":
+                _, chr, left, right = ref_locus[:4]
+                alignview_cmd += ["%s:%d-%d" % (chr, left+1, right+1)]
+                base_locus = left
+
+            if index_type == "graph":
+                alignview_cmd += [ref_allele]
+                mpileup = typing_common.get_mpileup(alignview_cmd,
+                                                    ref_seq,
+                                                    base_locus,
+                                                    gene_vars,
+                                                    allow_discordant)
+
+                if base_fname == "codis":
+                    pair_interdist = typing_common.get_pair_interdist(alignview_cmd,
+                                                                      simulation,
+                                                                      verbose)
+                else:
+                    pair_interdist = None
+
+                bamview_proc = subprocess.Popen(alignview_cmd,
+                                                stdout=subprocess.PIPE,
+                                                stderr=open("/dev/null", 'w'))
+
+                sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
+                alignview_proc = subprocess.Popen(sort_read_cmd,
+                                                  stdin=bamview_proc.stdout,
+                                                  stdout=subprocess.PIPE,
+                                                  stderr=open("/dev/null", 'w'))
+            else:
+                alignview_proc = subprocess.Popen(alignview_cmd,
+                                             stdout=subprocess.PIPE,
+                                             stderr=open("/dev/null", 'w'))
+
+            # Map each allele of this gene to the IDs of the variants linked to it
+            allele_vars = {}
+            for _, var_id in gene_var_list:
+                allele_list = Links[var_id]
+                for allele_id in allele_list:
+                    if allele_id not in Genes[gene]:
+                        continue
+                    if allele_id not in allele_vars:
+                        allele_vars[allele_id] = [var_id]
+                    else:
+                        allele_vars[allele_id].append(var_id)
+
+            # Extract variants that are within exons
+            exon_vars = get_exonic_vars(gene_vars, ref_exons)
+
+            # Store nodes that represent alleles
+            allele_nodes = {}
+            def create_allele_node(allele_name):
+                if allele_name in allele_nodes:
+                    return allele_nodes[allele_name]
+                if allele_name in allele_vars:
+                    var_ids = allele_vars[allele_name]
+                else:
+                    var_ids = []
+                seq = list(ref_seq)  # sequence that node represents
+                var = ["" for i in range(len(ref_seq))]  # how sequence is related to backbone
+                for var_id in var_ids:
+                    assert var_id in gene_vars
+                    var_type, var_pos, var_data = gene_vars[var_id]
+                    assert var_pos >= 0 and var_pos < len(ref_seq)
+                    if var_type == "single":
+                        seq[var_pos] = var_data
+                        var[var_pos] = var_id
+                    elif var_type == "deletion":
+                        del_len = int(var_data)
+                        assert var_pos + del_len <= len(ref_seq)
+                        seq[var_pos:var_pos + del_len] = ['D'] * del_len
+                        var[var_pos:var_pos + del_len] = [var_id] * del_len
+                    else:
+                        # DK - to be implemented for insertions
+                        assert var_type == "insertion"
+
+                qual = ' ' * len(seq)
+                allele_node = assembly_graph.Node(allele_name,
+                                                  0,
+                                                  seq,
+                                                  qual,
+                                                  var,
+                                                  ref_seq,
+                                                  gene_vars,
+                                                  mpileup,
+                                                  simulation)
+                allele_nodes[allele_name] = allele_node
+                return allele_node
+
+            # Allele nodes for the alleles under test (test_Gene_names); these are
+            # only built for simulated data, otherwise the dictionary stays empty
+            true_allele_nodes = {}
+            if simulation:
+                for allele_name in test_Gene_names:
+                    true_allele_nodes[allele_name] = create_allele_node(allele_name)
+
+            # One allele node for every entry of display_alleles
+            display_allele_nodes = {}
+            for display_allele in display_alleles:
+                display_allele_nodes[display_allele] = create_allele_node(display_allele)
+
+            # Assembly graph, seeded with the backbone sequence, its variants and
+            # exons, and the allele nodes created above
+            asm_graph = assembly_graph.Graph(ref_seq,
+                                             gene_vars,
+                                             ref_exons,
+                                             partial_alleles,
+                                             true_allele_nodes,
+                                             {}, # predicted_allele_nodes, which is empty for now
+                                             display_allele_nodes,
+                                             simulation)
+
+            # Choose allele representatives from those that share the same exonic sequences
+            allele_reps, allele_rep_groups = get_rep_alleles(Links, exon_vars)
+            # Set of representative alleles; passed to add_stat below to restrict
+            # exon-based counting (for "hla") to the representatives
+            allele_rep_set = set(allele_reps.values())
+
+            # For checking alternative alignments near the ends of alignments
+            # (the last argument, verbose >= 2, is presumably a debug switch -- TODO confirm)
+            Alts_left, Alts_right = typing_common.get_alternatives(ref_seq,
+                                                                   allele_vars,
+                                                                   gene_vars,
+                                                                   gene_var_list,
+                                                                   verbose >= 2)
+
+            def haplotype_alts_list(haplotype_alts, left = True):
+                haplotype_list = []
+                for haplotype in haplotype_alts.keys():
+                    if left:
+                        pos = int(haplotype.split('-')[-1])
+                    else:
+                        pos = int(haplotype.split('-')[0])
+                    haplotype_list.append([pos, haplotype])
+                return sorted(haplotype_list, cmp = lambda a, b: a[0] - b[0])
+
+            # Position-sorted [position, haplotype] lists of the alternatives: keyed on the
+            # last field of each haplotype for Alts_left and on the first field for Alts_right
+            Alts_left_list, Alts_right_list = haplotype_alts_list(Alts_left, True), haplotype_alts_list(Alts_right, False)
+
+            # Count alleles
+            # *_counts: allele -> number of reads for which the allele had the top count
+            # *_cmpt:   '-'-joined group of top alleles -> number of reads
+            # (both are filled in by add_stat; the Gene_gen_* pair receives the counts
+            #  derived from whole haplotypes rather than from exon-restricted ones)
+            Gene_counts, Gene_cmpt = {}, {}
+            Gene_gen_counts, Gene_gen_cmpt = {}, {}
+            # num_reads is incremented per accepted alignment, num_pairs per read ID change
+            num_reads, num_pairs = 0, 0
+
+            # For debugging purposes
+            debug_allele_names = set(test_Gene_names) if simulation and verbose >= 2 else set()
+
+            # Read information
+            prev_read_id = None
+            prev_right_pos = 0
+            prev_lines = []
+            # Read IDs already seen as a left / right mate; used below to skip any
+            # further alignment record of the same mate
+            left_read_ids, right_read_ids = set(), set()
+            if index_type == "graph":
+                # nodes for reads
+                read_nodes = []
+                read_vars_list = []
+
+                # 
+                def add_count(count_per_read, ht, add):
+                    orig_ht = ht
+                    ht = ht.split('-')
+
+                    assert len(ht) >= 2
+                    left, right = int(ht[0]), int(ht[-1])
+                    assert left <= right
+
+                    ht = ht[1:-1]
+                    alleles = set(Genes[gene].keys()) - set([ref_allele])
+                    for i in range(len(ht)):
+                        var_id = ht[i]
+                        if var_id.startswith("nv"):
+                            continue
+                        alleles &= set(Links[var_id])
+                    ht = set(ht)
+
+                    tmp_alleles = set()
+                    var_idx = typing_common.lower_bound(gene_var_list, right + 1)
+                    var_idx = min(var_idx, len(gene_var_list) - 1)
+                    while var_idx >= 0:
+                        _, var_id = gene_var_list[var_idx]
+                        if var_id.startswith("nv") or var_id in ht:
+                            var_idx -= 1
+                            continue
+                        if var_id in gene_var_maxrights and gene_var_maxrights[var_id] < left:
+                            break
+                        var_type, var_left, var_data = gene_vars[var_id]
+                        var_right = var_left
+                        if var_type == "deletion":
+                            var_right = var_left + int(var_data) - 1
+                        if (var_left >= left and var_left <= right) or \
+                           (var_right >= left and var_right <= right):
+                            tmp_alleles |= set(Links[var_id])
+                        var_idx -= 1                        
+                    alleles -= tmp_alleles
+                    
+                    for allele in alleles:
+                        count_per_read[allele] += add
+
+                    return len(alleles)
+
+                # Identify best pairs
+                def choose_pairs(left_positive_hts, right_positive_hts):
+                    if len(left_positive_hts) > 0 and \
+                       len(right_positive_hts) > 0 and \
+                       max(len(left_positive_hts), len(right_positive_hts)) >= 2:
+                        expected_inter_dist = pair_interdist
+                        """
+                        if simulation:
+                            expected_inter_dist = fragment_len - read_len * 2
+                        """
+                            
+                        best_diff = sys.maxint
+                        picked = []                                
+                        for left_ht_str in left_positive_hts:
+                            left_ht = left_ht_str.split('-')
+                            l_left, l_right = int(left_ht[0]), int(left_ht[-1])
+                            for right_ht_str in right_positive_hts:
+                                right_ht = right_ht_str.split('-')
+                                r_left, r_right = int(right_ht[0]), int(right_ht[-1])
+                                if l_right < r_right:
+                                    inter_dist = r_left - l_right - 1
+                                else:
+                                    inter_dist = l_left - r_right - 1
+
+                                cur_diff = abs(expected_inter_dist - inter_dist)
+                                if best_diff > cur_diff:
+                                    best_diff = cur_diff
+                                    picked = [[left_ht_str, right_ht_str]]
+                                elif best_diff == cur_diff:
+                                    picked.append([left_ht_str, right_ht_str])
+
+                        assert len(picked) > 0
+
+                        left_positive_hts, right_positive_hts = set(), set()
+                        for left_ht_str, right_ht_str in picked:
+                            left_positive_hts.add(left_ht_str)
+                            right_positive_hts.add(right_ht_str)
+
+                    return left_positive_hts, right_positive_hts
+
+                def get_exon_haplotypes(ht, exons):
+                    if len(exons) <= 0:
+                        return []
+                    
+                    debug_ht = deepcopy(ht)
+                    ht = ht.split('-')
+                    assert len(ht) >= 2
+                    ht[0], ht[-1] = int(ht[0]), int(ht[-1])
+                    exon_hts = []
+                    for e_left, e_right in exons:
+                        assert len(ht) >= 2
+                        ht_left, ht_right = ht[0], ht[-1]
+                        if e_left > ht_right or e_right < ht_left:
+                            continue
+
+                        new_ht = deepcopy(ht)
+                        if ht_left < e_left:
+                            split = False
+                            for i in range(1, len(new_ht) - 1):
+                                var_id = new_ht[i]
+                                type, left, data = gene_vars[var_id]
+                                if (type != "deletion" and left >= e_left) or \
+                                   (type == "deletion" and left - 1 >= e_left):
+                                    ht_left = e_left
+                                    new_ht = [ht_left] + new_ht[i:]
+                                    split = True
+                                    break
+                                if type == "deletion":
+                                    right = left + int(data)
+                                    if right >= e_left:
+                                        ht_left = right
+                                        new_ht = [right] + new_ht[i+1:]
+                                        split = True
+                                        break
+                            if not split:
+                                ht_left = e_left
+                                new_ht = [ht_left, ht_right]
+                        assert ht_left >= e_left
+                        if ht_right > e_right:
+                            split = False
+                            for i in reversed(range(1, len(new_ht) - 1)):
+                                var_id = new_ht[i]
+                                type, right, data = gene_vars[var_id]
+                                if type == "deletion":
+                                    right = right + int(data) - 1
+                                if (type != "deletion" and right <= e_right) or \
+                                   (type == "deletion" and right + 1 <= e_right):
+                                    ht_right = e_right
+                                    new_ht = new_ht[:i+1] + [ht_right]
+                                    split = True
+                                    break
+                                if type == "deletion":
+                                    left = right - int(data)
+                                    if left <= e_right:
+                                        ht_right = left
+                                        new_ht = new_ht[:i] + [ht_right]
+                                        split = True
+                                        break
+                            if not split:
+                                ht_right = e_right
+                                new_ht = [ht_left, ht_right]
+
+                        if len(new_ht) == 2:
+                            new_ht = "%d-%d" % (new_ht[0], new_ht[-1])
+                        else:
+                            assert len(new_ht) > 2
+                            new_ht = "%d-%s-%d" % (new_ht[0], '-'.join(new_ht[1:-1]), new_ht[-1])
+                        assert ht_left <= ht_right
+                        exon_hts.append(new_ht)
+
+                    return exon_hts
+
+                # Positive evidence for left and right reads
+                left_positive_hts, right_positive_hts = set(), set()
+                
+                # Cigar regular expression
+                cigar_re = re.compile('\d+\w')
+                for line in alignview_proc.stdout:
+                    line = line.strip()
+                    cols = line.split()
+                    read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
+                    node_read_id = orig_read_id = read_id
+                    if simulation:
+                        read_id = read_id.split('|')[0]
+                    read_seq, read_qual = cols[9], cols[10]
+                    flag, pos = int(flag), int(pos)
+                    pos -= (base_locus + 1)
+                    if pos < 0:
+                        continue
+
+                    # Unaligned?
+                    if flag & 0x4 != 0:
+                        if simulation and verbose >= 2:
+                            print "Unaligned"
+                            print "\t", line
+                        continue
+
+                    # Concordantly mapped?
+                    if flag & 0x2 != 0:
+                        concordant = True
+                    else:
+                        concordant = False
+
+                    NM, Zs, MD, NH = "", "", "", ""
+                    for i in range(11, len(cols)):
+                        col = cols[i]
+                        if col.startswith("Zs"):
+                            Zs = col[5:]
+                        elif col.startswith("MD"):
+                            MD = col[5:]
+                        elif col.startswith("NM"):
+                            NM = int(col[5:])
+                        elif col.startswith("NH"):
+                            NH = int(col[5:])
+
+                    if NM > num_editdist:
+                        continue
+
+                    # Only consider unique alignment
+                    if NH > 1:
+                        continue
+
+                    # Concordantly aligned mate pairs
+                    if not allow_discordant and not concordant:
+                        continue
+
+                    # Left read?
+                    is_left_read = flag & 0x40 != 0
+                    if is_left_read:
+                        if read_id in left_read_ids:
+                            continue
+                        left_read_ids.add(read_id)
+                        if not simulation:
+                            node_read_id += '|L'
+                    else: # Right read?
+                        assert flag & 0x80 != 0
+                        if read_id in right_read_ids:
+                            continue
+                        right_read_ids.add(read_id)
+                        if not simulation:
+                            node_read_id += '|R'
+
+                    if Zs:
+                        Zs = Zs.split(',')             
+
+                    assert MD != ""
+                    MD_str_pos, MD_len = 0, 0
+                    Zs_pos, Zs_i = 0, 0
+                    for _i in range(len(Zs)):
+                        Zs[_i] = Zs[_i].split('|')
+                        Zs[_i][0] = int(Zs[_i][0])
+                    if Zs_i < len(Zs):
+                        Zs_pos += Zs[Zs_i][0]
+                    read_pos, left_pos = 0, pos
+                    right_pos = left_pos
+                    cigars = cigar_re.findall(cigar_str)
+                    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
+                    cmp_list = []
+                    num_error_correction = 0
+                    likely_misalignment = False
+
+                    # Extract variants w.r.t backbone from CIGAR string
+                    softclip = [0, 0]
+                    for i in range(len(cigars)):
+                        cigar_op, length = cigars[i]
+                        if cigar_op == 'M':
+                            first = True
+                            MD_len_used = 0
+                            cmp_list_i = len(cmp_list)
+                            while True:
+                                if not first or MD_len == 0:
+                                    if MD[MD_str_pos].isdigit():
+                                        num = int(MD[MD_str_pos])
+                                        MD_str_pos += 1
+                                        while MD_str_pos < len(MD):
+                                            if MD[MD_str_pos].isdigit():
+                                                num = num * 10 + int(MD[MD_str_pos])
+                                                MD_str_pos += 1
+                                            else:
+                                                break
+                                        MD_len += num
+                                # Insertion or full match followed
+                                if MD_len >= length:
+                                    MD_len -= length
+                                    if length > MD_len_used:
+                                        cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
+                                    break
+                                first = False
+                                read_base = read_seq[read_pos + MD_len]
+                                MD_ref_base = MD[MD_str_pos]
+                                MD_str_pos += 1
+                                assert MD_ref_base in "ACGT"
+                                if MD_len > MD_len_used:
+                                    cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])
+
+                                _var_id = "unknown"
+                                if read_pos + MD_len == Zs_pos and Zs_i < len(Zs):
+                                    assert Zs[Zs_i][1] == 'S'
+                                    _var_id = Zs[Zs_i][2]
+                                    Zs_i += 1
+                                    Zs_pos += 1
+                                    if Zs_i < len(Zs):
+                                        Zs_pos += Zs[Zs_i][0]
+                                else:
+                                    # Search for a known (yet not indexed) variant or a novel variant
+                                    ref_pos = right_pos + MD_len
+                                    var_idx = typing_common.lower_bound(gene_var_list, ref_pos)
+                                    while var_idx < len(gene_var_list):
+                                        var_pos, var_id = gene_var_list[var_idx]
+                                        if var_pos > ref_pos:
+                                            break
+                                        if var_pos == ref_pos:
+                                            var_type, _, var_data = gene_vars[var_id]
+                                            if var_type == "single" and var_data == read_base:
+                                                _var_id = var_id
+                                                break
+                                        var_idx += 1
+
+                                cmp_list.append(["mismatch", right_pos + MD_len, 1, _var_id])
+                                MD_len_used = MD_len + 1
+                                MD_len += 1
+                                # Full match
+                                if MD_len == length:
+                                    MD_len = 0
+                                    break
+
+                            # Correction for sequencing errors and update for cmp_list
+                            if error_correction:
+                                assert cmp_list_i < len(cmp_list)
+                                new_cmp_list, read_seq, _num_error_correction = error_correct(ref_seq,
+                                                                                              read_seq,
+                                                                                              read_pos,
+                                                                                              mpileup,
+                                                                                              gene_vars,
+                                                                                              gene_var_list,
+                                                                                              cmp_list[cmp_list_i:],
+                                                                                              node_read_id == "aHSQ1008:175:C0JVFACXX:5:1109:17665:21583|L")
+                                cmp_list = cmp_list[:cmp_list_i] + new_cmp_list
+                                num_error_correction += _num_error_correction
+
+                        elif cigar_op == 'I':
+                            _var_id = "unknown"
+                            if read_pos == Zs_pos and Zs_i < len(Zs):
+                                assert Zs[Zs_i][1] == 'I'
+                                _var_id = Zs[Zs_i][2]
+                                Zs_i += 1
+                                if Zs_i < len(Zs):
+                                    Zs_pos += Zs[Zs_i][0]
+                            else:
+                                # Search for a known (yet not indexed) variant or a novel variant
+                                var_idx = typing_common.lower_bound(gene_var_list, right_pos)
+                                while var_idx < len(gene_var_list):
+                                    var_pos, var_id = gene_var_list[var_idx]
+                                    if var_pos > right_pos:
+                                        break
+                                    if var_pos == right_pos:
+                                        var_type, _, var_data = gene_vars[var_id]
+                                        if var_type == "insertion" and len(var_data) == length:
+                                            _var_id = var_id
+                                            break
+                                    var_idx += 1                            
+                            cmp_list.append(["insertion", right_pos, length, _var_id])
+                            if 'N' in read_seq[read_pos:read_pos+length]:
+                                likely_misalignment = True
+                                
+                        elif cigar_op == 'D':
+                            if MD[MD_str_pos] == '0':
+                                MD_str_pos += 1
+                            assert MD[MD_str_pos] == '^'
+                            MD_str_pos += 1
+                            while MD_str_pos < len(MD):
+                                if not MD[MD_str_pos] in "ACGT":
+                                    break
+                                MD_str_pos += 1
+                            _var_id = "unknown"
+                            if read_pos == Zs_pos and \
+                               Zs_i < len(Zs) and \
+                               Zs[Zs_i][1] == 'D':
+                                _var_id = Zs[Zs_i][2]
+                                Zs_i += 1
+                                if Zs_i < len(Zs):
+                                    Zs_pos += Zs[Zs_i][0]
+                            else:
+                                # Search for a known (yet not indexed) variant or a novel variant
+                                var_idx = typing_common.lower_bound(gene_var_list, right_pos)
+                                while var_idx < len(gene_var_list):
+                                    var_pos, var_id = gene_var_list[var_idx]
+                                    if var_pos > right_pos:
+                                        break
+                                    if var_pos == right_pos:
+                                        var_type, _, var_data = gene_vars[var_id]
+                                        if var_type == "deletion" and int(var_data) == length:
+                                            _var_id = var_id
+                                            break
+                                    var_idx += 1
+
+                            cmp_list.append(["deletion", right_pos, length, _var_id])
+
+                            # Check if this deletion is artificial alignment
+                            if right_pos < len(mpileup):
+                                del_count, nt_count = 0, 0
+                                for nt, value in mpileup[right_pos][1].items():
+                                    count = value[0]
+                                    if nt == 'D':
+                                        del_count += count
+                                    else:
+                                        nt_count += count
+
+                                # DK - debugging purposes
+                                if base_fname == "hla":
+                                    if del_count * 6 < nt_count: # and nt_count >= 15:
+                                        likely_misalignment = True
+                            
+                        elif cigar_op == 'S':
+                            if i == 0:
+                                softclip[0] = length
+                                Zs_pos += length
+                            else:
+                                assert i + 1 == len(cigars)
+                                softclip[1] = length
+                        else:                    
+                            assert cigar_op == 'N'
+                            assert False
+                            cmp_list.append(["intron", right_pos, length])
+
+                        if cigar_op in "MND":
+                            right_pos += length
+
+                        if cigar_op in "MIS":
+                            read_pos += length
+
+                    # Remove softclip in cigar and modify read_seq and read_qual accordingly
+                    if sum(softclip) > 0:
+                        if softclip[0] > 0:
+                            cigars = cigars[1:]
+                            read_seq = read_seq[softclip[0]:]
+                            read_qual = read_qual[softclip[0]:]
+                        if softclip[1] > 0:
+                            cigars = cigars[:-1]
+                            read_seq = read_seq[:-softclip[1]]
+                            read_qual = read_qual[:-softclip[1]]
+
+                        cigar_str = ""
+                        for type, length in cigars:
+                            cigar_str += str(length)
+                            cigar_str += type
+                   
+                    if right_pos > len(ref_seq):
+                        continue
+
+                    if num_error_correction > max(1, num_editdist):
+                        continue
+                        
+                    if likely_misalignment:
+                        continue
+
+                    # Add novel variants
+                    read_pos = 0
+                    for cmp_i in range(len(cmp_list)):
+                        type_, pos_, length_ = cmp_list[cmp_i][:3]
+                        if type_ != "match":
+                            var_id_ = cmp_list[cmp_i][3]
+                            if var_id_ == "unknown":
+                                add = True
+                                if type_ == "mismatch":
+                                    data_ = read_seq[read_pos]
+                                    if data_ == 'N':
+                                        add = False
+                                elif type_ == "deletion":
+                                    data_ = str(length_)
+                                else:
+                                    assert type_ == "insertion"
+                                    data_ = read_seq[read_pos:read_pos + length_]
+                                if add:
+                                    var_id, novel_var_count = add_novel_var(gene_vars,
+                                                                            gene_var_list,
+                                                                            novel_var_count,
+                                                                            type_ if type_ != "mismatch" else "single",
+                                                                            pos_,
+                                                                            data_)
+                                    cmp_list[cmp_i][3] = var_id
+                            if var_id not in var_count:
+                                var_count[var_id] = 1
+                            else:
+                                var_count[var_id] += 1
+                                
+                        if type_ != "deletion":
+                            read_pos += length_
+
+                    # Count the number of reads aligned uniquely with some constraints
+                    num_reads += 1
+
+                    def add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, include_alleles = set()):
+                        max_count = max(Gene_count_per_read.values())
+                        cur_cmpt = set()
+                        for allele, count in Gene_count_per_read.items():
+                            if count < max_count:
+                                continue
+                            if len(include_alleles) > 0 and allele not in include_alleles:
+                                continue
+                            
+                            cur_cmpt.add(allele)                    
+                            if allele not in Gene_counts:
+                                Gene_counts[allele] = 1
+                            else:
+                                Gene_counts[allele] += 1
+
+                        if len(cur_cmpt) == 0:
+                            return ""
+
+                        if verbose >= 2:
+                            alleles = ["", ""]
+                            allele1_found, allele2_found = False, False
+                            if alleles[0] != "":
+                                for allele, count in Gene_count_per_read.items():
+                                    if count < max_count:
+                                        continue
+                                    if allele == alleles[0]:
+                                        allele1_found = True
+                                    elif allele == alleles[1]:
+                                        allele2_found = True
+                                if allele1_found != allele2_found:
+                                    print >> sys.stderr, alleles[0], Gene_count_per_read[alleles[0]]
+                                    print >> sys.stderr, alleles[1], Gene_count_per_read[alleles[1]]
+                                    if allele1_found:
+                                        print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, Gene_count_per_read[alleles[1]]))
+                                    else:
+                                        print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, Gene_count_per_read[alleles[0]]))
+
+                        cur_cmpt = sorted(list(cur_cmpt))
+                        cur_cmpt = '-'.join(cur_cmpt)
+                        if not cur_cmpt in Gene_cmpt:
+                            Gene_cmpt[cur_cmpt] = 1
+                        else:
+                            Gene_cmpt[cur_cmpt] += 1
+
+                        return cur_cmpt
+
+                    if read_id != prev_read_id:
+                        if prev_read_id != None:
+                            num_pairs += 1
+                            if base_fname == "codis" and gene == "D18S51":
+                                left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts)
+
+                            for positive_ht in left_positive_hts | right_positive_hts:
+                                exon_hts = get_exon_haplotypes(positive_ht, ref_exons)
+
+                                if prev_read_id == "aHSQ1008:175:C0JVFACXX:5:1109:17665:21583":
+                                    print "positive_ht:", positive_ht, "exon_hts:", exon_hts
+                                    
+                                for exon_ht in exon_hts:
+                                    add_count(Gene_count_per_read, exon_ht, 1)
+                                add_count(Gene_gen_count_per_read, positive_ht, 1)
+
+                            # DK - debugging purposes
+                            if prev_read_id.startswith("a30"):
+                                print Gene_gen_count_per_read
+
+                            # DK - debugging purposes
+                            """
+                            debug_allele_id = "A*02:406"
+                            assert debug_allele_id in Gene_count_per_read
+                            debug_max_read_count = max(Gene_count_per_read.values())
+                            debug_read_count = Gene_count_per_read[debug_allele_id]
+                            if debug_read_count == debug_max_read_count and \
+                               Gene_count_per_read["A*11:01:01:01"] < debug_max_read_count and \
+                               Gene_count_per_read["A*02:01:01:01"] < debug_max_read_count:
+                                print prev_read_id
+                                None
+                            if prev_read_id == "HSQ1008:175:C0JVFACXX:5:1109:17665:21583":
+                                for line in prev_lines:
+                                    print line
+                                print "left_positive_hts :", left_positive_hts
+                                print "right_positive_hts:", right_positive_hts
+                                print "exon:", debug_read_count, "max:", debug_max_read_count
+                                print "gen:", Gene_gen_count_per_read[debug_allele_id], "max:", max(Gene_gen_count_per_read.values())
+
+                                for allele_id, count in Gene_count_per_read.items():
+                                    if count == debug_max_read_count:
+                                        None
+                                        # print "allele max:", allele_id, count
+                                # sys.exit(1)
+                                None
+                            """
+                                
+
+                            cur_cmpt, cur_cmpt_gen = "", ""
+                            if base_fname == "hla":
+                                cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, allele_rep_set)
+                                cur_cmpt_gen = add_stat(Gene_gen_cmpt, Gene_gen_counts, Gene_gen_count_per_read)
+                            else:
+                                cur_cmpt = add_stat(Gene_gen_cmpt, Gene_gen_counts, Gene_gen_count_per_read)
+                            for read_id_, read_node in read_nodes:
+                                asm_graph.add_node(read_id_,
+                                                   read_node,
+                                                   simulation)
+                            read_nodes, read_var_list = [], []
+                            if simulation and \
+                               verbose >= 2 and \
+                               base_fname in ["hla", "codis"]:
+                                cur_cmpt = cur_cmpt.split('-') if cur_cmpt != "" else set()
+                                cur_cmpt_gen = cur_cmpt_gen.split('-') if cur_cmpt_gen != "" else set()
+                                show_debug = (partial and cur_cmpt != "" and not set(cur_cmpt) & set(test_Gene_names)) or \
+                                              (not partial and cur_cmpt_gen != "" and not set(cur_cmpt_gen) & set(test_Gene_names))
+                                              
+                                if show_debug:
+                                    print "%s are chosen instead of %s" % (cur_cmpt if partial else cur_cmpt_gen, '-'.join(test_Gene_names))
+                                    for prev_line in prev_lines:
+                                        print "\t", prev_line
+
+                            prev_lines = []
+
+                        left_positive_hts, right_positive_hts = set(), set()
+                        
+                        Gene_count_per_read, Gene_gen_count_per_read = {}, {}
+                        for Gene_name in Gene_names[gene]:
+                            if Gene_name.find("BACKBONE") != -1:
+                                continue
+                            Gene_count_per_read[Gene_name] = 0
+                            Gene_gen_count_per_read[Gene_name] = 0
+
+                    prev_lines.append(line)
+
+                    # Remove mismatches due to unknown or novel variants
+                    cmp_list2 = []
+                    for cmp in cmp_list:
+                        cmp = deepcopy(cmp)
+                        type, pos, length = cmp[:3]
+                        if type == "match":
+                            if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match":
+                                cmp_list2[-1][2] += length
+                            else:
+                                cmp_list2.append(cmp)
+                        elif type == "mismatch" and \
+                             (cmp[3] == "unknown" or cmp[3].startswith("nv")):
+                            if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match":
+                                cmp_list2[-1][2] += 1
+                            else:
+                                cmp_list2.append(["match", pos, 1])
+                        else:
+                            cmp_list2.append(cmp)
+                    cmp_list_left, cmp_list_right, cmp_left_alts, cmp_right_alts = \
+                    typing_common.identify_ambigious_diffs(ref_seq,
+                                                           gene_vars,
+                                                           Alts_left,
+                                                           Alts_right,
+                                                           Alts_left_list,
+                                                           Alts_right_list,
+                                                           cmp_list2,
+                                                           verbose,
+                                                           orig_read_id.startswith("a30|R"))  # debug?
+
+                    mid_ht = []
+                    for cmp in cmp_list2[cmp_list_left:cmp_list_right+1]:
+                        type = cmp[0]
+                        if type not in ["mismatch", "deletion", "insertion"]:
+                            continue                            
+                        var_id = cmp[3]
+                        if var_id == "unknown" or var_id.startswith("nv"):
+                            continue
+                        mid_ht.append(var_id)
+
+                    for l in range(len(cmp_left_alts)):
+                        left_ht = cmp_left_alts[l].split('-')
+                        left_ht += mid_ht
+                        for r in range(len(cmp_right_alts)):
+                            right_ht = cmp_right_alts[r].split('-')
+                            ht = left_ht + right_ht
+                            if len(ht) <= 0:
+                                continue
+                            ht_str = '-'.join(ht)
+                            if is_left_read:
+                                left_positive_hts.add(ht_str)
+                            else:
+                                right_positive_hts.add(ht_str)
+
+                    # DK - debugging purposes
+                    DK_debug = False
+                    if orig_read_id.startswith("a30|R"):
+                        DK_debug = True
+                        print line
+                        print cmp_list
+                        print "positive hts:", left_positive_hts, right_positive_hts
+                        print "cmp_list [%d, %d]" % (cmp_list_left, cmp_list_right)
+
+                    # Node
+                    read_node_pos, read_node_seq, read_node_qual, read_node_var = -1, [], [], []
+                    read_vars = []
+                    ref_pos, read_pos = left_pos, 0
+                    cmp_i = 0
+                    while cmp_i < len(cmp_list):
+                        cmp = cmp_list[cmp_i]
+                        type, length = cmp[0], cmp[2]
+                        if type in ["match", "mismatch"]:
+                            if read_node_pos < 0:
+                                read_node_pos = ref_pos
+                        if type == "match":
+                            read_node_seq += list(read_seq[read_pos:read_pos+length])
+                            read_node_qual += list(read_qual[read_pos:read_pos+length])
+                            read_node_var += ([''] * length)
+                            read_pos += length
+                        elif type == "mismatch":
+                            var_id = cmp[3]
+                            read_base, qual = read_seq[read_pos], read_qual[read_pos]
+                            read_node_seq += [read_base]
+                            read_node_qual += [qual]
+                            read_node_var.append(var_id)
+                            read_pos += 1
+                        elif type == "insertion":
+                            var_id = cmp[3]
+                            ins_len = length
+                            ins_seq = read_seq[read_pos:read_pos+ins_len]
+                            read_node_seq += ["I%s" % nt for nt in ins_seq]
+                            read_node_qual += list(read_qual[read_pos:read_pos+ins_len])
+                            read_node_var += ([var_id] * ins_len)                                        
+                            read_pos += length
+                        elif type == "deletion":
+                            var_id = cmp[3]
+                            del_len = length
+                            read_node_seq += (['D'] * del_len)
+                            read_node_qual += ([''] * del_len)
+                            if len(read_node_seq) > len(read_node_var):
+                                assert len(read_node_seq) == len(read_node_var) + del_len
+                                read_node_var += ([var_id] * del_len)
+                        else:
+                            assert type == "intron"
+                        cmp_i += 1
+
+                    # Node
+                    if assembly:
+                        read_nodes.append([node_read_id,
+                                           assembly_graph.Node(node_read_id,
+                                                               read_node_pos,
+                                                               read_node_seq,
+                                                               read_node_qual,
+                                                               read_node_var,
+                                                               ref_seq,
+                                                               gene_vars,
+                                                               mpileup,
+                                                               simulation)])
+
+                    prev_read_id = read_id
+                    prev_right_pos = right_pos
+
+                if prev_read_id != None:
+                    num_pairs += 1
+                    if base_fname == "codis" and gene == "D18S51":
+                        left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts)                            
+                    for positive_ht in left_positive_hts | right_positive_hts:
+                        exon_hts = get_exon_haplotypes(positive_ht, ref_exons)
+                        for exon_ht in exon_hts:
+                            add_count(Gene_count_per_read, exon_ht, 1)
+                        add_count(Gene_gen_count_per_read, positive_ht, 1)
+
+                    if base_fname == "hla":
+                        add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, allele_rep_set)
+                    add_stat(Gene_gen_cmpt, Gene_gen_counts, Gene_gen_count_per_read)
+                    for read_id_, read_node in read_nodes:
+                        asm_graph.add_node(read_id_,
+                                           read_node,
+                                           simulation)
+                    read_nodes, read_var_list = [], []
+
+                if num_reads <= 0:
+                    continue
+
+                for f_ in [sys.stderr, report_file]:
+                    print >> f_, "\t\t\t%d reads and %d pairs are aligned" % (num_reads, num_pairs)
+                
+            else:
+                assert index_type == "linear"
+                def add_alleles(alleles):
+                    if not allele in Gene_counts:
+                        Gene_counts[allele] = 1
+                    else:
+                        Gene_counts[allele] += 1
+
+                    cur_cmpt = sorted(list(alleles))
+                    cur_cmpt = '-'.join(cur_cmpt)
+                    if not cur_cmpt in Gene_cmpt:
+                        Gene_cmpt[cur_cmpt] = 1
+                    else:
+                        Gene_cmpt[cur_cmpt] += 1
+
+                prev_read_id, prev_AS = None, None
+                alleles = set()
+                for line in alignview_proc.stdout:
+                    cols = line[:-1].split()
+                    read_id, flag, allele = cols[:3]
+                    flag = int(flag)
+                    if flag & 0x4 != 0:
+                        continue
+                    if not allele.startswith(gene):
+                        continue
+                    if allele.find("BACKBONE") != -1:
+                        continue
+
+                    AS = None
+                    for i in range(11, len(cols)):
+                        col = cols[i]
+                        if col.startswith("AS"):
+                            AS = int(col[5:])
+                    assert AS != None
+                    if read_id != prev_read_id:
+                        if alleles:
+                            if aligner == "hisat2" or \
+                                    (aligner == "bowtie2" and len(alleles) < 10):
+                                add_alleles(alleles)
+                            alleles = set()
+                        prev_AS = None
+                    if prev_AS != None and AS < prev_AS:
+                        continue
+                    prev_read_id = read_id
+                    prev_AS = AS
+                    alleles.add(allele)
+
+                if alleles:
+                    add_alleles(alleles)
+
+            if base_fname != "hla":
+                Gene_cmpt, Gene_counts = Gene_gen_cmpt, Gene_gen_counts
+                
+            Gene_counts = [[allele, count] for allele, count in Gene_counts.items()]
+            def Gene_count_cmp(a, b):
+                if a[1] != b[1]:
+                    return b[1] - a[1]
+                assert a[0] != b[0]
+                if a[0] < b[0]:
+                    return -1
+                else:
+                    return 1
+            Gene_counts = sorted(Gene_counts, cmp=Gene_count_cmp)
+            for count_i in range(len(Gene_counts)):
+                count = Gene_counts[count_i]
+                if simulation:
+                    found = False
+                    for test_Gene_name in test_Gene_names:
+                        if count[0] == test_Gene_name:
+                            for f_ in [sys.stderr, report_file]:
+                                print >> f_, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_Gene_name, count[1])
+                            found = True
+                    if count_i < 5 and not found:
+                        for f_ in [sys.stderr, report_file]:
+                            print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
+                else:
+                    for f_ in [sys.stderr, report_file]:
+                        print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
+                    if count_i >= 9:
+                        break
+            for f_ in [sys.stderr, report_file]:
+                print >> f_
+
+            # Calculate the abundance of representative alleles on exonic sequences
+            if base_fname == "hla":
+                # Incorporate non representative alleles (full length alleles)
+                Gene_prob = typing_common.single_abundance(Gene_cmpt,
+                                                           Gene_lengths[gene],
+                                                           True) # exonic sequence
+
+                gen_alleles = set()
+                gen_prob_sum = 0.0
+                for prob_i in range(len(Gene_prob)):
+                    allele, prob = Gene_prob[prob_i][:2]
+                    if prob_i >= 10 and prob < 0.03:
+                        break
+                    if allele in partial_alleles:
+                        continue
+
+                    gen_prob_sum += prob
+                    gen_alleles |= set(allele_rep_groups[allele])
+
+                if len(gen_alleles) > 0:
+                    Gene_gen_cmpt2 = {}
+                    for cmpt, value in Gene_gen_cmpt.items():
+                        cmpt2 = []
+                        for allele in cmpt.split('-'):
+                            if allele in gen_alleles:
+                                cmpt2.append(allele)
+                        if len(cmpt2) == 0:
+                            continue
+                        cmpt2 = '-'.join(cmpt2)
+                        if cmpt2 not in Gene_gen_cmpt2:
+                            Gene_gen_cmpt2[cmpt2] = value
+                        else:
+                            Gene_gen_cmpt2[cmpt2] += value
+                    Gene_gen_cmpt = Gene_gen_cmpt2
+                    Gene_gen_prob = typing_common.single_abundance(Gene_gen_cmpt,
+                                                                   Gene_lengths[gene],
+                                                                   False) # whole gene sequence
+                    
+                    Gene_combined_prob = {}
+                    for allele, prob in Gene_prob:
+                        if allele not in gen_alleles:
+                            Gene_combined_prob[allele] = prob
+                    for allele, prob in Gene_gen_prob:
+                        Gene_combined_prob[allele] = prob * gen_prob_sum
+                    Gene_prob = [[allele, prob] for allele, prob in Gene_combined_prob.items()]
+                    Gene_prob = sorted(Gene_prob, cmp=typing_common.Gene_prob_cmp)
+            else:
+                Gene_prob = typing_common.single_abundance(Gene_cmpt, Gene_lengths[gene])
+
+            if index_type == "graph" and assembly:
+                allele_node_order = []
+                predicted_allele_nodes = {}
+                for allele_name, prob in Gene_prob:
+                    if prob < 0.1: # abundance of 10%
+                        break
+                    predicted_allele_nodes[allele_name] = create_allele_node(allele_name)
+                    allele_node_order.append([allele_name, prob])
+                    if len(predicted_allele_nodes) >= 2:
+                        break
+                asm_graph.predicted_allele_nodes = predicted_allele_nodes
+                asm_graph.allele_node_order = allele_node_order
+
+                # Start drawing assembly graph
+                asm_graph.begin_draw(output_base)
+
+                # Draw assembly graph
+                begin_y = asm_graph.draw(0, "Initial graph")
+                begin_y += 200
+                
+                # Apply De Bruijn graph
+                asm_graph.guided_DeBruijn()
+
+                # Draw assembly graph
+                begin_y = asm_graph.draw(begin_y, "Asssembly")
+                begin_y += 200
+
+                # Draw assembly graph
+                asm_graph.nodes = asm_graph.nodes2
+                asm_graph.to_node, asm_graph.from_node = {}, {}
+                begin_y = asm_graph.draw(begin_y, "Assembly with known alleles")
+
+                # End drawing assembly graph
+                asm_graph.end_draw()
+
+                # Compare two alleles
+                if simulation and len(test_Gene_names) == 2:
+                    allele_name1, allele_name2 = test_Gene_names
+                    print >> sys.stderr, allele_name1, "vs.", allele_name2
+                    asm_graph.print_node_comparison(asm_graph.true_allele_nodes)
+
+                def compare_alleles(vars1, vars2, print_output = True):
+                    skip = True
+                    var_i, var_j = 0, 0
+                    exon_i = 0
+                    mismatches = 0
+                    while var_i < len(vars1) and var_j < len(vars2):
+                        cmp_var_id, node_var_id = vars1[var_i], vars2[var_j]
+                        cmp_var, node_var = gene_vars[cmp_var_id], gene_vars[node_var_id]
+
+                        min_pos = min(cmp_var[1], node_var[1])
+                        cmp_var_in_exon, node_var_in_exon = False, False
+                        while exon_i < len(ref_exons):
+                            exon_left, exon_right = ref_exons[exon_i]
+                            if min_pos <= exon_right:
+                                if cmp_var[1] >= exon_left and cmp_var[1] <= exon_right:
+                                    cmp_var_in_exon = True
+                                else:
+                                    cmp_var_in_exon = False
+                                if node_var[1] >= exon_left and node_var[1] <= exon_right:
+                                    node_var_in_exon = True
+                                else:
+                                    node_var_in_exon = False                                
+                                break
+                            exon_i += 1
+                        
+                        if cmp_var_id == node_var_id:
+                            skip = False
+                            if print_output:
+                                if cmp_var_in_exon:
+                                    print >> sys.stderr, "\033[94mexon%d\033[00m" % (exon_i + 1),
+                                print >> sys.stderr, cmp_var_id, cmp_var, "\t\t\t", mpileup[cmp_var[1]]
+                            var_i += 1; var_j += 1
+                            continue
+                        if cmp_var[1] <= node_var[1]:
+                            if not skip:
+                                if (var_i > 0 and var_i + 1 < len(vars1)) or cmp_var[0] != "deletion":
+                                    if print_output:
+                                        if cmp_var_in_exon:
+                                            for f_ in [sys.stderr, report_file]:
+                                                print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1),
+                                        for f_ in [sys.stderr, report_file]:
+                                            print >> f_, "***", cmp_var_id, cmp_var, "==", "\t\t\t", mpileup[cmp_var[1]]
+                                    mismatches += 1
+                            var_i += 1
+                        else:
+                            if print_output:
+                                if node_var_in_exon:
+                                    for f_ in [sys.stderr, report_file]:
+                                        print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1),
+                                for f_ in [sys.stderr, report_file]:
+                                    print >> f_, "*** ==", node_var_id, node_var, "\t\t\t", mpileup[node_var[1]]
+                            mismatches += 1
+                            var_j += 1
+                            
+                    return mismatches
+                    
+                tmp_nodes = asm_graph.nodes
+                print >> sys.stderr, "Number of tmp nodes:", len(tmp_nodes)
+                count = 0
+                for id, node in tmp_nodes.items():
+                    count += 1
+                    if count > 10:
+                        break
+                    node_vars = node.get_var_ids()
+                    node.print_info(); print >> sys.stderr
+                    if node.id in asm_graph.to_node:
+                        for id2, at in asm_graph.to_node[node.id]:
+                            print >> sys.stderr, "\tat %d ==> %s" % (at, id2)
+
+                    if simulation:
+                        cmp_Gene_names = test_Gene_names
+                    else:
+                        cmp_Gene_names = [allele_name for allele_name, _ in allele_node_order]
+                        
+                    alleles, cmp_vars, max_common = [], [], -sys.maxint
+                    for cmp_Gene_name in cmp_Gene_names:
+                        tmp_vars = allele_nodes[cmp_Gene_name].get_var_ids(node.left, node.right)
+                        tmp_common = len(set(node_vars) & set(tmp_vars))
+                        tmp_common -= len(set(node_vars) | set(tmp_vars))
+                        if max_common < tmp_common:
+                            max_common = tmp_common
+                            alleles = [[cmp_Gene_name, tmp_vars]]
+                        elif max_common == tmp_common:
+                            alleles.append([cmp_Gene_name, tmp_vars])
+
+                    for allele_name, cmp_vars in alleles:
+                        for f_ in [sys.stderr, report_file]:
+                            print >> f_, "vs.", allele_name
+                        compare_alleles(cmp_vars, node_vars)
+
+                    print >> sys.stderr
+                    print >> sys.stderr
+
+
+            # Identify alleles that perfectly or closely match assembled alleles
+            for node_name, node in asm_graph.nodes.items():
+                vars = set(node.get_var_ids())
+
+                max_allele_names, max_common = [], -sys.maxint
+                for allele_name, vars2 in allele_vars.items():
+                    vars2 = set(vars2)
+                    tmp_common = len(vars & vars2) - len(vars | vars2)
+                    if tmp_common > max_common:
+                        max_common = tmp_common
+                        max_allele_names = [allele_name]                        
+                    elif tmp_common == max_common:
+                        max_allele_names.append(allele_name)
+
+                for f_ in [sys.stderr, report_file]:
+                    print >> f_, "Genomic:", node_name
+                    node_vars = node.get_var_ids()
+                    min_mismatches = sys.maxint
+                    for max_allele_name in max_allele_names:
+                        cmp_vars = allele_vars[max_allele_name]
+                        cmp_vars = sorted(cmp_vars, cmp=lambda a, b: int(a[2:]) - int(b[2:]))
+                        print_output = False
+                        tmp_mismatches = compare_alleles(cmp_vars, node_vars, print_output)
+                        print >> f_, "\t\t%s:" % max_allele_name, max_common, tmp_mismatches
+                        if tmp_mismatches < min_mismatches:
+                            min_mismatches = tmp_mismatches
+                    if min_mismatches > 0:
+                        print >> f_, "Novel allele"
+                    else:
+                        print >> f_, "Known allele"
+
+            """
+            allele_exon_vars = {}
+            for allele_name, vars in allele_vars.items():
+                allele_exon_vars[allele_name] = set(vars) & exon_vars
+
+            for node_name, node in asm_graph.nodes.items():
+                vars = []
+                for left, right in ref_exons:
+                    vars += node.get_var_ids(left, right)
+                vars = set(vars) & exon_vars
+
+                max_allele_names, max_common = [], -sys.maxint
+                for allele_name, vars2 in allele_exon_vars.items():
+                    tmp_common = len(vars & vars2) - len(vars | vars2)
+                    if tmp_common > max_common:
+                        max_common = tmp_common
+                        max_allele_names = [allele_name]                        
+                    elif tmp_common == max_common:
+                        max_allele_names.append(allele_name)
+
+                for f_ in [sys.stderr, report_file]:
+                    print >> f_, "Exonic:", node_name
+                    for max_allele_name in max_allele_names:
+                        print >> f_, "\t\t%s:" % max_allele_name, max_common
+            """
+            
+            success = [False for i in range(len(test_Gene_names))]
+            found_list = [False for i in range(len(test_Gene_names))]
+            for prob_i in range(len(Gene_prob)):
+                prob = Gene_prob[prob_i]
+                found = False
+                _allele_rep = prob[0]
+                """
+                if partial and exonic_only:
+                    _fields = _allele_rep.split(':')
+                    if len(_fields) == 4:
+                        _allele_rep = ':'.join(_fields[:-1])
+                """
+
+                if simulation:
+                    for name_i in range(len(test_Gene_names)):
+                        test_Gene_name = test_Gene_names[name_i]
+                        if prob[0] == test_Gene_name:
+                            rank_i = prob_i
+                            while rank_i > 0:
+                                if prob == Gene_prob[rank_i - 1][1]:
+                                    rank_i -= 1
+                                else:
+                                    break
+                            for f_ in [sys.stderr, report_file]:
+                                print >> f_, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_Gene_name, prob[1] * 100.0)
+                            if rank_i < len(success):
+                                success[rank_i] = True
+                            found_list[name_i] = True
+                            found = True
+                    # DK - for debugging purposes
+                    if not False in found_list and prob_i >= 10:
+                        break
+                if not found:
+                    for f_ in [sys.stderr, report_file]:
+                        print >> f_, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, _allele_rep, prob[1] * 100.0)
+                    if best_alleles and prob_i < 2:
+                        for f_ in [sys.stderr, report_file]:
+                            print >> f_, "SingleModel %s (abundance: %.2f%%)" % (_allele_rep, prob[1] * 100.0)
+                if not simulation and prob_i >= 9:
+                    break
+                if prob_i >= 19:
+                    break
+            print >> sys.stderr
+
+            if simulation and not False in success:
+                aligner_type = "%s %s" % (aligner, index_type)
+                if not aligner_type in test_passed:
+                    test_passed[aligner_type] = 1
+                else:
+                    test_passed[aligner_type] += 1
+
+        if remove_alignment_file and not simulation:
+            os.system("rm %s*" % (alignment_fname))
+
+    report_file.close()
+    if simulation:
+        return test_passed
+
+    
+"""
+"""
+def read_backbone_alleles(genotype_genome, refGene_loci, Genes):
+    """Load the backbone sequence of every locus in refGene_loci into Genes.
+
+    For each gene this runs
+        samtools faidx <genotype_genome>.fa <chr>:<left+1>-<right+1>
+    and stores the concatenated sequence lines (the '>' header line is
+    skipped) as Genes[gene_name][allele_name].
+
+    genotype_genome -- base name of the FASTA file ("<genotype_genome>.fa").
+    refGene_loci    -- dict: gene name -> sequence whose first four items are
+                       (allele_name, chr, left, right).  left/right are
+                       presumably 0-based inclusive coordinates (they are
+                       shifted by +1 for samtools) -- TODO confirm with caller.
+    Genes           -- dict updated in place; the gene must not be present yet.
+
+    Returns None.  Raises AssertionError when the extracted sequence is not
+    right - left + 1 characters long or when gene_name is already in Genes.
+    """
+    # A single sink for samtools' stderr, closed when the function exits.
+    with open("/dev/null", 'w') as devnull:
+        for gene_name in refGene_loci:
+            allele_name, chrom, left, right = refGene_loci[gene_name][:4]
+            seq_extract_cmd = ["samtools",
+                               "faidx",
+                               "%s.fa" % genotype_genome,
+                               "%s:%d-%d" % (chrom, left+1, right+1)]
+
+            length = right - left + 1
+            proc = subprocess.Popen(seq_extract_cmd, stdout=subprocess.PIPE, stderr=devnull)
+            seq = ""
+            for line in proc.stdout:
+                line = line.strip()
+                if line.startswith('>'):
+                    continue
+                seq += line
+            # Release the pipe and reap the child before validating the result,
+            # so neither a descriptor nor a zombie process is left behind.
+            proc.stdout.close()
+            proc.wait()
+            assert len(seq) == length
+            assert gene_name not in Genes
+            Genes[gene_name] = {}
+            Genes[gene_name][allele_name] = seq
+
+        
+"""
+"""
+def read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes):
+    """Rebuild each allele's sequence from the backbone plus its variants.
+
+    Vars     -- dict: gene name -> {var_id: [type, pos, data]} where type is
+                "single", "deletion" or "insertion".
+    Var_list -- dict: gene name -> list of [pos, var_id]; variants are applied
+                in this list's order, which must be non-decreasing in pos
+                (asserted while rebuilding).
+    Links    -- dict: var_id -> iterable of allele names carrying the variant.
+    Genes    -- dict: gene name -> {allele name: sequence}.  On entry each
+                gene must hold exactly one (backbone) allele; the rebuilt
+                alleles are added to the same inner dict in place.
+
+    Returns None.  Raises AssertionError if a gene has more than one allele on
+    entry, if a variant starts before the end of the previous one, or if a
+    variant type is not one of the three listed above.
+    """
+    for gene_name in Genes:
+        # Assert there is only one allele per gene, which is a backbone allele
+        assert len(Genes[gene_name]) == 1
+        # list() so this also works where values() returns a view, not a list
+        backbone_seq = list(Genes[gene_name].values())[0]
+        gene_vars, gene_var_list = Vars[gene_name], Var_list[gene_name]
+
+        # Group variant ids by the alleles that carry them, keeping list order.
+        allele_vars = {}
+        for _, var_id in gene_var_list:
+            for allele_name in Links[var_id]:
+                allele_vars.setdefault(allele_name, []).append(var_id)
+
+        for allele_name, var_ids in allele_vars.items():
+            pieces = []
+            prev_pos = 0    # first backbone position not yet consumed
+            for var_id in var_ids:
+                var_type, pos, data = gene_vars[var_id]
+                assert prev_pos <= pos
+                if pos > prev_pos:
+                    # Unchanged backbone stretch in front of this variant
+                    pieces.append(backbone_seq[prev_pos:pos])
+                if var_type == "single":
+                    # data stands in for the single backbone base at pos
+                    prev_pos = pos + 1
+                    pieces.append(data)
+                elif var_type == "deletion":
+                    # data is the deletion length; skip that many backbone bases
+                    prev_pos = pos + int(data)
+                else:
+                    assert var_type == "insertion"
+                    # data is inserted in front of the backbone base at pos
+                    pieces.append(data)
+                    prev_pos = pos
+            if prev_pos < len(backbone_seq):
+                pieces.append(backbone_seq[prev_pos:])
+            Genes[gene_name][allele_name] = "".join(pieces)
+            
+    
+"""
+"""
+def read_Gene_alleles(fname, Genes):
+    for line in open(fname):
+        if line.startswith(">"):
+            allele_name = line.strip().split()[0][1:]
+            gene_name = allele_name.split('*')[0]
+            if not gene_name in Genes:
+                Genes[gene_name] = {}
+            if not allele_name in Genes[gene_name]:
+                Genes[gene_name][allele_name] = ""
+        else:
+            Genes[gene_name][allele_name] += line.strip()
+    return Genes
+
+
+"""
+"""
+def read_Gene_vars(fname):
+    Vars, Var_list = {}, {}
+    for line in open(fname):
+        var_id, var_type, allele, pos, data = line.strip().split('\t')
+        pos = int(pos)
+        gene = allele.split('*')[0]
+        if not gene in Vars:
+            Vars[gene] = {}
+            assert not gene in Var_list
+            Var_list[gene] = []
+            
+        assert not var_id in Vars[gene]
+        Vars[gene][var_id] = [var_type, pos, data]
+        Var_list[gene].append([pos, var_id])
+        
+    for gene, in_var_list in Var_list.items():
+        Var_list[gene] = sorted(in_var_list)
+
+    return Vars, Var_list
+
+
+"""
+"""
+def read_Gene_vars_genotype_genome(fname, refGene_loci):
+    loci = {}
+    for gene, values in refGene_loci.items():
+        allele_name, chr, left, right = values[:4]
+        if chr not in loci:
+            loci[chr] = []
+        loci[chr].append([allele_name, left, right])
+        
+    Vars, Var_list = {}, {}
+    for line in open(fname):
+        var_id, var_type, var_chr, pos, data = line.strip().split('\t')
+        if var_chr not in loci:
+            continue
+        pos = int(pos)
+        found = False
+        for allele_name, left, right in loci[var_chr]:
+            if pos >= left and pos <= right:
+                found = True
+                break
+        if not found:
+            continue
+        
+        gene = allele_name.split('*')[0]
+        if not gene in Vars:
+            Vars[gene] = {}
+            assert not gene in Var_list
+            Var_list[gene] = []
+            
+        assert not var_id in Vars[gene]
+        Vars[gene][var_id] = [var_type, pos - left, data]
+        Var_list[gene].append([pos - left, var_id])
+        
+    for gene, in_var_list in Var_list.items():
+        Var_list[gene] = sorted(in_var_list)
+
+    return Vars, Var_list
+
+
+"""
+"""
+def read_Gene_links(fname):
+    Links = {}
+    for line in open(fname):
+        var_id, alleles = line.strip().split('\t')
+        alleles = alleles.split()
+        assert not var_id in Links
+        Links[var_id] = alleles
+
+    return Links
+
+
+"""
+"""
+def genotyping_locus(base_fname,
+                     locus_list,
+                     genotype_genome,
+                     only_locus_list,
+                     partial,
+                     aligners,
+                     read_fname,
+                     fastq,
+                     alignment_fname,
+                     threads,
+                     simulate_interval,
+                     read_len,
+                     fragment_len,
+                     best_alleles,
+                     num_editdist,
+                     perbase_errorrate,
+                     perbase_snprate,
+                     skip_fragment_regions,
+                     assembly,
+                     output_base,
+                     error_correction,
+                     discordant,
+                     display_alleles,
+                     verbose,
+                     debug_instr):
+    if not os.path.exists("hisatgenotype_db"):
+        typing_common.clone_hisatgenotype_database()
+
+    simulation = (read_fname == [] and alignment_fname == "")
+
+    # Download human genome and HISAT2 index
+    HISAT2_fnames = ["grch38",
+                     "genome.fa",
+                     "genome.fa.fai"]
+
+    if not typing_common.check_files(HISAT2_fnames):
+        typing_common.download_genome_and_index()
+
+    # Check if the pre-existing files (hla*) are compatible with the current parameter setting
+    if genotype_genome != "":
+        if os.path.exists("%s.locus" % base_fname):
+            left = 0
+            Gene_genes = []
+            BACKBONE = False
+            for line in open("%s.locus" % base_fname):
+                Gene_name = line.strip().split()[0]
+                if Gene_name.find("BACKBONE") != -1:
+                    BACKBONE = True
+                Gene_gene = Gene_name.split('*')[0]
+                Gene_genes.append(Gene_gene)
+            delete_hla_files = False
+            if not BACKBONE:
+                delete_hla_files = True
+            if len(locus_list) == 0:
+                locus_list = Gene_genes
+            if not set(locus_list).issubset(set(Gene_genes)):
+                delete_hla_files = True
+            if delete_hla_files:
+                os.system("rm %s*" % base_fname)
+
+    # Extract variants, backbone sequence, and other sequeces  
+    if genotype_genome != "":
+        genome_fnames = [genotype_genome + ".fa",
+                         genotype_genome + ".fa.fai",
+                         genotype_genome + ".locus",
+                         genotype_genome + ".snp",
+                         genotype_genome + ".index.snp",
+                         genotype_genome + ".haplotype",
+                         genotype_genome + ".link",
+                         genotype_genome + ".clnsig",
+                         genotype_genome + ".coord",
+                         genotype_genome + ".partial"]
+        for i in range(8):
+            genome_fnames.append(genotype_genome + ".%d.ht2" % (i+1))
+
+        if not typing_common.check_files(genome_fnames):
+            print >> sys.stderr, "Error: some of the following files are not available:", ' '.join(genome_fnames)
+            sys.exit(1)
+    else:
+        typing_common.extract_database_if_not_exists(base_fname,
+                                                     only_locus_list,
+                                                     30,              # inter_gap
+                                                     50,              # intra_gap
+                                                     partial,
+                                                     verbose >= 1)        
+        for aligner, index_type in aligners:
+            typing_common.build_index_if_not_exists(base_fname,
+                                                    aligner,
+                                                    index_type,
+                                                    threads,
+                                                    verbose >= 1)
+
+    # Read partial alleles
+    partial_alleles = set()
+    if genotype_genome != "":
+        for line in open("%s.partial" % genotype_genome):
+            family, allele_name = line.strip().split('\t')
+            if family == base_fname:
+                partial_alleles.add(allele_name)
+
+    else:
+        for line in open("%s.partial" % base_fname):
+            partial_alleles.add(line.strip())
+
+    # Read alleles (names and sequences)
+    refGenes, refGene_loci = {}, {}
+    for line in open("%s.locus" % (genotype_genome if genotype_genome != "" else base_fname)):
+        fields = line.strip().split()
+        if genotype_genome != "" and base_fname != fields[0].lower():
+            continue
+        if genotype_genome != "":
+            _, Gene_name, chr, left, right, exon_str, strand = fields
+        else:
+            Gene_name, chr, left, right, _, exon_str, strand = fields
+        Gene_gene = Gene_name.split('*')[0]
+        assert not Gene_gene in refGenes
+        refGenes[Gene_gene] = Gene_name
+        left, right = int(left), int(right)
+        exons = []
+        for exon in exon_str.split(','):
+            exon_left, exon_right = exon.split('-')
+            exons.append([int(exon_left), int(exon_right)])
+        refGene_loci[Gene_gene] = [Gene_name, chr, left, right, exons]
+    Genes = {}
+    if len(locus_list) == 0:
+        locus_list = refGene_loci.keys()
+
+    # Read HLA variants, and link information
+    if genotype_genome:
+        Vars, Var_list = read_Gene_vars_genotype_genome("%s.snp" % genotype_genome, refGene_loci)
+        Links = read_Gene_links("%s.link" % genotype_genome)
+    else:
+        Vars, Var_list = read_Gene_vars("%s.snp" % base_fname)
+        Links = read_Gene_links("%s.link" % base_fname)
+
+    # Read allele sequences
+    if genotype_genome != "":
+        read_backbone_alleles(genotype_genome, refGene_loci, Genes)
+        read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes)        
+    else:
+        read_Gene_alleles(base_fname + "_backbone.fa", Genes)
+        read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes)
+
+    # Sanity Check
+    if os.path.exists(base_fname + "_backbone.fa") and \
+       os.path.exists(base_fname + "_sequences.fa"):
+        Genes2 = {}
+        read_Gene_alleles(base_fname + "_backbone.fa", Genes2)
+        read_Gene_alleles(base_fname + "_sequences.fa", Genes2)
+        for gene_name, alleles in Genes.items():
+            assert gene_name in Genes2
+            for allele_name, allele_seq in alleles.items():
+                assert allele_name in Genes2[gene_name]
+                allele_seq2 = Genes2[gene_name][allele_name]
+                assert allele_seq == allele_seq2
+
+    # HLA gene alleles
+    Gene_names = {}
+    for Gene_gene, data in Genes.items():
+        Gene_names[Gene_gene] = list(data.keys())
+
+    # HLA gene allele lengths
+    Gene_lengths = {}
+    for Gene_gene, Gene_alleles in Genes.items():
+        Gene_lengths[Gene_gene] = {}
+        for allele_name, seq in Gene_alleles.items():
+            Gene_lengths[Gene_gene][allele_name] = len(seq)
+
+    # Test HLA typing
+    test_list = []
+    if simulation:
+        basic_test, pair_test = True, False
+        if debug_instr and "pair" in debug_instr:
+            basic_test, pair_test = False, True
+
+        test_passed = {}
+        test_list = []
+        genes = list(set(locus_list) & set(Gene_names.keys()))
+        if basic_test:
+            for gene in genes:
+                Gene_gene_alleles = Gene_names[gene]
+                for allele in Gene_gene_alleles:
+                    if allele.find("BACKBONE") != -1:
+                        continue
+                    test_list.append([[allele]])
+                random.shuffle(test_list)
+        if pair_test:
+            test_size = 200
+            allele_count = 2
+            for test_i in range(test_size):
+                test_pairs = []
+                for gene in genes:
+                    Gene_gene_alleles = []
+
+                    for allele in Gene_names[gene]:
+                        if allele.find("BACKBONE") != -1:
+                            continue
+
+                        if "full" in debug:
+                            if allele in partial_alleles:
+                                continue
+
+                        Gene_gene_alleles.append(allele)
+                    nums = [i for i in range(len(Gene_gene_alleles))]
+                    random.shuffle(nums)
+                    test_pairs.append(sorted([Gene_gene_alleles[nums[i]] for i in range(allele_count)]))
+                test_list.append(test_pairs)
+
+        if "test_list" in debug_instr:
+            test_list = [[debug_instr["test_list"].split('-')]]
+            
+        for test_i in range(len(test_list)):
+            if "test_id" in debug_instr:
+                test_ids = debug_instr["test_id"].split('-')
+                if str(test_i + 1) not in test_ids:
+                    continue
+
+            print >> sys.stderr, "Test %d" % (test_i + 1), str(datetime.now())
+            test_locus_list = test_list[test_i]
+            num_frag_list = typing_common.simulate_reads(Genes,
+                                                         base_fname,
+                                                         test_locus_list,
+                                                         Vars,
+                                                         Links,
+                                                         simulate_interval,
+                                                         read_len,
+                                                         fragment_len,
+                                                         perbase_errorrate,
+                                                         perbase_snprate,
+                                                         skip_fragment_regions)
+
+            assert len(num_frag_list) == len(test_locus_list)
+            for i_ in range(len(test_locus_list)):
+                test_Gene_names = test_locus_list[i_]
+                num_frag_list_i = num_frag_list[i_]
+                assert len(num_frag_list_i) == len(test_Gene_names)
+                for j_ in range(len(test_Gene_names)):
+                    test_Gene_name = test_Gene_names[j_]
+                    gene = test_Gene_name.split('*')[0]
+                    test_Gene_seq = Genes[gene][test_Gene_name]
+                    seq_type = "partial" if test_Gene_name in partial_alleles else "full"
+                    print >> sys.stderr, "\t%s - %d bp (%s sequence, %d pairs)" % (test_Gene_name, len(test_Gene_seq), seq_type, num_frag_list_i[j_])
+
+            if "single-end" in debug_instr:
+                read_fname = ["%s_input_1.fa" % base_fname]
+            else:
+                read_fname = ["%s_input_1.fa" % base_fname, "%s_input_2.fa" % base_fname]
+
+            fastq = False
+            tmp_test_passed = typing(simulation,
+                                     base_fname,
+                                     test_locus_list,
+                                     genotype_genome,
+                                     partial,
+                                     partial_alleles,
+                                     refGenes,
+                                     Genes,                       
+                                     Gene_names,
+                                     Gene_lengths,
+                                     refGene_loci,
+                                     Vars,
+                                     Var_list,
+                                     Links,
+                                     aligners,
+                                     num_editdist,
+                                     assembly,
+                                     output_base,
+                                     error_correction,
+                                     discordant,
+                                     display_alleles,
+                                     fastq,
+                                     read_fname,
+                                     alignment_fname,
+                                     num_frag_list,
+                                     read_len,
+                                     fragment_len,
+                                     threads,
+                                     best_alleles,
+                                     verbose)
+
+            for aligner_type, passed in tmp_test_passed.items():
+                if aligner_type in test_passed:
+                    test_passed[aligner_type] += passed
+                else:
+                    test_passed[aligner_type] = passed
+
+                print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1)))
+
+
+        for aligner_type, passed in test_passed.items():
+            print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
+    
+    else: # With real reads or BAMs
+        print >> sys.stderr, "\t", ' '.join(locus_list)
+        typing(simulation,
+               base_fname,
+               locus_list,
+               genotype_genome,
+               partial,
+               partial_alleles,
+               refGenes,
+               Genes,                       
+               Gene_names,
+               Gene_lengths,
+               refGene_loci,
+               Vars,
+               Var_list,
+               Links,
+               aligners,
+               num_editdist,
+               assembly,
+               output_base,
+               error_correction,
+               discordant,
+               display_alleles,
+               fastq,
+               read_fname,
+               alignment_fname,
+               [],
+               read_len,
+               fragment_len,
+               threads,
+               best_alleles,
+               verbose)
+
+
+"""
+"""
+if __name__ == '__main__':
+    parser = ArgumentParser(
+        description='hisatgenotype_locus')
+    parser.add_argument("--base", "--base-fname",
+                        dest="base_fname",
+                        type=str,
+                        default="hla",
+                        help="base filename for backbone sequence, variants, and linking info (default: hla)")
+    parser.add_argument("--locus-list",
+                        dest="locus_list",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of genes (default: empty, all genes)")
+    parser.add_argument("--genotype-genome",
+                        dest="genotype_genome",
+                        type=str,
+                        default="",
+                        help="Base name for genotype genome, which the program will use instead of region-based small indexes (default: empty)")
+    parser.add_argument("-f", "--fasta",
+                        dest='fastq',
+                        action='store_false',
+                        help='FASTA format')
+    parser.add_argument("-U",
+                        dest="read_fname_U",
+                        type=str,
+                        default="",
+                        help="filename for single-end reads")
+    parser.add_argument("-1",
+                        dest="read_fname_1",
+                        type=str,
+                        default="",
+                        help="filename for paired-end reads")
+    parser.add_argument("-2",
+                        dest="read_fname_2",
+                        type=str,
+                        default="",
+                        help="filename for paired-end reads")    
+    parser.add_argument("--alignment",
+                        dest="alignment_fname",
+                        type=str,
+                        default="",
+                        help="BAM file name")
+    parser.add_argument("-p", "--threads",
+                        dest="threads",
+                        type=int,
+                        default=1,
+                        help="Number of threads")
+    parser.add_argument('--no-partial',
+                        dest='partial',
+                        action='store_false',
+                        help='Include partial alleles (e.g. A_nuc.fasta)')
+    parser.add_argument("--aligner-list",
+                        dest="aligners",
+                        type=str,
+                        default="hisat2.graph",
+                        help="A comma-separated list of aligners such as hisat2.graph,hisat2.linear,bowtie2.linear (default: hisat2.graph)")
+    parser.add_argument("--simulate-interval",
+                        dest="simulate_interval",
+                        type=int,
+                        default=10,
+                        help="Reads simulated at every these base pairs (default: 10)")
+    parser.add_argument("--read-len",
+                        dest="read_len",
+                        type=int,
+                        default=100,
+                        help="Length of simulated reads (default: 100)")
+    parser.add_argument("--fragment-len",
+                        dest="fragment_len",
+                        type=int,
+                        default=350,
+                        help="Length of fragments (default: 350)")
+    parser.add_argument("--best-alleles",
+                        dest="best_alleles",
+                        action='store_true',
+                        help="")
+    parser.add_argument("--random-seed",
+                        dest="random_seed",
+                        type=int,
+                        default=1,
+                        help="A seeding number for randomness (default: 1)")
+    parser.add_argument("--num-editdist",
+                        dest="num_editdist",
+                        type=int,
+                        default=2,
+                        help="Maximum number of mismatches per read alignment to be considered (default: 2)")
+    parser.add_argument("--perbase-errorrate",
+                        dest="perbase_errorrate",
+                        type=float,
+                        default=0.0,
+                        help="Per basepair error rate in percentage when simulating reads (default: 0.0)")
+    parser.add_argument("--perbase-snprate",
+                        dest="perbase_snprate",
+                        type=float,
+                        default=0.0,
+                        help="Per basepair SNP rate in percentage when simulating reads (default: 0.0)")
+    parser.add_argument("--skip-fragment-regions",
+                        dest="skip_fragment_regions",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of regions from which no reads originate, e.g., 500-600,1200-1400 (default: None).")
+    parser.add_argument('-v', '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        help='also print some statistics to stderr')
+    parser.add_argument('--verbose-level',
+                        dest='verbose_level',
+                        type=int,
+                        default=0,
+                        help='also print some statistics to stderr (default: 0)')
+    parser.add_argument("--debug",
+                        dest="debug",
+                        type=str,
+                        default="",
+                        help="e.g., test_id:10,read_id:10000,basic_test")
+    parser.add_argument("--output-base", "--assembly-base",
+                        dest="output_base",
+                        type=str,
+                        default="assembly_graph",
+                        help="base file name (default: assembly_graph)")
+    parser.add_argument("--assembly",
+                        dest="assembly",
+                        action="store_true",
+                        help="Perform assembly")
+    parser.add_argument("--no-error-correction",
+                        dest="error_correction",
+                        action="store_false",
+                        help="Correct sequencing errors")
+    parser.add_argument("--only-locus-list",
+                        dest="only_locus_list",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of genes (default: empty, all genes)")
+    parser.add_argument("--discordant",
+                        dest="discordant",
+                        action="store_true",
+                        help="Allow discordantly mapped pairs or singletons")
+    parser.add_argument("--display-alleles",
+                        dest="display_alleles",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of alleles to display in HTML (default: empty)")
+
+    args = parser.parse_args()
+    if args.locus_list == "":
+        locus_list = []
+    else:
+        locus_list = args.locus_list.split(',')
+    if args.only_locus_list == "":
+        only_locus_list = []
+    else:
+        locus_list = only_locus_list = args.only_locus_list.split(',')    
+    if args.aligners == "":
+        print >> sys.stderr, "Error: --aligners must be non-empty."
+        sys.exit(1)    
+    args.aligners = args.aligners.split(',')
+    for i in range(len(args.aligners)):
+        args.aligners[i] = args.aligners[i].split('.')
+    if args.read_fname_U != "":
+        args.read_fname = [args.read_fname_U]
+    elif args.read_fname_1 != "" or args.read_fname_2 != "":
+        if args.read_fname_1 == "" or args.read_fname_2 == "":
+            print >> sys.stderr, "Error: please specify both -1 and -2."
+            sys.exit(1)
+        args.read_fname = [args.read_fname_1, args.read_fname_2]
+    else:
+        args.read_fname = []
+    if args.alignment_fname != "" and \
+            not os.path.exists(args.alignment_fname):
+        print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname
+        sys.exit(1)
+
+    if args.verbose and args.verbose_level == 0:
+        args.verbose_level = 1
+        
+    debug = {}
+    if args.debug != "":
+        for item in args.debug.split(','):
+            if ':' in item:
+                fields = item.split(':')
+                assert len(fields) >= 2
+                key, value = fields[0], ':'.join(fields[1:])
+                debug[key] = value
+            else:
+                debug[item] = 1
+
+    if not args.partial:
+        print >> sys.stderr, "Warning: --no-partial should be used for debugging purpose only."
+
+    if args.read_len * 2 > args.fragment_len:
+        print >> sys.stderr, "Warning: fragment might be too short (%d)" % (args.fragment_len)
+
+    skip_fragment_regions = []
+    if args.skip_fragment_regions != "":
+        prev_left, prev_right = -1, -1
+        for region in args.skip_fragment_regions.split(','):
+            left, right = region.split('-')
+            left, right = int(left), int(right)
+            assert left < right
+            assert prev_right < left
+            prev_left, prev_right = left, right
+            skip_fragment_regions.append([left, right])
+
+    if args.display_alleles == "":
+        display_alleles = []
+    else:
+        display_alleles = args.display_alleles.split(',')
+
+    random.seed(args.random_seed)
+    genotyping_locus(args.base_fname,
+                     locus_list,
+                     args.genotype_genome,
+                     only_locus_list,
+                     args.partial,
+                     args.aligners,
+                     args.read_fname,
+                     args.fastq,
+                     args.alignment_fname,
+                     args.threads,
+                     args.simulate_interval,
+                     args.read_len,
+                     args.fragment_len,
+                     args.best_alleles,
+                     args.num_editdist,
+                     args.perbase_errorrate,
+                     args.perbase_snprate,
+                     skip_fragment_regions,
+                     args.assembly,
+                     args.output_base,
+                     args.error_correction,
+                     args.discordant,
+                     display_alleles,
+                     args.verbose_level,
+                     debug)
+
diff --git a/hisatgenotype_modules/__init__.py b/hisatgenotype_modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hisatgenotype_modules/hisatgenotype_assembly_graph.py b/hisatgenotype_modules/hisatgenotype_assembly_graph.py
new file mode 100755
index 0000000..475f23d
--- /dev/null
+++ b/hisatgenotype_modules/hisatgenotype_assembly_graph.py
@@ -0,0 +1,1771 @@
+#!/usr/bin/env python
+
+import sys
+import math, random
+from datetime import datetime, date, time
+from collections import deque
+from copy import deepcopy
+
+
+#
+def get_major_nt(nt_dic):
+    """Return the symbol with the highest count in one alignment column.
+
+    nt_dic maps a symbol to a [count, var_id] pair.  A symbol is either a
+    single character from "ACGTDN" or a two-character insertion made of 'I'
+    followed by one of "ACGT".  Every key and the returned symbol are checked
+    against that format with assertions; in particular an empty column or a
+    column whose counts are all <= 0 fails the final assertion.
+    When several symbols share the highest count, the first one met while
+    iterating over the dictionary wins.
+    """
+    def is_valid_symbol(symbol):
+        if len(symbol) == 1:
+            return symbol in "ACGTDN"
+        return len(symbol) == 2 and symbol[0] == 'I' and symbol[1] in "ACGT"
+
+    major, major_count = '', 0
+    for symbol, (count, _var_id) in nt_dic.items():
+        assert is_valid_symbol(symbol)
+        # Strict comparison: an equal count never replaces the current winner
+        if count > major_count:
+            major, major_count = symbol, count
+    assert is_valid_symbol(major)
+    return major
+
+
+#
+def match_score(nt_dic1, nt_dic2):
+    """Score how well two alignment columns agree on a single base.
+
+    Both arguments map a symbol to [count, var_id].  For every base of "ACGT"
+    present in both columns, the score is the sum of that base's count in each
+    column divided by twice the column's total count, so two columns made of
+    the same single base score 1.0.  The best such score is returned, or 0.0
+    when the columns share no base.
+    """
+    denom1 = sum([count for count, _ in nt_dic1.values()]) * 2.0
+    denom2 = sum([count for count, _ in nt_dic2.values()]) * 2.0
+    shared_bases = [nt for nt in "ACGT" if nt in nt_dic1 and nt in nt_dic2]
+    candidates = [nt_dic1[nt][0] / denom1 + nt_dic2[nt][0] / denom2
+                  for nt in shared_bases]
+    # 0.0 is the floor, exactly as when no shared base beats the initial best
+    return max([0.0] + candidates)
+
+
+#
+def get_ungapped_seq(seq):
+    """Return the columns of seq whose major symbol is not a deletion ('D').
+
+    seq is a list of column dictionaries as accepted by get_major_nt().  The
+    result is a new list holding the very same dictionary objects (no copy).
+    """
+    return [column for column in seq if get_major_nt(column) != 'D']
+
+
+#
+def get_ungapped_seq_pos(seq, pos):
+    """Translate an offset that ignores insertions into one that also ignores deletions.
+
+    The columns of seq are scanned from the start while counting deletion
+    ('D') and insertion ('I' + base) columns.  At the first column whose index
+    minus the insertions seen so far equals pos, pos minus the deletions seen
+    so far is returned.  -1 is returned when no column satisfies this.
+    """
+    num_deleted, num_inserted = 0, 0
+    for column_i, column in enumerate(seq):
+        major = get_major_nt(column)
+        if major == 'D':
+            num_deleted += 1
+        elif major[0] == 'I':
+            num_inserted += 1
+        if column_i - num_inserted == pos:
+            return pos - num_deleted
+    return -1
+
+
+# Get mate node id
+#  HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|L to HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|R or vice versa
+def get_mate_node_id(node_id):
+    """Return the id of the other end of a pair.
+
+    node_id must contain exactly one '|' separating the fragment name from the
+    end marker.  An 'L' marker becomes 'R'; any other marker becomes 'L'.
+    """
+    fragment, side = node_id.split('|')
+    mate_side = 'R' if side == 'L' else 'L'
+    return fragment + '|' + mate_side
+
+
+
+class Node:
+    """A read (or a merge of several reads) aligned against the backbone.
+
+    self.seq holds one dictionary per alignment column, mapping a symbol to
+    [count, var_id].  A symbol is a single character from "ACGTDN" or a
+    two-character insertion 'I' + base.  Insertion columns do not advance the
+    backbone coordinate, which is why
+    self.right == self.left + len(self.seq) - 1 - self.ins_len.
+    """
+    # Initialize
+    def __init__(self,
+                 id,
+                 left,
+                 seq,
+                 qual,
+                 var,
+                 ref_seq,
+                 ref_vars,
+                 mpileup,
+                 simulation):
+        """Build a node from one alignment.
+
+        id         -- read id; with simulation set, only the part before the
+                      first '_' is kept.  The part before '|' is recorded as
+                      the mate (fragment) id.
+        left       -- backbone position of the first column.
+        seq        -- sequence of symbols ("ACGTDN" or 'I' + base).
+        qual       -- same length as seq; each item is one character or ''.
+        var        -- same length as seq; variant id attached to each column.
+        ref_seq    -- reference sequence, indexed by backbone position.
+        ref_vars   -- variant id -> (type, position, data).
+        mpileup    -- per-position pile-up; entry [1] is read as a column
+                      dictionary by combine_with().
+        simulation -- whether ids carry a simulation suffix after '_'.
+        """
+        self.next = [] # list of next nodes
+
+        if simulation:
+            id = id.split('_')[0]
+        self.id = id # Node ID
+        self.left = left # starting position
+
+        # sequence that node represents
+        #   with information about how the sequence is related to backbone
+        assert len(seq) == len(var)
+        assert len(seq) == len(qual)
+        self.seq = []
+        self.ins_len = 0
+        for s in range(len(seq)):
+            nt = seq[s]
+            if len(nt) == 1:
+                assert nt in "ACGTDN"
+            else:
+                assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT"
+                self.ins_len += 1
+            var_id = var[s]
+            self.seq.append({nt : [1, var_id]})
+        self.qual = []
+        for q in qual:
+            if q != '':
+                # Explicit floor division: same value as the former '/' on
+                # Python 2 integers, and stays an integer under true division
+                self.qual.append(max(0, ord(q) // 10 - 3))
+            else:
+                self.qual.append(0)
+
+        self.right = self.left + len(seq) - 1 - self.ins_len
+
+        self.read_ids = set([id])
+        self.mate_ids = set([id.split('|')[0]])
+
+        self.calculate_avg_cov()
+
+        self.ref_seq = ref_seq
+        self.ref_vars = ref_vars
+
+        self.mpileup = mpileup
+
+
+    # Check how compatible allele is in regard to read or pair
+    def compatible_with_rnode(self, rnode):
+        """Fraction of rnode's columns equal to this node's columns.
+
+        Currently disabled: the leading 'assert False' makes every call fail
+        (unless assertions are stripped), so the code below it is kept only
+        for reference.
+        """
+        assert False
+        assert rnode.left + len(rnode.seq) <= len(self.seq)
+        score = 0
+        for i in range(len(rnode.seq)):
+            allele_bp = self.seq[rnode.left + i]
+            read_bp = rnode.seq[i]
+            if allele_bp == read_bp:
+                score += 1
+
+        return float(score) / len(rnode.seq)
+
+
+    # Check how nodes overlap with each other without considering deletions
+    def overlap_with(self, other, vars, skipN = False, debug = False):
+        """Look for an offset at which other lines up with this node.
+
+        Requires self.left <= other.left.  Deletion columns are removed from
+        both nodes first; offsets from 5 before to 5 after the expected start
+        are then tried in increasing order.  At each offset the mismatch cost
+        is accumulated and the offset is accepted when the cost stays within
+        0.012 per remaining base of this node.
+
+        Mismatch cost of a column pair whose major symbols differ:
+        1.0 - match_score(); when the variant ids differ as well, it becomes
+        5.0 scaled by both nodes' relative support, but never below 1.0.
+        With skipN set, a column where either side is 'N' costs nothing.
+
+        vars is accepted for interface compatibility and is not used.
+
+        Returns (offset, overlap_length, mismatch_cost) on success and
+        (-1, -1, sys.maxint) when the nodes do not overlap or no offset is
+        acceptable.
+        """
+        assert self.left <= other.left
+        if self.right < other.left:
+            # Fix: this used to return a 2-tuple while every other exit returns
+            # three values, breaking callers that unpack (at, overlap, mm)
+            return -1, -1, sys.maxint
+        seq = get_ungapped_seq(self.seq)
+        other_seq = get_ungapped_seq(other.seq)
+        i_left = get_ungapped_seq_pos(self.seq, other.left - self.left)
+        # NOTE(review): when i_left < 5 the offsets below start negative and
+        # seq[i+j] then indexes from the end of the list -- confirm whether
+        # negative offsets are intended.
+        for i in range(i_left - 5, i_left + 6):
+            max_mm = 0.012 * (len(seq) - i) # 1 mismatch per 83 bases
+            tmp_mm = 0.0
+            for j in range(len(other_seq)):
+                if i + j >= len(seq):
+                    break
+                nt_dic, other_nt_dic = seq[i+j], other_seq[j]
+                nt, other_nt = get_major_nt(nt_dic), get_major_nt(other_nt_dic)
+                mismatch = 0.0
+                if skipN and (nt == 'N' or other_nt == 'N'):
+                    mismatch = 0.0
+                elif nt != other_nt:
+                    mismatch = 1.0 - match_score(seq[i+j], other_seq[j])
+
+                    # Higher penalty for mismatches in variants
+                    nt_var, other_nt_var = nt_dic[nt][1], other_nt_dic[other_nt][1]
+                    if nt_var != other_nt_var:
+                        mismatch = 5.0
+                        adjust = min(1.0, nt_dic[nt][0] / self.get_avg_cov()) * \
+                                 min(1.0, other_nt_dic[other_nt][0] / other.get_avg_cov())
+                        mismatch *= adjust
+                        if mismatch < 1.0:
+                            mismatch = 1.0
+
+                assert mismatch >= 0.0
+                tmp_mm += mismatch
+                if tmp_mm > max_mm:
+                    break
+
+            if debug:
+                print "at %d (%d) with overlap of %d and mismatch of %.2f" % (i, self.left + i, j, tmp_mm)
+
+            if tmp_mm <= max_mm:
+                return i, min(len(seq) - i, len(other_seq)), tmp_mm
+
+        return -1, -1, sys.maxint
+
+
+    # Combine two nodes with considering deletions
+    def combine_with(self, other):
+        """Merge other into this node in place.
+
+        Requires self.left <= other.left and that other does not start with a
+        deletion column.  Overlapping columns have their counts added; a gap
+        between the two nodes is filled with 'N' columns; whatever remains of
+        other is appended as a deep copy.  Read ids, mate ids, ins_len, right
+        and the average coverage are updated afterwards.
+        """
+
+        # DK - debugging purposes
+        if self.left > other.left:
+            self.print_info()
+            other.print_info()
+
+        assert self.left <= other.left
+
+        # Merge two sequences
+        assert len(other.seq) > 0 and 'D' not in other.seq[0].keys()
+        j = 0
+        # Merge the overlapped parts
+        if self.right >= other.left:
+            # Find the column of this node that sits at other.left,
+            # skipping over insertion columns
+            overlap, ins_len = False, 0
+            for i in range(len(self.seq)):
+                nt_dic = self.seq[i]
+                nt = get_major_nt(nt_dic)
+                if nt.startswith('I'):
+                    ins_len += 1
+                if i == other.left - self.left + ins_len:
+                    overlap = True
+                    break
+            assert overlap
+            new_seq = self.seq[:i]
+            while i < len(self.seq) and j < len(other.seq):
+                nt_dic, nt_dic2 = self.seq[i], other.seq[j]
+                for nt, value in nt_dic2.items():
+                    count, var_id = value
+                    if nt in nt_dic:
+                        nt_dic[nt][0] += count
+                        # if nt != 'D':
+                        #    assert nt_dic[nt][1] == var_id
+                    else:
+                        nt_dic[nt] = [count, var_id]
+                new_seq.append(nt_dic)
+                i += 1
+                j += 1
+            # this node contains the other node
+            if i < len(self.seq):
+                new_seq += self.seq[i:]
+        # Fill in the gap between the two nodes if exists
+        else:
+            new_seq = self.seq[:]
+            sum_1 = sum([count for count, _ in self.seq[-1].values()])
+            sum_2 = sum([count for count, _ in other.seq[0].values()])
+            flank_cov = (sum_1 + sum_2) / 2.0
+            for k in range(other.left - self.right - 1):
+                ref_nt_dic = self.mpileup[k + 1 + self.right][1]
+                nt_dic = {}
+                # Fill in the gap with Ns for now
+                # ('or True' keeps the pile-up based branch below switched off)
+                if len(ref_nt_dic) == 0 or True:
+                    nt_dic = {'N' : [1, ""]}
+                else:
+                    weight = flank_cov / max(1.0, sum([count for count, _ in ref_nt_dic.values()]))
+                    for nt, value in ref_nt_dic.items():
+                        count, var_id = value
+                        nt_dic[nt] = [count * weight, var_id]
+                new_seq.append(nt_dic)
+
+        # Append the rest of the other sequence to it
+        if j < len(other.seq):
+            new_seq += deepcopy(other.seq[j:])
+        self.read_ids |= other.read_ids
+        self.mate_ids |= other.mate_ids
+
+        self.seq = new_seq
+        self.ins_len = 0
+        for i in range(len(self.seq)):
+            nt_dic = self.seq[i]
+            nt = get_major_nt(nt_dic)
+            if nt[0] == 'I':
+                self.ins_len += 1
+        self.right = self.left + len(self.seq) - 1 - self.ins_len
+
+        # Update coverage
+        self.calculate_avg_cov()
+
+
+    # Return the length of the ungapped sequence
+    def ungapped_length(self):
+        return len(get_ungapped_seq(self.seq))
+
+
+    # Contains Ns?
+    def contain_Ns(self):
+        """Return True when the major symbol of at least one column is 'N'."""
+        for i in range(len(self.seq)):
+            nt_dic = self.seq[i]
+            nt = get_major_nt(nt_dic)
+            if nt == 'N':
+                return True
+        return False
+
+
+    # Get variant ids
+    def get_var_ids(self, left = 0, right = sys.maxint):
+        """List the known variant ids supported between left and right.
+
+        left and right are backbone coordinates, clipped to the node's span.
+        A variant id is reported when the column's major symbol matches the
+        variant described in self.ref_vars; "" and "unknown" ids are skipped,
+        as is an id equal to the one reported just before.
+        """
+        vars = []
+        left = max(left, self.left)
+        right = min(right, self.right)
+        ins_len = 0
+        for pos in range(left, right + 1):
+            var_i = pos - self.left + ins_len
+            # Step over insertion columns, which take no backbone position
+            while var_i < len(self.seq):
+                nt_dic = self.seq[var_i]
+                nt = get_major_nt(nt_dic)
+                if nt.startswith('I'):
+                    var_i += 1
+                    ins_len += 1
+                else:
+                    break
+            for _, var in nt_dic.values():
+                if var == "" or \
+                   var == "unknown":
+                    continue
+                assert var in self.ref_vars
+                if len(vars) > 0 and var == vars[-1]:
+                    continue
+                # Distinct names so that neither the loop variable 'pos' nor
+                # the builtin 'type' is overwritten
+                var_type, var_pos, data = self.ref_vars[var]
+                if (var_type == "single" and data == nt) or \
+                   (var_type == "deletion" and nt == 'D') or \
+                   (var_type == "insertion" and len(nt) == 2 and nt[1] == data):
+                    vars.append(var)
+
+        return vars
+
+
+    # Get variant ids
+    #   left, right are absolute coordinates
+    def get_vars(self, left = 0, right = sys.maxint):
+        """List [variant id, position] pairs between left and right.
+
+        Besides known variant ids the list may contain ["gap", pos] where the
+        major symbol is 'N' and ["unknown", pos] where the column differs from
+        the reference, no known variant was added, and one of its entries
+        carries the "unknown" id.  Positions covered by a reported deletion
+        are skipped.
+        """
+        vars = []
+        left = max(left, self.left)
+        right = min(right, self.right)
+        skip_pos = -1
+        ins_len = 0
+        for pos in range(left, right + 1):
+            if pos <= skip_pos:
+                continue
+            var_i = pos - self.left + ins_len
+            while var_i < len(self.seq):
+                nt_dic = self.seq[var_i]
+                nt = get_major_nt(nt_dic)
+                if nt.startswith('I'):
+                    var_i += 1
+                    ins_len += 1
+                    var = nt_dic[nt][1]
+                    # NOTE(review): an insertion is only recorded when vars is
+                    # already non-empty -- confirm that dropping a leading
+                    # insertion is intended.
+                    if len(vars) > 0 and var != vars[-1][0]:
+                        vars.append([var, pos])
+                else:
+                    break
+            if nt == self.ref_seq[pos]:
+                continue
+            if nt == 'N':
+                vars.append(["gap", pos])
+                continue
+            added = False
+            for _, var in nt_dic.values():
+                if var == "" or \
+                   var == "unknown":
+                    continue
+                if len(vars) > 0 and var == vars[-1][0]:
+                    continue
+                assert var in self.ref_vars
+                var_type, var_pos, data = self.ref_vars[var]
+                if data == nt or (var_type == "deletion" and nt == 'D'):
+                    assert pos >= var_pos
+                    if var_type == "deletion" and pos > var_pos:
+                        continue
+                    if var_type == "deletion":
+                        # data holds the deletion length; skip its remaining positions
+                        skip_pos = pos + int(data) - 1
+                    added = True
+                    vars.append([var, pos])
+            if not added and "unknown" in [var_id for _, var_id in nt_dic.values()]:
+                vars.append(["unknown", pos])
+
+        return vars
+
+
+    # Get average coverage
+    def get_avg_cov(self):
+        return self.avg
+
+
+    # Calculate average coverage
+    def calculate_avg_cov(self):
+        """Store and return the sum of all column counts divided by the number of columns."""
+        self.avg = 0.0
+        for nt_dic in self.seq:
+            for count, _ in nt_dic.values():
+                self.avg += count
+        self.avg /= len(self.seq)
+        return self.avg
+
+
+    # Display node information
+    def print_info(self, output=sys.stderr):
+        """Write a human-readable dump of the node to output.
+
+        The sequence is printed with a '|' every 20 backbone positions and the
+        position itself every 100.  ANSI colour escapes mark insertions and
+        columns that differ from the reference (red for "unknown" ids and ids
+        starting with "nv", blue otherwise).  Variant ids are listed with
+        their positions, followed by the numbers of mates and reads.
+        """
+        seq, var_str = "", ""
+        prev_var = ""
+        ins_len = 0
+        for i in range(len(self.seq)):
+            if (self.left + i - ins_len) % 100 == 0:
+                seq += ("|%d|" % (self.left + i - ins_len))
+            elif (self.left + i - ins_len) % 20 == 0:
+                seq += '|'
+            nt_dic = self.seq[i]
+            nt = get_major_nt(nt_dic)
+            if nt[0] == 'I':
+                seq += "\033[93m"
+            elif nt != self.ref_seq[self.left + i - ins_len]:
+                var_id = nt_dic[nt][1]
+                if var_id == "unknown" or var_id.startswith("nv"):
+                    seq += "\033[91m" # red
+                else:
+                    seq += "\033[94m" # blue
+            if nt[0] == 'I':
+                seq += nt[1]
+            else:
+                seq += nt
+            if nt[0] == 'I' or nt != self.ref_seq[self.left + i - ins_len]:
+                seq += "\033[00m"
+
+            var = []
+            for _, var_id in nt_dic.values():
+                if var_id == "":
+                    continue
+                var.append(var_id)
+            var = '-'.join(var)
+            if var != "" and var != prev_var:
+                var_str += "\t%d: %s %s" % (self.left + i - ins_len, var, str(nt_dic))
+            prev_var = var
+            if nt[0] == 'I':
+                ins_len += 1
+
+        print >> output, "Node ID:", self.id
+        print >> output, "Pos: [%d, %d], Avg. coverage: %.1f" % (self.left, self.right, self.get_avg_cov())
+        print >> output, "\t", seq
+        print >> output, "\t", var_str
+        print >> output, "mates:", len(self.mate_ids) # sorted(self.mate_ids)
+        print >> output, "reads:", len(self.read_ids) # sorted(self.read_ids)
+        print >> output
+
+                
+class Graph:
+    def __init__(self,
+                 backbone,
+                 gene_vars,
+                 exons,
+                 partial_allele_ids,
+                 true_allele_nodes = None,
+                 predicted_allele_nodes = None,
+                 display_allele_nodes = None,
+                 simulation = False):
+        """Set up an empty assembly graph over the given backbone.
+
+        backbone               -- backbone sequence
+        gene_vars              -- stored as-is; passed to Node.overlap_with()
+        exons                  -- stored as-is
+        partial_allele_ids     -- stored as-is
+        true_allele_nodes      -- stored as-is; a new empty dict when omitted
+        predicted_allele_nodes -- stored as-is; a new empty dict when omitted
+        display_allele_nodes   -- stored as-is; a new empty dict when omitted
+        simulation             -- stored as-is
+
+        Also initialises the node/edge containers and the drawing geometry
+        (margins, scale factors, width and height).
+        """
+        # A literal {} default is created once and would be shared by every
+        # Graph built without these arguments, so each instance gets its own.
+        if true_allele_nodes is None:
+            true_allele_nodes = {}
+        if predicted_allele_nodes is None:
+            predicted_allele_nodes = {}
+        if display_allele_nodes is None:
+            display_allele_nodes = {}
+
+        self.backbone = backbone # backbone sequence
+        self.gene_vars = gene_vars
+        self.exons = exons
+        self.partial_allele_ids = partial_allele_ids
+        self.true_allele_nodes = true_allele_nodes
+        self.predicted_allele_nodes = predicted_allele_nodes
+        self.allele_node_order = []
+        self.display_allele_nodes = display_allele_nodes
+        self.simulation = simulation
+
+        # read_nodes and nodes are two names for the same dictionary
+        self.read_nodes = self.nodes = {}
+        self.edges = {}
+        self.to_node, self.from_node = {}, {}
+
+        self.left_margin = 300
+        self.right_margin = 20
+        self.top_margin = 20
+        self.bottom_margin = 20
+
+        # Shorter backbones are drawn with a larger horizontal scale
+        if len(backbone) <= 4000:
+            self.scalex = 5
+        elif len(backbone) <= 8000:
+            self.scalex = 2
+        else:
+            self.scalex = 1
+        self.scaley = 2
+        self.width = len(self.backbone) * self.scalex + self.left_margin + self.right_margin
+        self.unscaled_height = 6000
+        self.height = self.unscaled_height * self.scaley
+
+
+    # Add node, which is an alignment w.r.t. the reference
+    def add_node(self, id, node, simulation = False):
+        """Register node under id unless a node with that id already exists.
+
+        With simulation set, only the part of id before the first '_' is used
+        as the key.  A repeated id is reported on stderr as a multi-mapped
+        read and the new node is ignored.
+        """
+        node_id = id.split('_')[0] if simulation else id
+        if node_id not in self.nodes:
+            self.nodes[node_id] = node
+        else:
+            print >> sys.stderr, "Warning) multi-mapped read:", node_id
+
+        
+    # Remove nodes that are inside other nodes or with low coverage
+    def remove_nodes(self, nodes):
+        """Delete, in place, nodes that lie inside another node and lose on coverage.
+
+        nodes maps ids to objects offering left, right, get_avg_cov() and
+        overlap_with().  Every node is compared with the nodes that end no
+        later than it does and lie entirely within its span:
+          * mismatch cost below 1.0: the contained node is dropped when its
+            coverage, scaled by the overlap fraction and by 10, is still below
+            the container's coverage; otherwise, when both span exactly the
+            same interval, the container is dropped;
+          * otherwise, with a positive overlap: whichever of the two has less
+            than a tenth of the other's coverage is dropped.
+        """
+        # Sort by right end, then by left end
+        node_list = sorted([[id, node.left, node.right] for id, node in nodes.items()],
+                           key=lambda entry: (entry[2], entry[1]))
+        delete_ids = set()
+        for n in range(len(node_list)):
+            id, left, right = node_list[n]
+            node = nodes[id]
+            # Walk back over the nodes ending at or before this node's right end
+            for i in range(n - 1, -1, -1):
+                id2, left2, right2 = node_list[i]
+                if right2 < left:
+                    break
+                node2 = nodes[id2]
+                contained = left <= left2 and right2 <= right
+                if not contained:
+                    continue
+                at, overlap, mm = node.overlap_with(node2, self.gene_vars)
+
+                # DK - debugging purposes
+                """
+                    print node.id, "vs.", node2.id
+                    print "at %d: overlap of %d with %d mismatches (mult: %.2f)" % \
+                        (at, overlap, mm, mult)
+                    """
+                if mm < 1.0:
+                    mult = overlap / float(max(right - left, right2 - left2))
+                    if node2.get_avg_cov() * mult * 10 < node.get_avg_cov():
+                        delete_ids.add(id2)
+                    elif left == left2 and right == right2:
+                        delete_ids.add(id)
+                elif overlap > 0:
+                    if node2.get_avg_cov() * 10 < node.get_avg_cov():
+                        delete_ids.add(id2)
+                    elif node.get_avg_cov() * 10 < node2.get_avg_cov():
+                        delete_ids.add(id)
+
+        for delete_id in delete_ids:
+            del nodes[delete_id]
+
+            
+    #
+    # 
+    def guided_DeBruijn(self,
+                        print_msg = False):
+        assert len(self.nodes) > 0
+        k = 60 # k-mer
+
+        DRB1_debug = False
+        CP_IMPL = True
+
+        node_seq = {}
+        for id, node in self.nodes.items():
+            s, seq = 0, []
+            while s < len(node.seq):
+                nt_dic = node.seq[s] # {'C': [1, '']}
+                nt = get_major_nt(nt_dic)
+                if nt in "ACGTND":
+                    seq.append(nt)
+                else:
+                    assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT"
+                s += 1
+
+            if len(seq) < k:
+                continue
+
+            def leftshift(seq, ref_seq):
+                seq_len = len(seq)
+                assert seq_len > 0 and seq[0] != 'D'
+
+                bp_i = 0
+                while bp_i < seq_len:
+                    bp = seq[bp_i]
+                    if bp != 'D':
+                        bp_i += 1
+                        continue
+                    bp_j = bp_i + 1
+                    while bp_j < seq_len:
+                        bp2 = seq[bp_j]
+                        if bp2 != 'D':
+                            break
+                        else:
+                            bp_j += 1
+
+                    if bp_j >= seq_len:
+                        bp_i = bp_j
+                        break
+
+                    prev_i, prev_j = bp_i, bp_j
+                    while bp_i > 0 and seq[bp_i-1] in "ACGT" and ref_seq[bp_j-1] in "ACGT":
+                        if seq[bp_i-1] != ref_seq[bp_j-1]:
+                            break
+                        seq[bp_j-1] = seq[bp_i-1]
+                        seq[bp_i-1] = 'D'
+                        bp_i -= 1
+                        bp_j -= 1
+                    bp_i = bp_j
+                    while bp_i < seq_len:
+                        if seq[bp_i] in "ACGT":
+                            break
+                        bp_i += 1
+
+            if DRB1_debug:
+                leftshift(seq, self.backbone[node.left:node.left + len(seq)])
+            node_seq[id] = seq
+
+        try_hard = False
+        while True:
+            delete_ids = set()
+            nodes = []
+            for id, node in self.nodes.items():
+                seq = node_seq[id]
+                if len(seq) < k:
+                    continue
+                kmer, seq = seq[:k], seq[k:]
+                nodes.append([id, node.left, node.right, kmer, seq])
+                
+            def node_cmp(a, b):
+                if a[1] != b[1]:
+                    return a[1] - b[1]
+                else:
+                    return a[2] - b[2]
+            nodes = sorted(nodes, cmp=node_cmp)
+
+            # Generate numerical read IDs
+            id_to_num = {}
+            num_to_id = []
+            for id in [node[0] for node in nodes]:
+                id_to_num[id] = len(id_to_num)
+                num_to_id.append(id)
+
+            # Construct De Bruijn graph with 60-mer
+            self.debruijn = debruijn = [[] for i in range(len(self.backbone) - k + 1)]
+            min_n = 0
+            for pos in range(len(debruijn)):
+                for n in range(min_n, len(nodes)):
+                    id, node_pos, node_right, kmer, seq = nodes[n]
+                    if node_pos < pos:
+                        min_n = n + 1
+                        continue
+                    elif node_pos > pos:
+                        break
+
+                    assert len(kmer) == k
+
+                    # Add a new node or update the De Bruijn graph
+                    curr_vertices = debruijn[pos]
+                    found = False
+                    kmer_seq = ''.join(kmer)
+                    for v in range(len(curr_vertices)):
+                        cmp_nt, cmp_k_m1_mer = curr_vertices[v][:2]
+                        if kmer_seq == cmp_k_m1_mer + cmp_nt:                        
+                            curr_vertices[v][3].append(n)
+                            found = True
+                            break
+
+                    if not found:
+                        predecessors = []
+                        if pos > 0:
+                            prev_vertices = debruijn[pos - 1]
+                            for v in range(len(prev_vertices)):
+                                cmp_nt, cmp_k_m1_mer = prev_vertices[v][:2]
+                                if kmer_seq[:-1] == cmp_k_m1_mer[1:] + cmp_nt:
+                                    predecessors.append(v)
+                        debruijn[pos].append([kmer_seq[-1],           # base
+                                              ''.join(kmer_seq[:-1]), # (k-1)-mer
+                                              predecessors,           # predecessors
+                                              [n]])                   # numeric read IDs
+
+                    # Update k-mer
+                    if len(seq) > 0:
+                        kmer, seq = kmer[1:] + seq[:1], seq[1:]
+                        nodes[n] = [id, node_pos + 1, node_right, kmer, seq]
+
+            # Average number of kmers
+            total_kmers = 0
+            for pos in range(len(debruijn)):
+                vertices = debruijn[pos]
+                for _, _, _, num_ids in vertices:
+                    total_kmers += len(num_ids)
+            avg_kmers = float(total_kmers) / len(debruijn)
+
+            # Filter out reads
+            for pos in range(len(debruijn)):
+                vertices = debruijn[pos]
+                num_vertices = 0
+                num_kmers = 0
+                for v in range(len(vertices)):
+                    _, _, predecessors, num_ids = vertices[v]
+                    if not (set(num_ids) <= delete_ids):
+                        num_vertices += 1
+                        if DRB1_debug:
+                            num_kmers = len(set(num_ids) - delete_ids)
+                if num_vertices <= 1:
+                    if DRB1_debug:
+                        if pos > 300 and pos + 300 < len(debruijn):
+                            if num_vertices == 1 and num_kmers * 8 < avg_kmers:
+                                for _, _, _, num_ids in vertices:
+                                    delete_ids |= set(num_ids)
+                    continue
+                
+                vertice_count = [0] * len(vertices)
+                for v in range(len(vertices)):
+                    _, _, predecessors, num_ids = vertices[v]
+                    for num_id in num_ids:
+                        if num_id in delete_ids:
+                            continue
+                        read_id = num_to_id[num_id]
+                        mate_read_id = get_mate_node_id(read_id)
+                        if mate_read_id in self.nodes:
+                            vertice_count[v] += 1
+
+                # DK - debugging purposes
+                debug_msg = False
+                if debug_msg:
+                    print >> sys.stderr, "at", pos, vertices
+                    print >> sys.stderr, "count:", vertice_count
+
+                if try_hard:
+                    vertice_with_id = [[vertice_count[v], v] for v in range(len(vertice_count))]
+                    vertice_with_id = sorted(vertice_with_id, key=lambda a: a[0])
+                    for v in range(len(vertice_count) - 2):
+                        v = vertice_with_id[v][1]
+                        num_ids = vertices[v][3]
+                        delete_ids |= set(num_ids)
+                        if debug_msg:
+                            print >> sys.stderr, v, "is removed with", num_ids
+                else:
+                    for v in range(len(vertices)):
+                        assert len(vertices) >= 2
+                        relative_avg = (sum(vertice_count) - vertice_count[v]) / float(len(vertice_count) - 1)
+                        if len(vertices) == 2:
+                            # Eliminate reads that have conflicts with other reads due to a deletion
+                            if vertice_count[v] * 2 < relative_avg:
+                                nt, kmer, _, num_ids = vertices[1-v]
+                                if nt == 'D':
+                                    num_id = num_ids[0]
+                                    read_id = num_to_id[num_id]
+                                    left, seq = pos - self.nodes[read_id].left, node_seq[read_id]
+                                    seq_right = ''.join(seq[left+k:])
+                                    seq_right = seq_right.replace('D', '')
+                                    success = True
+                                    for num_id2 in vertices[v][3]:
+                                        read_id2 = num_to_id[num_id2]
+                                        left2, seq2 = pos-self.nodes[read_id2].left, node_seq[read_id2]
+                                        seq2_right = ''.join(seq2[left2+k:])
+                                        if seq_right.find(seq2_right) != 0:
+                                            success = False
+                                            break
+                                    if success:
+                                        delete_ids |= set(vertices[v][3])
+
+                            # DK - working on ...
+                            if DRB1_debug:
+                                if vertice_count[v] * 8 < relative_avg:
+                                    num_ids = vertices[v][3]
+                                    delete_ids |= set(num_ids)
+                                    if debug_msg:
+                                        print >> sys.stderr, v, "is removed with", num_ids
+                                elif vertice_count[v] * 8 < avg_kmers:
+                                    num_ids = vertices[v][3]
+                                    delete_ids |= set(num_ids)
+                        else:
+                            if vertice_count[v] * 3 < relative_avg:
+                                num_ids = vertices[v][3]
+                                delete_ids |= set(num_ids)
+                                if debug_msg:
+                                    print >> sys.stderr, v, "is removed with", num_ids
+
+                if debug_msg:
+                    print >> sys.stderr
+                    print >> sys.stderr             
+                
+            if len(delete_ids) == 0:
+                if try_hard:
+                    break
+                else:
+                    try_hard = True
+
+            for num_id in delete_ids:
+                read_id = num_to_id[num_id]
+                del self.nodes[read_id]
+
+        # Print De Bruijn graph
+        # """
+        # for i in range(len(debruijn)):
+        for i in range(len(debruijn)):
+            curr_vertices = debruijn[i]
+            if len(curr_vertices) == 0:
+                continue
+            consensus_seq = [{} for j in range(k)]
+            for v in range(len(curr_vertices)):
+                nt, k_m1_mer = curr_vertices[v][:2]
+                kmer = k_m1_mer + nt
+                assert len(kmer) == k
+                for j in range(k):
+                    nt = kmer[j]
+                    if nt not in consensus_seq[j]:
+                        consensus_seq[j][nt] = 1
+                    else:
+                        consensus_seq[j][nt] += 1
+
+            if print_msg: print >> sys.stderr, i
+            for v in range(len(curr_vertices)):
+                nt, k_m1_mer, predecessors, num_ids = curr_vertices[v]
+                kmer = k_m1_mer + nt
+                kmer_seq = ""
+                for j in range(k):
+                    nt = kmer[j]
+                    if len(consensus_seq[j]) >= 2:
+                        kmer_seq += "\033[94m"
+                    kmer_seq += nt
+                    if len(consensus_seq[j]) >= 2:
+                        kmer_seq += "\033[00m"
+                    
+                if print_msg: print >> sys.stderr, "\t%d:" % v, kmer_seq, len(num_ids), predecessors, num_ids
+                    
+
+        # """
+
+        # Generate compressed nodes
+        paths = []
+        path_queue, done = deque(), set()
+        for i in range(len(debruijn)):
+            if len(debruijn[i]) == 0:
+                continue
+            for i2 in range(len(debruijn[i])):
+                path_queue.append("%d-%d" % (i, i2))
+            break
+
+        while len(path_queue) > 0:
+            i_str = path_queue.popleft()
+            if i_str in done:
+                continue
+
+            i, i2 = i_str.split('-')
+            i, i2 = int(i), int(i2)
+            num_ids = debruijn[i][i2][3]
+            j = i + 1
+            while j < len(debruijn):
+                merge, branch = len(debruijn[j-1]) > len(debruijn[j]), len(debruijn[j-1]) < len(debruijn[j])
+                new_i2 = -1
+                tmp_num_ids = []
+                found = False
+                for j2 in range(len(debruijn[j])):
+                    _, _, predecessors, add_read_ids = debruijn[j][j2]
+                    if len(predecessors) == 0:
+                        branch = True
+                        path_queue.append("%d-%d" % (j, j2))
+                    elif i2 in predecessors:
+                        found = True
+                        # merge into one node
+                        if len(predecessors) > 1:
+                            merge = True
+                        if new_i2 >= 0:
+                            branch = True
+                        new_i2 = j2
+                        tmp_num_ids += add_read_ids
+
+                if merge or branch:
+                    for j2 in range(len(debruijn[j])):
+                        _, _, predecessors, add_num_ids = debruijn[j][j2]
+                        if i2 in predecessors:
+                            path_queue.append("%d-%d" % (j, j2))
+                    break
+                if not found:
+                    break
+                
+                num_ids += tmp_num_ids
+                i2 = new_i2
+                j += 1
+
+            done.add(i_str)
+
+            num_ids = set(num_ids)
+            paths.append([i, j, num_ids])
+
+            if j < len(debruijn) and len(debruijn[j]) == 0:
+                j += 1
+                while j < len(debruijn) and len(debruijn[j]) == 0:
+                    j += 1
+                if j < len(debruijn):
+                    for j2 in range(len(debruijn[j])):
+                        path_queue.append("%d-%d" % (j, j2))
+                        
+
+        def get_mate_num_ids(num_ids):
+            mate_num_ids = set()
+            for num_id in num_ids:
+                read_id = num_to_id[num_id]
+                mate_read_id = get_mate_node_id(read_id)
+                if mate_read_id in id_to_num:
+                    mate_num_id = id_to_num[mate_read_id]
+                    mate_num_ids.add(mate_num_id)
+                    
+            return mate_num_ids
+        
+
+        # Generate a compressed assembly graph
+        def path_cmp(a, b):
+            if a[0] != b[0]:
+                return a[0] - b[0]
+            else:
+                return a[1] - b[1]
+        paths = sorted(paths, cmp=path_cmp)
+
+        # DK - debugging purposes
+        for p in range(len(paths)):
+            if print_msg: print >> sys.stderr, "path:", p, paths[p]
+
+        excl_num_ids = set() # exclusive num ids
+        equiv_list = []
+        p = 0
+        while p < len(paths):
+            left, right, num_ids = paths[p]
+            p2 = p + 1
+            while p2 < len(paths):
+                next_left, next_right, next_num_ids = paths[p2]
+                if next_left >= right:
+                    break
+                p2 += 1
+
+            equiv_list.append([])
+            for i in range(p, p2):
+                left, right, num_ids = paths[i]
+                equiv_list[-1].append([[i], num_ids, num_ids | get_mate_num_ids(num_ids), []])
+                if p + 1 < p2:
+                    assert p + 2 == p2
+                    excl_num_ids |= num_ids
+
+            p = p2
+
+        new_equiv_list = []
+        for classes in equiv_list:
+            if len(classes) > 1:
+                new_equiv_list.append(classes)
+                continue
+            assert len(classes) == 1
+            num_ids = classes[0][1] - excl_num_ids
+            if len(num_ids) <= 0:
+                continue
+            classes[0][1] = num_ids
+            classes[0][2] = num_ids | get_mate_num_ids(num_ids)
+            new_equiv_list.append(classes)
+        equiv_list = new_equiv_list
+
+        known_alleles = False
+        while True:
+            # DK - debugging purposes
+            # """
+            for i in range(len(equiv_list)):
+                classes = equiv_list[i]
+                for j in range(len(classes)):
+                    ids, num_ids, all_ids, alleles = classes[j]
+                    if print_msg: print >> sys.stderr, i, j, ids, len(num_ids), sorted(list(num_ids))[:20], alleles
+
+                if print_msg: print >> sys.stderr
+            # """
+
+            if known_alleles:
+                for i in range(len(equiv_list)):
+                    classes = equiv_list[i]
+                    for j in range(len(classes)):
+                        num_ids = sorted(list(classes[j][1]))
+                        node_id = "(%d-%d)%s" % (i, j, num_to_id[num_ids[0]])
+                        node = self.nodes2[node_id]
+                        node_vars = node.get_var_ids()
+                        max_alleles, max_common = set(), -sys.maxint
+                        for anode in self.predicted_allele_nodes.values():
+                            allele_vars = anode.get_var_ids(node.left, node.right)
+                            tmp_common = len(set(node_vars) & set(allele_vars)) - len(set(node_vars) | set(allele_vars))
+                            if tmp_common > max_common:
+                                max_common = tmp_common
+                                max_alleles = set([anode.id])
+                            elif tmp_common == max_common:
+                                max_alleles.add(anode.id)
+                        classes[j][3] = max_alleles
+
+            
+            best_common_mat, best_stat, best_i, best_i2 = [], -sys.maxint, -1, -1
+            for i in range(len(equiv_list) - 1):
+                classes = equiv_list[i]
+                for i2 in range(i + 1, len(equiv_list)):
+                    classes2 = equiv_list[i2]
+                    common_mat = []
+                    for j in range(len(classes)):
+                        common_mat.append([])
+                        if known_alleles:
+                            ids = classes[j][3]
+                        else:
+                            ids = classes[j][2]
+                        for j2 in range(len(classes2)):
+                            if known_alleles:
+                                ids2 = classes2[j2][3]
+                            else:
+                                ids2 = classes2[j2][2]
+                            common_mat[-1].append(len(ids & ids2))
+
+                    # Calculate stat
+                    common_stat = 0
+                    if len(classes) == 1 or len(classes2) == 1:
+                        for row in common_mat:
+                            common_stat += sum(row)
+                    else:
+                        for row in common_mat:
+                            sorted_row = sorted(row, reverse=True)
+                            common_stat += (sorted_row[0] - sorted_row[1])
+                        if common_mat[0][0] + common_mat[1][1] == \
+                           common_mat[1][0] + common_mat[0][1]:
+                            common_stat = -1
+
+                    if common_stat > best_stat:
+                        best_common_mat, best_stat, best_i, best_i2 = common_mat, common_stat, i, i2
+
+            # DK - debugging purposes
+            # """
+            if print_msg:
+                print >> sys.stderr, "best:", best_i, best_i2, best_stat, best_common_mat
+                print >> sys.stderr
+                print >> sys.stderr
+            # """
+
+            if known_alleles and best_stat < 0:
+                self.remove_nodes(self.nodes2)
+                break
+            if best_stat < 0:
+                known_alleles = True
+                new_nodes = {}
+                for i in range(len(equiv_list)):
+                    classes = equiv_list[i]
+                    for j in range(len(classes)):
+                        ids, num_ids, all_ids, alleles = classes[j]
+                        num_ids = sorted(list(num_ids))
+
+                        # DK - debugging purposes
+                        if print_msg: print >> sys.stderr, i, j, num_ids
+
+                        assert (num_ids) > 0
+                        read_id = num_to_id[num_ids[0]]
+                        node = deepcopy(self.nodes[read_id])
+                        for num_id2 in num_ids[1:]:
+                            read_id2 = num_to_id[num_id2]
+                            node2 = self.nodes[read_id2]
+                            node.combine_with(node2)
+
+                        new_read_id = "(%d-%d)%s" % (i, j, read_id)
+                        node.id = new_read_id
+                        new_read_id not in new_nodes
+                        new_nodes[new_read_id] = node
+                        
+                self.nodes = new_nodes                
+                self.nodes2 = deepcopy(self.nodes)
+                self.remove_nodes(self.nodes)
+                continue
+
+            # DK - for the moment
+            mat = best_common_mat
+            classes, classes2 = equiv_list[best_i], equiv_list[best_i2]
+
+            # Filter vertices further if necessary
+            def del_row(classes, mat, r):
+                return classes[:r] + classes[r+1:], mat[:r] + mat[r+1:]
+            
+            def del_col(classes, mat, c):                    
+                new_mat = []
+                for row in mat:
+                    row = row[:c] + row[c+1:]
+                    new_mat.append(row)
+                return classes[:c] + classes[c+1:], new_mat
+                
+            assert len(classes) <= 2 and len(classes2) <= 2
+            if len(classes) == 2 and len(classes2) == 2:
+                # Check row
+                num_ids1, num_ids2 = len(classes[0][1]), len(classes[1][1])
+                if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1:
+                    row_sum1, row_sum2 = sum(mat[0]), sum(mat[1])
+                    if row_sum1 > max(2, row_sum2 * 6):
+                        classes, mat = del_row(classes, mat, 1)
+                        classes[0][1] -= excl_num_ids
+                    elif row_sum2 > max(2, row_sum1 * 6):
+                        classes, mat = del_row(classes, mat, 0)
+                        classes[0][1] -= excl_num_ids
+                # Check column
+                if len(classes) == 2:
+                    num_ids1, num_ids2 = len(classes2[0][1]), len(classes2[1][1])
+                    if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1:
+                        col_sum1, col_sum2 = mat[0][0] + mat[1][0], mat[0][1] + mat[1][1]
+                        if col_sum1 > max(2, col_sum2 * 6):
+                            classes2, mat = del_col(classes2, mat, 1)
+                            classes2[0][1] -= excl_num_ids
+                        elif col_sum2 > max(2, col_sum1 * 6):
+                            classes2, mat = del_col(classes2, mat, 0)
+                            classes2[0][1] -= excl_num_ids
+
+            merge_list = []
+            def add_merge(classes, classes2, i, j, k):
+                if known_alleles:
+                    num_ids1, num_ids2 = classes[i][1], classes2[j][1]
+                    num_ids1, num_ids2 = sorted(list(num_ids1)), sorted(list(num_ids2))
+                    num_id1, num_id2 = num_ids1[0], num_ids2[0]
+                    node_id1 = "(%d-%d)%s" % (best_i, i, num_to_id[num_id1])
+                    node_id2 = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id2])
+                    node_id3 = "(%d-%d)%s" % (best_i, k, num_to_id[min(num_id1, num_id2)])
+                    merge_list.append([node_id1, node_id2, node_id3])
+
+                classes[i][0] = sorted(classes[i][0] + classes2[j][0])
+                classes[i][1] |= classes2[j][1]
+
+            copy_list = []
+            def add_copy(classes, classes2, i, j, k):
+                if known_alleles:
+                    num_ids = classes2[j][1]
+                    num_ids = sorted(list(num_ids))
+                    num_id = num_ids[0]
+                    node_id = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id])
+                    node_id2 = "(%d-%d)%s" % (best_i, k, num_to_id[num_id])
+                    copy_list.append([node_id, node_id2])
+
+                classes[i] = classes2[j]
+
+            remove_list = []
+            def add_remove(classes, i):
+                if known_alleles:
+                    num_ids = classes[i][1]
+                    num_ids = sorted(list(num_ids))
+                    num_id = num_ids[0]
+                    node_id = "(%d-%d)%s" % (best_i, i, num_to_id[num_id])
+                    remove_list.append([node_id])
+
+                classes = [classes[1-i]]
+                         
+            if len(classes) == 1 and len(classes2) == 1:
+                add_merge(classes, classes2, 0, 0, 0)
+                
+            elif len(classes) == 1:
+                if 0 not in classes[0][0] and \
+                   mat[0][0] > max(2, mat[0][1] * 6) and \
+                   len(classes2[0][1]) > len(classes2[1][1]) * 2:
+                    add_merge(classes, classes2, 0, 0, 0)
+                elif 0 not in classes[0][0] and \
+                     mat[0][1] > max(2, mat[0][0] * 6) and \
+                     len(classes2[1][1]) > len(classes2[0][1]) * 2:
+                    add_merge(classes, classes2, 0, 1, 0)
+                else:
+                    classes.append(deepcopy(classes[0]))
+                    # Handle a special case at 5' end
+                    if 0 in classes[0][0] and len(classes[0][0]) == 1 and mat[0][0] != mat[0][1]:
+                        if mat[0][0] > mat[0][1]:
+                            add_merge(classes, classes2, 0, 0, 0)
+                            add_copy(classes, classes2, 1, 1, 1)
+                        else:
+                            assert mat[0][1] > mat[0][0]
+                            add_copy(classes, classes2, 0, 0, 0)
+                            add_merge(classes, classes2, 1, 1, 1)
+                    else:
+                        add_merge(classes, classes2, 0, 0, 0)
+                        add_merge(classes, classes2, 1, 1, 1)
+                        
+            elif len(classes2) == 1:
+                if mat[0][0] > max(2, mat[1][0] * 6):
+                    add_merge(classes, classes2, 0, 0, 0)
+                    if len(classes[0][1]) > len(classes[1][1]) * 6:
+                        add_remove(classes, 1)
+                elif mat[1][0] > max(2, mat[0][0] * 6):
+                    add_merge(classes, classes2, 1, 0, 0)
+                    if len(classes[1][1]) > len(classes[0][1]) * 6:
+                        add_remove(classes, 0)
+                else:
+                    add_merge(classes, classes2, 0, 0, 0)
+                    add_merge(classes, classes2, 1, 0, 1)
+                    
+            else:                
+                score00 = mat[0][0] + mat[1][1]
+                score01 = mat[0][1] + mat[1][0]
+                if score00 > score01:
+                    add_merge(classes, classes2, 0, 0, 0)
+                    add_merge(classes, classes2, 1, 1, 1)
+                elif score00 < score01:
+                    add_merge(classes, classes2, 0, 1, 0)
+                    add_merge(classes, classes2, 1, 0, 1)
+                else:
+                    break
+
+            for c in range(len(classes)):
+                classes[c][2] = classes[c][1] | get_mate_num_ids(classes[c][1])
+
+            equiv_list[best_i] = classes            
+            equiv_list = equiv_list[:best_i2] + equiv_list[best_i2+1:]
+            
+            if known_alleles:
+                exclude_ids = set()
+                new_nodes = {}
+                for node_id1, node_id2, node_id3 in merge_list:
+                    if self.nodes2[node_id1].left <= self.nodes2[node_id2].left:
+                        node = deepcopy(self.nodes2[node_id1])
+                        node2 = self.nodes2[node_id2]
+                    else:                        
+                        node = deepcopy(self.nodes2[node_id2])
+                        node2 = self.nodes2[node_id1]
+                    node.combine_with(node2)
+                    node.id = node_id3
+                    new_nodes[node_id3] = node
+                    exclude_ids.add(node_id1)
+                    exclude_ids.add(node_id2)
+
+                for node_id1, node_id2 in copy_list:
+                    node = self.nodes2[node_id1]
+                    node.id = node_id2
+                    new_nodes[node_id2] = node
+                    exclude_ids.add(node_id1)
+
+                exclude_ids |= set(remove_list)
+
+                for node_id, node in self.nodes2.items():
+                    if node_id in exclude_ids:
+                        continue
+                    num, id = node_id.split(')')
+                    i, i2 = num[1:].split('-')
+                    i, i2 = int(i), int(i2)
+                    if i > best_i2:
+                        i -= 1
+                    node_id = "(%d-%d)%s" % (i, i2, id)
+                    node.id = node_id
+                    new_nodes[node_id] = node
+                        
+                self.nodes2 = new_nodes
+            
+        # DK - debugging purposes
+        # sys.exit(1)
+            
+        
+    # Display graph information
+    def print_info(self): 
+        print >> sys.stderr, "Backbone len: %d" % len(self.backbone)
+        print >> sys.stderr, "\t%s" % self.backbone   
+
+
+    # Compare nodes and get information
+    def get_node_comparison_info(self, node_dic):
+        assert len(node_dic) > 0
+        nodes = [[id, node.left, node.right] for id, node in node_dic.items()]
+        def node_cmp(a, b):
+            if a[1] != b[1]:
+                return a[1] - b[1]
+            else:
+                return a[2] - b[2]
+        nodes = sorted(nodes, cmp=node_cmp)
+        seqs, colors = [], []
+        for p in range(len(self.backbone)):
+            nts = set()
+            for n in range(len(nodes)):
+                id, left, right = nodes[n]
+                node = node_dic[id]
+                if p >= left and p <= right:
+                    nt_dic = node.seq[p - left]
+                    nt = get_major_nt(nt_dic)
+                    nts.add(nt)
+
+            for n in range(len(nodes)):
+                if p == 0:
+                    seqs.append([])
+                    colors.append([])
+                id, left, right = nodes[n]
+                node = node_dic[id]
+                if p >= left and p <= right:
+                    nt_dic = node.seq[p - left]
+                    nt = get_major_nt(nt_dic)
+                    seqs[n].append(nt)
+                    if nt != self.backbone[p]:
+                        if len(nts) > 1:
+                            colors[n].append('R')
+                        else:
+                            colors[n].append('B')
+                    else:
+                        colors[n].append('N')
+                else:
+                    seqs[n].append(' ')
+
+        assert len(nodes) == len(seqs)
+        for n in range(len(nodes)):
+            node, seq, color = nodes[n], seqs[n], colors[n]
+            new_left, new_right = 0, len(seq) - 1
+            while seq[new_left] == 'D':
+                new_left += 1
+            while seq[new_right] == 'D':
+                new_right -= 1
+
+            node[1] = new_left
+            node[2] = new_right
+            seqs[n] = seq[new_left:new_right+1]
+            colors[n] = color[new_left:new_right+1]
+
+        return nodes, seqs, colors
+
+
+    # Compare nodes
+    def print_node_comparison(self, node_dic):
+        nodes, seqs, colors = self.get_node_comparison_info(node_dic)
+        interval = 100
+        for p in range(0, (len(self.backbone) + interval - 1) / interval * interval, interval):
+            cur_seqs = []
+            for n in range(len(nodes)):
+                id, left, right = nodes[n] # inclusive coordinate
+                right += 1
+                seq = []
+                seq_left, seq_right = max(p, left), min(p+interval, right)
+                if seq_left >= seq_right:
+                    continue
+                if p < left:
+                    seq += ([' '] * (left - p))
+                for s in range(seq_left, seq_right):
+                    nt, color = seqs[n][s-left], colors[n][s-left]
+                    if color in "RB":
+                        if color == 'R':
+                            nt = "\033[91m" + nt
+                        else:
+                            nt = "\033[94m" + nt
+                        nt += "\033[00m"        
+                    seq.append(nt)
+                if right < p + interval:
+                    seq += ([' '] * (p + interval - right))
+                seq = ''.join(seq)
+                cur_seqs.append([seq, id])
+
+            if len(cur_seqs) <= 0:
+                continue
+                
+            print >> sys.stderr, p
+            for seq, id in cur_seqs:
+                print >> sys.stderr, "\t", seq, id
+                                
+        
+    # Begin drawing graph
+    def begin_draw(self, fname_base):
+        assert len(self.nodes) > 0
+        nodes = [[id, node.left, node.right] for id, node in self.nodes.items()]
+        def node_cmp(a, b):
+            return a[1] - b[1]
+        nodes = sorted(nodes, cmp=node_cmp)
+
+        def get_x(x):
+            return self.left_margin + x * self.scalex
+
+        def get_y(y):
+            return self.top_margin + y * self.scaley
+
+        # Get scalar
+        def get_sx(x):
+            return x * self.scalex
+
+        def get_sy(y):
+            return y * self.scaley
+
+        htmlDraw = self.htmlDraw = HtmlDraw(fname_base)
+        htmlDraw.write_html_css(self.width, self.height)
+        htmlDraw.start_js()
+        # htmlDraw.draw_smile()
+        js_file = htmlDraw.js_file
+
+        # Choose font
+        print >> js_file, r'ctx.font = "12px Times New Roman";'
+
+        # Draw vertical dotted lines at every 100nt and thick lines at every 500nt
+        print >> js_file, r'ctx.fillStyle = "gray";'
+        for pos in range(0, nodes[-1][2], 100):
+            if pos != 0 and pos % 500 == 0:
+                print >> js_file, r'ctx.setLineDash([]);'
+                print >> js_file, r'ctx.lineWidth = 1;'
+            else:
+                print >> js_file, r'ctx.setLineDash([5, 15]);'
+                print >> js_file, r'ctx.lineWidth = 0.2;'
+
+            print >> js_file, r'ctx.beginPath();'
+            print >> js_file, r'ctx.moveTo(%d, %d);' % \
+                (get_x(pos), self.top_margin)
+            print >> js_file, r'ctx.lineTo(%d, %d);' % \
+                (get_x(pos), self.height)
+            print >> js_file, r'ctx.stroke();'
+
+        print >> js_file, r'ctx.setLineDash([]);'
+
+
+    # End drawing graph
+    def end_draw(self):
+        self.htmlDraw.end_js()
+
+        
+    # Draw graph
+    #   Top left as (0, 0) and Bottom right as (width, height)
+    def draw(self,
+             begin_y,
+             title = ""):
+        assert len(self.nodes) > 0
+        nodes = [[id, node.left, node.right] for id, node in self.nodes.items()]
+        def node_cmp(a, b):
+            return a[1] - b[1]
+        nodes = sorted(nodes, cmp=node_cmp)
+        max_right = len(self.backbone)
+
+        # display space
+        end_y = self.unscaled_height if begin_y > 0 else self.unscaled_height * 0.8
+        dspace = [[[begin_y, end_y]]] * (max_right + 1)
+        def get_dspace(left, right, height):
+            assert left < len(dspace) and right < len(dspace)
+            range1 = dspace[left]
+            for range2 in dspace[left + 1:right + 1]:
+                new_range = []
+                # sub range
+                for t1, b1 in range1:
+                    for t2, b2 in range2:
+                        if b1 < t2:
+                            break
+                        if b2 < t1:
+                            continue
+                        t, b = max(t1, t2), min(b1, b2)
+                        if b - t >= height:
+                            new_range.append([t, b])
+
+                range1 = new_range
+            if len(range1) <= 0:
+                return -1
+
+            t, b = range1[0]
+            assert b - t >= height
+            b = t + height
+            for i in range(left, right+1):
+                range1 = dspace[i]
+                range2 = []
+                found = False
+                for j in range(len(range1)):
+                    t2, b2 = range1[j]
+                    if t2 <= t and b <= b2:
+                        found = True
+                        if t2 < t:
+                            range2.append([t2, t])
+                        if b < b2:
+                            range2.append([b, b2])
+                    else:
+                        range2.append([t2, b2])
+                dspace[i] = range2
+                assert found
+            return t
+
+        def get_x(x):
+            return self.left_margin + x * self.scalex
+
+        def get_y(y):
+            return self.top_margin + y * self.scaley
+
+        # Get scalar
+        def get_sx(x):
+            return x * self.scalex
+
+        def get_sy(y):
+            return y * self.scaley
+
+        htmlDraw = self.htmlDraw
+        # htmlDraw.draw_smile()
+        js_file = htmlDraw.js_file
+
+        # Draw exons
+        y = get_dspace(0, max_right, 14)
+        for e in range(len(self.exons)):
+            left, right = self.exons[e]
+            right += 1
+
+            # Draw node
+            print >> js_file, r'ctx.beginPath();'
+            print >> js_file, r'ctx.rect(%d, %d, %d, %d);' % \
+                (get_x(left), get_y(y), get_x(right) - get_x(left), get_sy(10))
+            print >> js_file, r'ctx.fillStyle = "white";'
+            print >> js_file, r'ctx.fill();'
+            print >> js_file, r'ctx.lineWidth = 2;'
+            print >> js_file, r'ctx.strokeStyle = "black";'
+            print >> js_file, r'ctx.stroke();'
+
+            # Draw label
+            print >> js_file, r'ctx.fillStyle = "blue";'
+            print >> js_file, r'ctx.fillText("Exon %d", %d, %d);' % \
+                (e+1, get_x(left + 2), get_y(y + 7))
+
+            if e > 0:
+                prev_right = self.exons[e-1][1] + 1
+                print >> js_file, r'ctx.beginPath();'
+                print >> js_file, r'ctx.moveTo(%d, %d);' % (get_x(left), get_y(y + 5))
+                print >> js_file, r'ctx.lineTo(%d, %d);' % (get_x(prev_right), get_y(y + 5))
+                print >> js_file, r'ctx.stroke();'
+
+        # Draw true or predicted alleles
+        node_colors = ["#FFFF00", "#00FF00", "#FFCBA4", "#C14581"]
+        allele_node_colors = ["#DDDD00", "#008800", "#DDA982", "#A12561"]
+        def draw_alleles(allele_node_dic, allele_node_colors, display = False):
+            if len(allele_node_dic) <= 0:
+                return
+            allele_nodes, seqs, colors = self.get_node_comparison_info(allele_node_dic)
+            for n_ in range(len(allele_nodes)):
+                n = -1
+                prob = ""
+                if not display and \
+                   not self.simulation and \
+                   len(self.allele_node_order) == len(allele_node_dic):
+                    allele_id, prob = self.allele_node_order[n_]
+                    for n2_ in range(len(allele_nodes)):
+                        if allele_id == allele_nodes[n2_][0]:
+                            n = n2_
+                            break
+                    prob = ": %.2f" % prob
+                else:
+                    n = n_
+                assert n >= 0 and n < len(allele_nodes)
+                allele_id, left, right = allele_nodes[n]
+                right += 1
+                allele_node = allele_node_dic[allele_id]
+                y = get_dspace(0, max_right, 14)
+
+                # Draw allele name
+                if display:
+                    allele_type = "Omixon"
+                else:
+                    if self.simulation:
+                        allele_type = "true"
+                    else:
+                        allele_type = "predicted"
+                print >> js_file, r'ctx.fillStyle = "blue";'
+                print >> js_file, r'ctx.font = "20px Times New Roman";'
+                print >> js_file, r'ctx.fillText("%s (%s, %s%s)", %d, %d);' % \
+                    (allele_id,
+                     "partial" if allele_id in self.partial_allele_ids else "full",
+                     allele_type,
+                     # prob,
+                     "",
+                     10,
+                     get_y(y + 5))
+                print >> js_file, r'ctx.font = "12px Times New Roman";'
+        
+                # Draw node
+                print >> js_file, r'ctx.beginPath();'
+                print >> js_file, r'ctx.rect(%d, %d, %d, %d);' % \
+                    (get_x(left), get_y(y), get_x(right) - get_x(left), get_sy(10))
+                print >> js_file, r'ctx.fillStyle = "%s";' % (allele_node_colors[n % len(allele_node_colors)])
+                print >> js_file, r'ctx.fill();'
+                print >> js_file, r'ctx.lineWidth = 2;'
+                print >> js_file, r'ctx.strokeStyle = "black";'
+                print >> js_file, r'ctx.stroke();'
+
+                color_boxes = []
+                c = 0
+                while c < len(colors[n]):
+                    color = colors[n][c]
+                    c2 = c + 1
+                    if color != 'N':                        
+                        while c2 < len(colors[n]):
+                            color2 = colors[n][c2]
+                            if color != color2:
+                                break
+                            c2 += 1
+                        color_boxes.append([c, c2, color])
+                    c = c2
+
+                # Draw variants
+                for color_box in color_boxes:
+                    cleft, cright, color = color_box
+                    cleft += left; cright += left
+                    if color == 'B':
+                        color = "blue" 
+                    else:
+                        color = "#1E90FF"
+                    # DK - debugging purposes
+                    color = "blue"
+                    print >> js_file, r'ctx.beginPath();'
+                    print >> js_file, r'ctx.rect(%d, %d, %d, %d);' % \
+                        (get_x(cleft), get_y(y + 1), get_x(cright) - get_x(cleft), get_sy(8))
+                    print >> js_file, r'ctx.fillStyle = "%s";' % (color)
+                    print >> js_file, r'ctx.fill();'
+            return allele_nodes, seqs, colors
+
+        allele_nodes, seqs, colors = draw_alleles(self.true_allele_nodes if self.simulation else self.predicted_allele_nodes,
+                                                  allele_node_colors)
+        draw_alleles(self.display_allele_nodes,
+                     ["#FFF5EE"],
+                     True) # display alleles?
+
+        # Draw location at every 100bp
+        y = get_dspace(0, nodes[-1][2], 14)
+        for pos in range(0, nodes[-1][2], 100):
+            # Draw label
+            print >> js_file, r'ctx.fillStyle = "blue";'
+            print >> js_file, r'ctx.fillText("%d", %d, %d);' % \
+                (pos, get_x(pos+2), get_y(y + 2))
+
+        # Draw nodes
+        node_to_y = {}
+        draw_title = False
+        for id, left, right in nodes:
+            node = self.nodes[id]
+
+            # Get y position
+            y = get_dspace(left, right, 14)
+            if y < 0:
+                continue
+            node_to_y[id] = y
+
+            node_vars = node.get_vars()
+            node_var_ids = node.get_var_ids()
+            if len(allele_nodes) > 0:
+                color = "white"
+                max_common = -sys.maxint
+                for a in range(len(allele_nodes)):
+                    allele_node_id, allele_left, allele_right = allele_nodes[a]
+                    if right - left <= 500 and (left < allele_left or right > allele_right):
+                        continue
+                    if self.simulation:
+                        allele_node = self.true_allele_nodes[allele_node_id]
+                    else:
+                        allele_node = self.predicted_allele_nodes[allele_node_id]
+                    allele_vars = allele_node.get_var_ids(left, right)
+                    common_vars = set(node_var_ids) & set(allele_vars)
+                    tmp_common = len(common_vars) - len(set(node_var_ids) | set(allele_vars))
+                    if max_common < tmp_common:
+                        max_common = tmp_common
+                        color = node_colors[a % len(node_colors)]
+                    elif max_common == tmp_common:
+                        color = "white"
+            else:
+                color = "yellow"    
+
+            # Draw node
+            right += 1
+            print >> js_file, r'ctx.beginPath();'
+            print >> js_file, r'ctx.rect(%d, %d, %d, %d);' % \
+                (get_x(left), get_y(y), get_x(right) - get_x(left), get_sy(10))
+            print >> js_file, r'ctx.fillStyle = "%s";' % color
+            print >> js_file, r'ctx.fill();'
+            print >> js_file, r'ctx.lineWidth = 2;'
+            print >> js_file, r'ctx.strokeStyle = "black";'
+            print >> js_file, r'ctx.stroke();'
+
+            # Draw variants
+            for var_id, pos in node_vars:
+                if var_id == "gap":
+                    var_type, var_left = "single", pos
+                    color = "black"
+                elif var_id == "unknown" or var_id.startswith("nv"):
+                    var_type, var_left = "single", pos
+                    color = "red"
+                else:
+                    var_type, var_left, var_data = self.gene_vars[var_id]
+                    color = "blue"
+                if var_type == "single":
+                    var_right = var_left + 1
+                else:
+                    assert var_type == "deletion"
+                    var_right = var_left + int(var_data)
+                print >> js_file, r'ctx.beginPath();'
+                print >> js_file, r'ctx.rect(%d, %d, %d, %d);' % \
+                    (get_x(var_left), get_y(y + 1), get_x(var_right) - get_x(var_left), get_sy(8))
+                print >> js_file, r'ctx.fillStyle = "%s";' % (color)
+                print >> js_file, r'ctx.fill();'
+
+            # Draw label
+            if get_sx(right - left) >= 300:
+                print >> js_file, r'ctx.fillStyle = "blue";'
+                print >> js_file, r'ctx.fillText("%s", %d, %d);' % \
+                    (node.id, get_x(left + 2), get_y(y + 7))
+
+            if not draw_title:
+                draw_title = True
+                print >> js_file, r'ctx.font = "24px Times New Roman";'
+                print >> js_file, r'ctx.fillText("%s", %d, %d);' % \
+                    (title, 10, get_y(y + 7))
+                print >> js_file, r'ctx.font = "12px Times New Roman";'
+
+
+        # Draw edges
+        print >> js_file, r'ctx.lineWidth = 1;'
+        line_colors = ["red", "black", "blue"]
+        for node_id, to_node_ids in self.to_node.items():
+            node = self.nodes[node_id]
+            node_x = (get_x(node.left) + get_x(node.right)) / 2
+            node_y = get_y(node_to_y[node_id] + 5)
+            print >> js_file, r'ctx.strokeStyle = "%s";' % \
+                line_colors[random.randrange(len(line_colors))]
+            for to_node_id, _ in to_node_ids:
+                to_node = self.nodes[to_node_id]
+                to_node_x = (get_x(to_node.left) + get_x(to_node.right) + (random.random() * 10 - 5)) / 2
+                to_node_y = get_y(node_to_y[to_node_id] + 5)
+
+                jitter1, jitter2 = (random.random() * 10 - 5), (random.random() * 10 - 5)
+                jitter1, jitter2 = get_sx(jitter1), get_sx(jitter2)
+
+                print >> js_file, r'ctx.beginPath();'
+                print >> js_file, r'ctx.moveTo(%d, %d);' % (node_x + jitter1, node_y)
+                print >> js_file, r'ctx.lineTo(%d, %d);' % (to_node_x + jitter2, to_node_y)
+                print >> js_file, r'ctx.stroke();'
+
+        curr_y = get_dspace(0, nodes[-1][2], 1)
+        return curr_y if curr_y > 0 else end_y
+
+        
+class HtmlDraw:
+    def __init__(self, base_fname):
+        self.base_fname = base_fname
+
+        
+    def write_html_css(self, width = 2000, height = 1000):
+        base_fname = self.base_fname
+        html_file = open("%s.html" % base_fname, 'w')
+        print >> html_file, r'<!DOCTYPE html>'
+        print >> html_file, r'<html>'
+        print >> html_file, r'<head>'
+        print >> html_file, r'<title>HISAT-genotyping HLA</title>'
+        print >> html_file, r'<link rel="stylesheet" type="text/css" href="%s.css"/>' % (base_fname.split("/")[-1])
+        print >> html_file, r'</head>'
+        print >> html_file, r'<body>'
+        print >> html_file, r'<canvas id="a" width="%d" height="%d">' % (width, height)
+        print >> html_file, r'This text is displayed if your browser does not support HTML5 Canvas.'
+        print >> html_file, r'</canvas>'
+        print >> html_file, r'<script type="text/javascript" src="%s.js"></script>' % (base_fname.split("/")[-1])
+        print >> html_file, r'</body>'
+        print >> html_file, r'</html>'
+        html_file.close()
+
+        css_file = open("%s.css" % base_fname, 'w')
+        print >> css_file, r'canvas {'
+        print >> css_file, r'border: 1px dotted black;'
+        print >> css_file, r'}'
+        css_file.close()
+
+        
+    def start_js(self):
+        self.js_file = open("%s.js" % self.base_fname, 'w')
+        print >> self.js_file, r'var a_canvas = document.getElementById("a");'
+        print >> self.js_file, r'var ctx = a_canvas.getContext("2d");'
+
+        
+    def end_js(self):
+        self.js_file.close()
+
+        
+    def draw_smile(self):
+        js_file = self.js_file
+        
+        # Draw the face
+        print >> js_file, r'ctx.fillStyle = "yellow";'
+        print >> js_file, r'ctx.beginPath();'
+        print >> js_file, r'ctx.arc(95, 85, 40, 0, 2*Math.PI);'
+        print >> js_file, r'ctx.closePath();'
+        print >> js_file, r'ctx.fill();'
+        print >> js_file, r'ctx.lineWidth = 2;'
+        print >> js_file, r'ctx.stroke();'
+        print >> js_file, r'ctx.fillStyle = "black";'
+        
+        # Draw the left eye
+        print >> js_file, r'ctx.beginPath();'
+        print >> js_file, r'ctx.arc(75, 75, 5, 0, 2*Math.PI);'
+        print >> js_file, r'ctx.closePath();'
+        print >> js_file, r'ctx.fill();'
+
+        # Draw the right eye
+        print >> js_file, r'ctx.beginPath();'
+        print >> js_file, r'ctx.arc(114, 75, 5, 0, 2*Math.PI);'
+        print >> js_file, r'ctx.closePath();'
+        print >> js_file, r'ctx.fill();'
+
+        # Draw the mouth
+        print >> js_file, r'ctx.beginPath();'
+        print >> js_file, r'ctx.arc(95, 90, 26, Math.PI, 2*Math.PI, true);'
+        print >> js_file, r'ctx.closePath();'
+        print >> js_file, r'ctx.fill();'
+
+        # Write "Hello, World!"
+        print >> js_file, r'ctx.font = "30px Garamond";'
+        print >> js_file, r'ctx.fillText("Hello, World!", 15, 175);'
+       
diff --git a/hisatgenotype_modules/hisatgenotype_convert_codis.py b/hisatgenotype_modules/hisatgenotype_convert_codis.py
new file mode 100755
index 0000000..9c4d1ee
--- /dev/null
+++ b/hisatgenotype_modules/hisatgenotype_convert_codis.py
@@ -0,0 +1,627 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2017, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import os, sys, subprocess, re
+import inspect, operator
+from copy import deepcopy
+from argparse import ArgumentParser, FileType
+import typing_common
+
+# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm
+
+orig_CODIS_seq = {
+    "CSF1PO" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_CSF1PO.htm
+    # allele 13: 5:150076172-150076490 - (samtools faidx genome.fa - GRCh38)
+    ["[AGAT]13",
+     "AACCTGAGTCTGCCAAGGACTAGCAGGTTGCTAACCACCCTGTGTCTCAGTTTTCCTACCTGTAAAATGAAGATATTAACAGTAACTGCCTTCATAGATAGAAGATAGATAGATT", # left flanking sequence
+     "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", # STR
+     "AGGAAGTACTTAGAACAGGGTCTGACACAGGAAATGCTGTCCAAGTGTGCACCAGGAGATAGTATCTGAGAAGGCTCAGTCTGGCACCATGTGGGTTGGGTGGGAACCTGGAGGCTGGAGAATGGGCTGAAGATGGCCAGTGGTGTGTGGAA"], # right flanking sequence
+             
+    "FGA" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_FGA.htm
+    # allele 22: 4:154587696-154587891 -
+    ["[TTTC]3TTTTTTCT[CTTT]14CTCC[TTCC]2",
+     "GCCCCATAGGTTTTGAACTCACAGATTAAACTGTAACCAAAATAAAATTAGGCATATTTACAAGCTAG",
+     "TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC",
+     "TTTCTTCCTTTCTTTTTTGCTGGCAATTACAGACAAATCA"],
+
+    "TH01" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_TH01.htm
+    # allele 7: 11:2170990-2171176 +
+    ["[AATG]7",
+     "GTGGGCTGAAAAGCTCCCGATTATCCAGCCTGGCCCACACAGTCCCCTGTACACAGGGCTTCCGAGTGCAGGTCACAGGGAACACAGACTCCATGGTG",
+     "AATGAATGAATGAATGAATGAATGAATG",
+     "AGGGAAATAAGGGAGGAACAGGCCAATGGGAATCACCCCAGAGCCCAGATACCCTTTGAAT"],
+             
+    "TPOX" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_TPOX.htm
+    # allele 8: 2:1489617-1489848
+    ["[AATG]8",
+     "ACTGGCACAGAACAGGCACTTAGGGAACCCTCACTG",
+     "AATGAATGAATGAATGAATGAATGAATGAATG",
+     "TTTGGGCAAATAAACGCTGACAAGGACAGAAGGGCCTAGCGGGAAGGGAACAGGAGTAAGACCAGCGCACAGCCCGACTTGTGTTCAGAAGACCTGGGATTGGACCTGAGGAGTTCAATTTTGGATGAATCTCTTAATTAACCTGTGGGGTTCCCAGTTCCTCC"],
+             
+    "VWA" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_VWA.htm
+    # allele unknown: 12:5983938-5984087 -
+    ["TCTA[TCTG]5[TCTA]11TCCA TCTA",
+     "CCCTAGTGGATGATAAGAATAATCAGTATGTGACTTGGATTGA",
+     "TCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA",
+     "TCCATCCATCCTATGTATTTATCATCTGTCC"],
+             
+    "D3S1358" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D3S1358.htm
+    # allele unknown: 3:45540713-45540843 +
+    ["TCTATCTG[TCTA]14",
+     "ATGAAATCAACAGAGGCTTGCATGTA",
+     "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA",
+     "TGAGACAGGGTCTTGCTCTGTCACCCAGATTGGACTGCAGT"],
+             
+    "D5S818" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D5S818.htm
+    # allele 11: 5:123775504-123775638 -
+    ["[AGAT]11",
+     "GGTGATTTTCCTCTTTGGTATCCTTATGTAATATTTTGA",
+     "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT",
+     "AGAGGTATAAATAAGGATACAGATAAAGATACAAATGTTGTAAACTGTGGCT"],
+             
+    "D7S820" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D7S820.htm
+    # allele 13: 7:84160125-84160367 -
+    ["[GATA]13",
+     "ATGTTGGTCAGGCTGACTATGGAGTTATTTTAAGGTTAATATATATAAAGGGTATGATAGAACACTTGTCATAGTTTAGAACGAACTAAC",
+     "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA",
+     "GACAGATTGATAGTTTTTTTTAATCTCACTAAATAGTCTATAGTAAACATTTAATTACCAATATTTGGTGCAATTCTGTCAATGAGGATAAATGTGGAATC"],
+             
+    "D8S1179" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D8S1179.htm
+    # allele 13: 8:124894838-124895018 +
+    ["[TCTA]1[TCTG]1[TCTA]11",
+     "TTTTTGTATTTCATGTGTACATTCGTA",
+     "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA",
+     "TTCCCCACAGTGAAAATAATCTACAGGATAGGTAAATAAATTAAGGCATATTCACGCAATGGGATACGATACAGTGATGAAAATGAACTAATTATAGCTACG"],
+             
+    "D13S317" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D13S317.htm
+    # Perhaps, allele 11: 13:82147921-82148112 +
+    ["[TATC]11",
+     "ATCACAGAAGTCTGGGATGTGGAGGAGAGTTCATTTCTTTAGTGGGCATCCGTGACTCTCTGGACTCTGACCCATCTAACGCCTATCTGTATTTACAAATACAT",
+     "TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC",
+     "AATCAATCATCTATCTATCTTTCTGTCTGTCTTTTTGGGCTGCC"],
+             
+    "D16S539" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D16S539.htm
+    # allele 11: 16:86352518-86352805 +
+    ["[GATA]11",
+     "GGGGGTCTAAGAGCTTGTAAAAAGTGTACAAGTGCCAGATGCTCGTTGTGCACAAATCTAAATGCAGAAAAGCACTGAAAGAAGAATCCAGAAAACCACAGTTCCCATTTTTATATGGGAGCAAACAAAGGCAGATCCCAAGCTCTTCCTCTTCCCTAGATCAATACAGACAGACAGACAGGTG",
+     "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA",
+     "TCATTGAAAGACAAAACAGAGATGGATGATAGATACATGCTTACAGATGCACACACAAAC"],
+             
+    "D18S51" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D18S51.htm
+    # allele 18: 18:63281611-63281916 +
+    ["[AGAA]18",
+     "GAGCCATGTTCATGCCACTGCACTTCACTCTGAGTGACAAATTGAGACCTTGTCTC",
+     "AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA",
+     "AAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTAAGACATCTCCACACACCAGAGAAGTTAATTTTAATTTTAACATGTTAAGAACAGAGAGAAGCCAACATGTCCACCTTAGGCTGACGGTTTGTTTATTTGTGTTGTTGCTGGTAGTCGGGTTTG"],
+             
+    "D21S11" :
+    # http://www.cstl.nist.gov/biotech/strbase/str_D21S11.htm
+    # Perhaps, allele 29: 21:19181945-19182165 +
+    ["[TCTA]4[TCTG]6[TCTA]3TA[TCTA]3TCA[TCTA]2TCCATA[TCTA]11",
+     "GTGAGTCAATTCCCCAAGTGAATTGCCT",
+     "TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA",
+     "TCGTCTATCTATCCAGTCTATCTACCTCCTATTAGTCTGTCTCTGGAGAACATTGACTAATACAAC"]
+}
+
+# "AMEL" - http://www.cstl.nist.gov/biotech/strbase/jpg_amel.htm
+#          X chromosome has 6 bp deletion and Y chromosome doesn't
+
+CODIS_ref_name = {}
+
+
+"""
+"""
+def get_flanking_seqs(seq,
+                      flank_len = 500):
+    def align_seq(seq):
+        aligner_cmd = ["hisat2",
+                       "--score-min", "C,0",
+                       "--no-unal",
+                        "-x", "grch38/genome",
+                        "-c", seq]
+        align_proc = subprocess.Popen(aligner_cmd,
+                                      stdout=subprocess.PIPE,
+                                      stderr=open("/dev/null", 'w'))
+        chr, left, right, strand = "", -1, -1, '+'
+        for line in align_proc.stdout:
+            if line.startswith('@'):
+                continue
+            line = line.strip()
+            cols = line.split()
+            allele_id, flag, chr, left, _, cigar_str = cols[:6]
+            assert cigar_str[-1] == 'M'
+            left = int(left)
+            flag = int(flag)
+            strand = '-' if flag & 0x10 else '+'
+            assert cigar_str == ("%dM" % len(seq))
+            right = left + len(seq)
+            break
+        
+        assert chr != "" and left >= 0 and right > left
+        return chr, left, right, strand
+    
+    chr, left, right, strand = align_seq(seq)    
+    left_flank_seq, right_flank_seq = "", ""
+    if left > 1:
+        extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, max(1, left - flank_len), left - 1)]
+        extract_seq_proc = subprocess.Popen(extract_seq_cmd,
+                                            stdout=subprocess.PIPE,
+                                            stderr=open("/dev/null", 'w'))
+        for line in extract_seq_proc.stdout:
+            if line.startswith('>'):
+                continue
+            line = line.strip()
+            left_flank_seq += line
+    extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + flank_len - 1)]
+    extract_seq_proc = subprocess.Popen(extract_seq_cmd,
+                                        stdout=subprocess.PIPE,
+                                        stderr=open("/dev/null", 'w'))
+    for line in extract_seq_proc.stdout:
+        if line.startswith('>'):
+            continue
+        line = line.strip()
+        right_flank_seq += line
+
+    if strand == '-':
+        left_flank_seq, right_flank_seq = typing_common.reverse_complement(right_flank_seq), typing_common.reverse_complement(left_flank_seq)
+
+    chr, _, _, _ = align_seq(left_flank_seq + seq + right_flank_seq)
+    assert chr != ""
+    
+    return left_flank_seq, right_flank_seq
+
+
+
+"""
+"""
+def get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j):
+    if repeat_i == repeat_j:
+        return 0
+    elif repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1]):
+        return -1
+    else:
+        return -2
+
+    
+"""
+Smith Waterman Algorithm
+"""
+def SW_alignment(allele_i, allele_j):
+    n, m = len(allele_i), len(allele_j)
+    a = [[0 for j in range(m)] for i in range(n)]
+
+    # Fill 2D array
+    for i in range(n):
+        repeat_i, repeat_nums_i = allele_i[i]
+        for j in range(m):
+            repeat_j, repeat_nums_j = allele_j[j]
+            if i == 0:
+                if j == 0:
+                    if repeat_i == repeat_j:
+                        a[i][j] = 0
+                    else:
+                        a[i][j] = -1
+                else:
+                    assert j > 0
+                    a[i][j] = a[i][j-1] - 1
+            elif j == 0:
+                assert i > 0
+                a[i][j] = a[i-1][j] - 1
+            else:
+                equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j)
+                a[i][j] = max(a[i-1][j] - 1, a[i][j-1] - 1, a[i-1][j-1] + equal_score)
+
+    return a, n, m
+
+
+"""
+"""
+def combine_alleles(backbone_allele, add_allele):
+    allele_i, allele_j = backbone_allele, add_allele
+    a, n, m = SW_alignment(allele_i, allele_j)
+
+    # Back tracking
+    new_backbone_allele = []
+    i, j = n - 1, m - 1
+    while i >= 0 and j >= 0:
+        repeat_i, repeat_nums_i = allele_i[i]
+        repeat_j, repeat_nums_j = allele_j[j]
+        if i == 0:
+            if j == 0:
+                if repeat_i == repeat_j:
+                    new_backbone_allele.append([repeat_i, repeat_nums_i | repeat_nums_j])
+                else:
+                    assert repeat_nums_i == repeat_nums_j
+                    assert repeat_nums_i == set([1])
+                    new_backbone_allele.append([repeat_i | repeat_j, repeat_nums_i | repeat_nums_j])
+            else:
+                new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])])
+            j -= 1
+        elif j == 0:
+            assert i > 0
+            new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])])
+            i -= 1
+        else:
+            equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j)
+            if a[i-1][j] - 1 == a[i][j]:
+                new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])])
+                i -= 1
+            elif a[i][j-1] - 1 == a[i][j]:
+                new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])])
+                j -= 1
+            else:
+                assert a[i-1][j-1] + equal_score == a[i][j]
+                if repeat_i == repeat_j:
+                    new_backbone_allele.append([repeat_i, repeat_nums_i | repeat_nums_j])
+                else:
+                    assert repeat_nums_i == repeat_nums_j
+                    assert repeat_nums_i == set([1])
+                    new_backbone_allele.append([repeat_i | repeat_j, repeat_nums_i | repeat_nums_j])
+                i -= 1
+                j -= 1
+
+    new_backbone_allele = new_backbone_allele[::-1]
+
+    return new_backbone_allele
+
+
+"""
+"""
+def msf_alignment(backbone_allele, allele):
+    allele_i, allele_j = backbone_allele, allele
+    a, n, m = SW_alignment(allele_i, allele_j)
+
+    # Back tracking
+    allele_seq, backbone_seq = "", ""
+    i, j = n - 1, m - 1
+    while i >= 0 and j >= 0:
+        repeats_i, repeat_nums_i = allele_i[i]
+        repeat_i = ""
+        max_repeat = ""
+        for repeat_str in repeats_i:
+            if len(repeat_str) > len(repeat_i):
+                repeat_i = repeat_str
+        repeat_num_i = max(repeat_nums_i)
+        repeats_j, repeat_nums_j = allele_j[j]
+        assert len(repeats_j) == 1 and len(repeat_nums_j) == 1
+        repeat_j, repeat_num_j = list(repeats_j)[0], list(repeat_nums_j)[0]
+        if i == 0:
+            assert j == 0
+            if repeats_i == repeats_j:
+                add_seq = repeat_i * repeat_num_j
+                dot_seq = '.' * (len(repeat_i) * (repeat_num_i - repeat_num_j))
+                allele_seq = add_seq + dot_seq + allele_seq
+                add_seq = repeat_i * repeat_num_i
+                backbone_seq = add_seq + backbone_seq
+            else:
+                assert repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1])
+                dot_seq = '.' * (len(repeat_i) - len(repeat_j))
+                allele_seq = repeat_j + dot_seq + allele_seq
+                backbone_seq = repeat_i + backbone_seq                    
+            j -= 1
+        elif j == 0:
+            assert i > 0
+            allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq
+            backbone_seq = repeat_i * repeat_num_i + backbone_seq
+            i -= 1
+        else:
+            equal_score = get_equal_score(repeats_i, repeat_nums_i, repeats_j, repeat_nums_j)
+            if a[i-1][j] - 1 == a[i][j]:
+                allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq
+                backbone_seq = repeat_i * repeat_num_i + backbone_seq
+                i -= 1
+            elif a[i][j-1] - 1 == a[i][j]:
+                assert False
+            else:
+                assert a[i-1][j-1] + equal_score == a[i][j]
+                if repeat_i == repeat_j:
+                    add_seq = repeat_i * repeat_num_j
+                    dot_seq = '.' * (len(repeat_i) * (repeat_num_i - repeat_num_j))
+                    allele_seq = add_seq + dot_seq + allele_seq
+                    add_seq = repeat_i * repeat_num_i
+                    backbone_seq = add_seq + backbone_seq
+                else:
+                    assert repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1])
+                    dot_seq = '.' * (len(repeat_i) - len(repeat_j))
+                    allele_seq = repeat_j + dot_seq + allele_seq
+                    backbone_seq = repeat_i + backbone_seq                    
+                i -= 1
+                j -= 1
+
+    return allele_seq, backbone_seq
+
+
+"""
+Extract multiple sequence alignments
+"""
+def extract_msa(base_dname,  # not referenced in the function body
+                base_fname,  # not referenced in the function body
+                locus_list,  # locus names to keep; an empty list keeps every locus
+                verbose):    # when true, extra details are printed to stderr
+    # Download human genome and HISAT2 index (only when typing_common.check_files() returns a false value)
+    HISAT2_fnames = ["grch38",
+                     "genome.fa",
+                     "genome.fa.fai"]
+    if not typing_common.check_files(HISAT2_fnames):
+        typing_common.download_genome_and_index(ex_path)  # NOTE(review): ex_path is not assigned anywhere in the visible code -- NameError if this line runs; confirm
+    # CODIS_seq aliases orig_CODIS_seq (no copy is made), so the flank extension below edits the module-level lists in place
+    CODIS_seq = orig_CODIS_seq
+    if len(locus_list) > 0:
+        new_CODIS_seq = {}
+        for locus_name, fields in CODIS_seq.items():
+            if locus_name in locus_list:
+                new_CODIS_seq[locus_name] = fields
+        CODIS_seq = new_CODIS_seq        
+
+    # Add some additional sequences to allele sequences to make them reasonably long for typing and assembly
+    for locus_name, fields in CODIS_seq.items():
+        _, left_seq, repeat_seq, right_seq = fields
+        allele_seq = left_seq + repeat_seq + right_seq
+        left_flank_seq, right_flank_seq = get_flanking_seqs(allele_seq)
+        CODIS_seq[locus_name][1] = left_flank_seq + left_seq
+        CODIS_seq[locus_name][3] = right_seq + right_flank_seq
+
+        print >> sys.stderr, "%s is found on the reference genome (GRCh38)" % locus_name
+    
+    for locus_name in CODIS_seq.keys():
+        alleles = []
+        for line in open("hisatgenotype_db/CODIS/codis.dat"):  # read again for every locus; three tab-separated fields per line
+            locus_name2, allele_id, repeat_st = line.strip().split('\t')
+            if locus_name != locus_name2:
+                continue
+            alleles.append([allele_id, repeat_st])
+
+        # From   [TTTC]3TTTTTTCT[CTTT]20CTCC[TTCC]2
+        # To     [['TTTC', [3]], ['TTTTTTCT', [1]], ['CTTT', [20]], ['CTCC', [1]], ['TTCC', [2]]], with every repeat and every count wrapped in a one-element set
+        def read_allele(repeat_st):
+            allele = []
+            s = 0
+            while s < len(repeat_st):
+                ch = repeat_st[s]
+                if ch == ' ':
+                    s += 1
+                    continue
+                assert ch in "[ACGT"
+                if ch == '[':  # bracketed repeat followed by its copy number
+                    s += 1
+                    repeat = ""
+                    while s < len(repeat_st):
+                        nt = repeat_st[s]
+                        if nt in "ACGT":
+                            repeat += nt
+                            s += 1
+                        else:
+                            assert nt == ']'
+                            s += 1
+                            break
+                    assert s < len(repeat_st)  # a copy number must follow the closing bracket
+                    num = 0
+                    while s < len(repeat_st):
+                        digit = repeat_st[s]
+                        if digit.isdigit():
+                            num = num * 10 + int(digit)
+                            s += 1
+                        else:
+                            break
+                    assert num > 0
+                    allele.append([set([repeat]), set([num])])
+                else:  # bare run of bases, stored as a single copy
+                    repeat = ""
+                    while s < len(repeat_st):
+                        nt = repeat_st[s]
+                        if nt in "ACGT":
+                            repeat += nt
+                            s += 1
+                        else:
+                            assert nt == ' ' or nt == '['
+                            break
+                    allele.append([set([repeat]), set([1])])
+
+            # Sanity check: re-serializing the parsed units must reproduce repeat_st (spaces ignored)
+            cmp_repeat_st = ""
+            for repeats, repeat_nums in allele:
+                repeat = list(repeats)[0]
+                repeat_num = list(repeat_nums)[0]
+                if repeat_num > 1 or locus_name == "D8S1179":  # for D8S1179 every unit is written in brackets, single copies included
+                    cmp_repeat_st += "["
+                cmp_repeat_st += repeat
+                if repeat_num > 1 or locus_name == "D8S1179":
+                    cmp_repeat_st += "]%d" % repeat_num
+
+            assert repeat_st.replace(' ', '') == cmp_repeat_st.replace(' ', '')
+            return allele
+
+        alleles = [[allele_id, read_allele(repeat_st)] for allele_id, repeat_st in alleles]
+
+        def to_sequence(repeat_st):  # concatenates each unit's repeat, repeated by its copy number
+            sequence = ""
+            for repeats, repeat_nums in repeat_st:
+                repeat = list(repeats)[0]
+                repeat_num = list(repeat_nums)[0]
+                sequence += (repeat * repeat_num)
+            return sequence
+
+        def remove_redundant_alleles(alleles):  # keeps the first allele of each distinct sequence
+            seq_to_ids = {}
+            new_alleles = []
+            for allele_id, repeat_st in alleles:
+                allele_seq = to_sequence(repeat_st)
+                if allele_seq in seq_to_ids:
+                    print >> sys.stderr, "Warning) %s: %s has the same sequence as %s" % \
+                        (locus_name, allele_id, seq_to_ids[allele_seq])
+                    continue
+                if allele_seq not in seq_to_ids:
+                    seq_to_ids[allele_seq] = [allele_id]
+                else:  # NOTE(review): unreachable, a known sequence already hit the continue above
+                    seq_to_ids[allele_seq].append(allele_id)         
+                new_alleles.append([allele_id, repeat_st])
+
+            return new_alleles
+
+        alleles = remove_redundant_alleles(alleles)
+
+        allele_seqs = [[allele_id, to_sequence(repeat_st)] for allele_id, repeat_st in alleles]
+
+        ref_allele_st, ref_allele_left, ref_allele, ref_allele_right = CODIS_seq[locus_name]
+        ref_allele_st = read_allele(ref_allele_st)
+        for allele_id, allele_seq in allele_seqs:
+            if ref_allele == allele_seq:
+                CODIS_ref_name[locus_name] = allele_id
+                break
+            
+        # Add GRCh38 allele when no reference name has been recorded for this locus
+        if locus_name not in CODIS_ref_name:
+            allele_id = "GRCh38"
+            CODIS_ref_name[locus_name] = allele_id
+            allele_seqs = [[allele_id, ref_allele]] + allele_seqs
+            alleles = [[allele_id, ref_allele_st]] + alleles
+
+        print >> sys.stderr, "%s: %d alleles with reference allele as %s" % (locus_name, len(alleles), CODIS_ref_name[locus_name])
+        if verbose:
+            print >> sys.stderr, "\t", ref_allele_left, ref_allele, ref_allele_right
+            for allele_id, allele in alleles:
+                print >> sys.stderr, allele_id, "\t", allele
+
+        # Create a backbone sequence by folding the alleles in, from the last one to the first
+        assert len(alleles) > 0
+        backbone_allele = deepcopy(alleles[-1][1])
+        for allele_id, allele_st in reversed(alleles[:-1]):
+            if verbose:
+                print >> sys.stderr
+                print >> sys.stderr, allele_id
+                print >> sys.stderr, "backbone         :", backbone_allele
+                print >> sys.stderr, "allele           :", allele_st
+            backbone_allele = combine_alleles(backbone_allele, allele_st)
+            msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st)  # result is only read by the verbose output below
+            if verbose:                
+                print >> sys.stderr, "combined backbone:", backbone_allele
+                print >> sys.stderr, "msf_allele_seq  :", msf_allele_seq
+                print >> sys.stderr, "msf_backbone_seq:", msf_backbone_seq
+                print >> sys.stderr
+
+        allele_dic = {}
+        for allele_id, allele_seq in allele_seqs:
+            allele_dic[allele_id] = allele_seq
+
+        allele_repeat_msf = {}
+        for allele_id, allele_st in alleles:
+            msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st)
+            allele_repeat_msf[allele_id] = msf_allele_seq
+
+        # Sanity check: every allele's aligned repeat string must have the same length
+        assert len(allele_dic) == len(allele_repeat_msf)
+        repeat_len = -1
+        for repeat_msf in allele_repeat_msf.values():
+            if repeat_len < 0:
+                repeat_len = len(repeat_msf)
+            else:
+                assert repeat_len == len(repeat_msf)
+
+        # Create full multiple sequence alignment
+        ref_allele_id = CODIS_ref_name[locus_name]  # not read again in this function
+        allele_msf = {}
+        for allele_id, repeat_msf in allele_repeat_msf.items():
+            allele_msf[allele_id] = ref_allele_left + repeat_msf + ref_allele_right
+
+        # Make sure the length of allele ID is short, less than 20 characters
+        max_allele_id_len = max([len(allele_id) for allele_id in allele_dic.keys()])
+        assert max_allele_id_len < 20
+
+        # Write MSF (multiple sequence alignment file) in blocks of 50 columns, each printed as groups of 10
+        msf_len = len(ref_allele_left) + len(ref_allele_right) + repeat_len
+        msf_fname = "%s_gen.msf" % locus_name
+        msf_file = open(msf_fname, 'w')
+        for s in range(0, msf_len, 50):
+            for allele_id, msf in allele_msf.items():
+                assert len(msf) == msf_len
+                allele_name = "%s*%s" % (locus_name, allele_id)
+                print >> msf_file, "%20s" % allele_name,
+                for s2 in range(s, min(msf_len, s + 50), 10):
+                    print >> msf_file, " %s" % msf[s2:s2+10],
+                print >> msf_file
+
+            if s + 50 >= msf_len:
+                break
+            print >> msf_file
+        msf_file.close()
+
+        # Write FASTA file (ungapped allele sequences with both flanks, 60 bases per line)
+        fasta_fname = "%s_gen.fasta" % locus_name
+        fasta_file = open(fasta_fname, 'w')
+        for allele_id, allele_seq in allele_seqs:
+            gen_seq = ref_allele_left + allele_seq + ref_allele_right
+            print >> fasta_file, ">%s*%s %d bp" % (locus_name, allele_id, len(gen_seq))
+            for s in range(0, len(gen_seq), 60):
+                print >> fasta_file, gen_seq[s:s+60]
+        fasta_file.close()
+
+
+"""
+"""
+if __name__ == '__main__':
+    parser = ArgumentParser(
+        description="Extract multiple sequence alignments for DNA Fingerprinting loci")
+    parser.add_argument("-b", "--base",
+                        dest="base_fname",
+                        type=str,
+                        default="codis",
+                        help="base filename (default: codis)")
+    parser.add_argument("--locus-list",
+                        dest="locus_list",
+                        type=str,
+                        default="",
+                        help="base filename (default: empty)")    
+    parser.add_argument("-v", "--verbose",
+                        dest="verbose",
+                        action="store_true",
+                        help="also print some statistics to stderr")
+
+    args = parser.parse_args()
+    if args.base_fname.find('/') != -1:
+        elems = args.base_fname.split('/')
+        base_fname = elems[-1]
+        base_dname = '/'.join(elems[:-1])
+    else:
+        base_fname = args.base_fname
+        base_dname = ""
+    if args.locus_list != "":
+        locus_list = args.locus_list.split(',')
+    else:
+        locus_list = []
+        
+    extract_msa(base_dname,
+                base_fname,
+                locus_list,
+                args.verbose)
+
diff --git a/hisatgenotype_modules/hisatgenotype_extract_codis_data.py b/hisatgenotype_modules/hisatgenotype_extract_codis_data.py
new file mode 100755
index 0000000..e016169
--- /dev/null
+++ b/hisatgenotype_modules/hisatgenotype_extract_codis_data.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2017, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import os, sys, subprocess, re
+import inspect, operator
+from argparse import ArgumentParser, FileType
+
+# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm
+
+# The 13 STR locus names handled by download_codis(); each one is substituted
+# into the per-locus page URL "<base_url>/str_<locus>.htm".
+CODIS_loci = ["CSF1PO", "FGA", "TH01", "TPOX", "VWA", "D3S1358", "D5S818", "D7S820", "D8S1179", "D13S317", "D16S539", "D18S51", "D21S11"]
+
+
+"""
+## Download variant information from website
+"""
+def get_html(url):
+    """Fetch `url` with the external `wget` program and return the response body.
+
+    wget is asked to write the document to stdout ("-O -"); its stderr output
+    is discarded.  Whatever wget wrote to stdout is returned as one string,
+    which is empty when wget produced no output.
+    """
+    download_cmd = ["wget",
+                    "-O", "-",
+                    url]
+    # Use a context manager so the null-device handle is closed, and
+    # communicate() so the child is waited for and its output is read in one
+    # go instead of being concatenated line by line.
+    with open(os.devnull, 'w') as devnull:
+        proc = subprocess.Popen(download_cmd,
+                                stdout=subprocess.PIPE,
+                                stderr=devnull)
+        output, _ = proc.communicate()
+
+    return output
+
+
+"""
+Download CODIS data
+"""
+def download_codis(base_dname,
+                   base_fname,
+                   locus_list,
+                   verbose):
+    """Scrape CODIS allele tables and write them to "<base_fname>.dat".
+
+    For every locus in CODIS_loci (restricted to `locus_list` when that list
+    is non-empty) the page "<base_url>/str_<locus>.htm" is fetched with
+    get_html().  Each allele id / repeat structure pair found in the page's
+    table rows is written as one tab-separated line:
+    locus name, allele id, repeat structure.
+
+    `base_dname` and `verbose` are part of the interface but are not used by
+    this function.
+    """
+    # CODIS database base URL
+    base_url = "http://www.cstl.nist.gov/biotech/strbase"
+
+    # Refer to Python's regular expression at https://docs.python.org/2/library/re.html
+    #   <td width="16%" align="center"><font size="4">47.2 </font> </td>
+    allele_re = re.compile('>(\d+\.?\d?\"?\'*\(?\d*\.?\d?\"?\'*\)?\*?)</')
+    #   <td width="35%"><font size="2">[TTTC]<sub>4</sub>TTTT TT<span style="mso-spacerun: yes"> </span>[CTTT]<sub>14</sub>[CTGT]<sub>3</sub>[CTTT]<sub>14 </sub>[CTTC]<sub>4</sub>[CTTT]<sub>3</sub>CTCC[TTCC]<sub>4</sub></font> </td>
+    repeat_re = re.compile('^(\[[ACGT]+\]\d+|[ACGT]+|\s)+$')
+    # Remove extra tags
+    tag_re = re.compile('(<[^>]*>)')
+    # These two must match the HTML *entities*.  Matching a literal space and a
+    # literal double quote instead leaves "&nbsp;" in the text (so neither
+    # pattern above can match such cells) and rewrites every attribute quote.
+    nbsp_re = re.compile('&nbsp;')
+    quot_re = re.compile('&quot;')
+
+    # The context manager closes the output file even if a page fails to parse.
+    with open(base_fname + ".dat", 'w') as codis_data_file:
+        for locus_name in CODIS_loci:
+            if len(locus_list) > 0 and locus_name not in locus_list:
+                continue
+            url = "%s/str_%s.htm" % (base_url, locus_name)
+
+            # Glue continuation lines onto the preceding line so that every
+            # entry starts with "<t..." or "</tr" (except possibly the first).
+            content = []
+            for line in get_html(url).split("\r\n"):
+                line = line.strip()
+                if line.startswith("<t") or \
+                   line.startswith("</tr") or \
+                   len(content) == 0:
+                    content.append(line)
+                else:
+                    content[-1] += line
+
+            alleles = []
+            idx = 0
+            while idx < len(content):
+                if not content[idx].startswith("<tr"):
+                    idx += 1
+                    continue
+
+                # The entry right after a "<tr" is expected to hold the allele id.
+                idx += 1
+                if idx >= len(content):
+                    break
+                line = content[idx]
+                line = nbsp_re.sub('', line)
+                line = quot_re.sub("''", line)
+                line = line.replace(' ', '')
+                allele_match = allele_re.search(line)
+                if not allele_match:
+                    # Re-examine this entry: it may itself start a new row.
+                    continue
+                allele_id = allele_match.group(1)
+
+                # Scan the following "<td" cells for the repeat structure.
+                idx += 1
+                repeat_match = None
+                while idx < len(content):
+                    line = content[idx]
+                    if not line.startswith("<td"):
+                        break
+                    line = tag_re.sub('', line)
+                    line = nbsp_re.sub('', line)
+                    repeat_match = repeat_re.search(line)
+                    if repeat_match:
+                        break
+                    idx += 1
+
+                if not repeat_match:
+                    continue
+
+                # `line` is the tag-stripped cell text that matched repeat_re.
+                alleles.append([allele_id, line])
+
+            for allele_id, repeat_st in alleles:
+                codis_data_file.write("%s\t%s\t%s\n" % (locus_name, allele_id, repeat_st))
+
+
+"""
+"""
+if __name__ == '__main__':
+    # Command-line entry point: parse the options, split --base into its
+    # directory and file-name parts, and pass everything to download_codis().
+    # The description and the --locus-list help used to be copy-pasted from
+    # another script / option and did not describe what this script does.
+    parser = ArgumentParser(
+        description="Download CODIS allele data for DNA Fingerprinting loci")
+    parser.add_argument("-b", "--base",
+                        dest="base_fname",
+                        type=str,
+                        default="codis",
+                        help="base filename (default: codis)")
+    parser.add_argument("--locus-list",
+                        dest="locus_list",
+                        type=str,
+                        default="",
+                        help="comma-separated list of loci to download (default: empty, meaning all CODIS loci)")
+    parser.add_argument("-v", "--verbose",
+                        dest="verbose",
+                        action="store_true",
+                        help="also print some statistics to stderr")
+
+    args = parser.parse_args()
+
+    # Everything before the last '/' is the directory, the rest is the file
+    # name; without a '/' the directory part is the empty string.
+    base_dname, _, base_fname = args.base_fname.rpartition('/')
+
+    if args.locus_list != "":
+        locus_list = args.locus_list.split(',')
+    else:
+        locus_list = []
+
+    download_codis(base_dname,
+                   base_fname,
+                   locus_list,
+                   args.verbose)
+
diff --git a/hisatgenotype_modules/hisatgenotype_extract_cyp_data.py b/hisatgenotype_modules/hisatgenotype_extract_cyp_data.py
new file mode 100755
index 0000000..b0b4d03
--- /dev/null
+++ b/hisatgenotype_modules/hisatgenotype_extract_cyp_data.py
@@ -0,0 +1,1061 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2016, Raymon Cao <rcao5 at jhu.edu> and Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import os, sys, subprocess, re
+import inspect, operator
+import glob
+from argparse import ArgumentParser, FileType
+
+
+# CYP genes iterated over by build_msf_files(); each name is substituted into
+# the "cyp_fasta/<name>.fasta" and "cyp_var_files/<name>.var" paths.
+# NOTE: a `global` statement at module level has no effect.
+# NOTE(review): 'cyp2S1' is the only entry containing an upper-case letter --
+# confirm it matches the file names on disk.
+global gene_names
+gene_names = ['cyp1a1','cyp1a2','cyp1b1','cyp2a6',
+              'cyp2a13','cyp2b6','cyp2c8','cyp2c9',
+              'cyp2c19','cyp2d6','cyp2e1','cyp2f1',
+              'cyp2j2','cyp2r1','cyp2S1','cyp2w1',
+              'cyp3a4','cyp3a5','cyp3a7','cyp3a43',
+              'cyp4a11','cyp4a22','cyp4b1','cyp4f2',
+              'cyp5a1','cyp8a1','cyp19a1','cyp21a2',
+              'cyp26a1']
+
+"""
+Download variant information from website database
+"""
+
+def get_html(url):
+    """Fetch `url` with the external `wget` program and return the response body.
+
+    wget is asked to write the document to stdout ("-O -"); its stderr output
+    is discarded.  Whatever wget wrote to stdout is returned as one string,
+    which is empty when wget produced no output.
+    """
+    download_cmd = ["wget",
+                    "-O", "-",
+                    url]
+    # Use a context manager so the null-device handle is closed, and
+    # communicate() so the child is waited for and its output is read in one
+    # go instead of being concatenated line by line.
+    with open(os.devnull, 'w') as devnull:
+        proc = subprocess.Popen(download_cmd,
+                                stdout=subprocess.PIPE,
+                                stderr=devnull)
+        output, _ = proc.communicate()
+
+    return output
+
+
+def _cyp_table_chunks(cyp_output, cyp_gene_name):
+    """Return the "<tr style=" chunks from the first to the last one that
+    mentions "<GENE>*" (case-insensitive); an empty list if none does."""
+    marker = (cyp_gene_name + '*').upper()
+    chunks = cyp_output.split("<tr style=")
+    hits = [i for i, chunk in enumerate(chunks) if marker in chunk.upper()]
+    if not hits:
+        return []
+    # Slice with absolute indices: a negative end index of -1 plus one would
+    # give chunks[:0] and silently drop every row when the last chunk matches.
+    return chunks[hits[0]:hits[-1] + 1]
+
+
+def download_CYP(verbose):
+    """Scrape allele variant tables from the CYP allele website.
+
+    The index page at base_url is fetched with get_html() and every
+    "http://www.cypalleles.ki.se/cyp*.htm" link found in it is visited once.
+    For each gene a file "cyp_var_files/<gene>.var" is written: the first line
+    holds the page URL and gene name, and each following line holds an allele
+    name, a tab, and a comma-separated list of variants (SNPs, deletions,
+    insertions, or 'None').
+
+    `verbose` is part of the interface but is not used by this function.
+    """
+    print("Downloading data from:")
+
+    # CYP database base URL
+    base_url = "http://www.cypalleles.ki.se"
+
+    # Refer to Python's regular expression at https://docs.python.org/2/library/re.html
+    cyp_re = re.compile('http://www.cypalleles.ki.se/cyp\w+.htm')
+    output = get_html(base_url)
+    # Original list had duplicate urls, removes duplicates
+    cyp_urls = set(cyp_re.findall(output))
+
+    # Loop-invariant patterns, compiled once.
+    gene_name_re = re.compile("cyp[\d\w]+", re.IGNORECASE)
+    tag_re = re.compile('<[^>]+>')
+    # In HTML text the SNP arrow is the entity "&gt;"; a bare ">" is accepted
+    # as well so already-decoded input still parses.
+    varInfo_re = re.compile('-?\d+[ACGT](?:&gt;|>)[ACGT]|-?\d+_?-?\d+?del[ACGT]+|-?\d+_?-?\d+?ins[ACGT]+|None')
+
+    # Create the output directory directly instead of shelling out to mkdir.
+    if not os.path.isdir('cyp_var_files'):
+        os.makedirs('cyp_var_files')
+
+    for cyp_url in cyp_urls:
+        cyp_gene_name = cyp_url.split('/')[-1].split('.')[0]
+
+        # Hardcoded for cyp21 database (has inconsistent url naming)
+        if cyp_gene_name.lower() == "cyp21":
+            cyp_gene_name = cyp_gene_name + "a2"
+
+        # Changed to match all instances of "cyp"
+        if not gene_name_re.search(cyp_gene_name):
+            continue
+
+        # Hardcoded for special format for cyp21a2: variants are in column 1.
+        is_cyp21 = (cyp_url == 'http://www.cypalleles.ki.se/cyp21.htm')
+        allele_name_re = re.compile(cyp_gene_name.upper() + '\*[\w\d]+')
+
+        sys.stderr.write("%s %s\n" % (cyp_url, cyp_gene_name))
+
+        # The context manager closes the file on every path, including the
+        # early `continue` for an empty download.
+        with open("cyp_var_files/%s.var" % (cyp_gene_name), 'w') as cyp_file:
+            cyp_file.write("%s %s\n" % (cyp_url, cyp_gene_name))
+
+            cyp_output = get_html(cyp_url)
+            if cyp_output == "":
+                continue
+
+            for itemA in _cyp_table_chunks(cyp_output, cyp_gene_name):
+                tabRow = [cell.replace("\r\n", "") for cell in itemA.split("</td>")]
+
+                names = allele_name_re.findall(tabRow[0])
+                alleleName = names[0] if len(names) > 0 else None
+
+                # @RaymonFix - some databases have extra table, ignores headers (CYP2A6)
+                # @Daehwan - some databases (e.g. http://www.cypalleles.ki.se/cyp3a4.htm)
+                #            have 2 rows of Nucleotide changes (cDNA and Gene), might need
+                #            to look at all rows for snps
+                #
+                # @RaymonFix - look in 4th column for "Gene" nt changes first, then
+                #              consider cDNA if applicable; tags are stripped first
+                try:
+                    if is_cyp21:
+                        varInfo = varInfo_re.findall(tag_re.sub('', tabRow[1]))
+                    else:
+                        varInfo = varInfo_re.findall(tag_re.sub('', tabRow[3]))
+                        if len(varInfo) == 0:
+                            varInfo = varInfo_re.findall(tag_re.sub('', tabRow[2]))
+                except IndexError:
+                    # Row has too few cells to be an allele row.
+                    continue
+
+                # Decode the entity so the file stores "123A>G".
+                varInfo = [var.replace('&gt;', '>') for var in varInfo]
+
+                # 'None' is only meaningful on its own; drop it when real
+                # variants are listed alongside it.
+                if 'None' in varInfo and len(varInfo) != 1:
+                    varInfo = [var for var in varInfo if var != 'None']
+
+                if alleleName is not None:
+                    cyp_file.write(str(alleleName) + "\t" + ','.join(varInfo) + "\n")
+
+         
+"""
+Make MSF files from variants
+"""
+
+def checkNTloc(fasta_fileName,var_fileName,gene_name):
+    """Estimate the offsets that map database variant positions onto the FASTA sequence.
+
+    The sequence is read from `fasta_fileName` (lines starting with '>' are
+    skipped) and the allele variants from `var_fileName` via makeVarDict().
+    Every SNP and deletion contributes "position->reference base" entries;
+    insertions and 'None' contribute nothing.  Each candidate offset in
+    [-len(seq), len(seq)) is scored by how many positive-position entries
+    match seq[pos + offset]; the best-scoring offset is kept for positive
+    positions, and that offset + 1 is then scored for negative positions.
+    Progress and scores are printed to stdout.
+
+    Returns a 5-tuple: (oSetPos, oSetNeg, positive match fraction, negative
+    match fraction, overall match fraction).  A fraction is reported as 1.0
+    when the corresponding set of variants is empty.
+    """
+    print "\nGene: %s" % gene_name
+    seq = ""
+    for line in open(fasta_fileName,'r'):
+        if line[0] == '>':
+            continue
+        seq += line.strip()
+
+    cyp_var_file = open(var_fileName,'r')
+    cyp_var_dict = makeVarDict(cyp_var_file)
+    cyp_var_file.close()
+
+    print "len:", len(seq)
+    # Sets of "<position>-><reference base>" strings, split by sign of position.
+    varsPos = set()
+    varsNeg = set()
+
+    for varList in cyp_var_dict.values():
+        for var in varList:
+            if ">" in var: # is SNP
+                # The last three characters are "<ref>><alt>"; the rest is the position.
+                posNt = int(var[:-3])
+                ntChange = var[-3:].replace('>','')
+                assert len(ntChange) == 2
+                for nt in ntChange:
+                    assert nt in "ACGT"
+
+                if posNt > 0:
+                    varsPos.add(str(posNt) + '->' + ntChange[0])
+                else:
+                    assert posNt < 0
+                    varsNeg.add(str(posNt) + '->' + ntChange[0])
+                    
+            elif "del" in var: # is deletion
+                # "<pos>del<bases>" or "<start>_<end>del<bases>"
+                posNt = var.split('del')[0].split('_')
+                posNt = [int(p) for p in posNt]
+                ntDel = var.split('del')[1]
+                for nt in ntDel:
+                    assert nt in "ACGT"
+
+                if len(posNt) == 1: # single nt deletion
+                    assert len(ntDel) == 1
+                    if posNt[0] > 0:
+                        varsPos.add(str(posNt[0]) + '->' + ntDel)
+                    else:
+                        assert posNt[0] < 0
+                        varsNeg.add(str(posNt[0]) + '->' + ntDel)
+
+                else: # mutliple nt deletion
+                    assert len(posNt) == 2
+                    try:
+                        assert posNt[1] - posNt[0] + 1 == len(ntDel)
+                    except AssertionError:
+                        print "Incorrect deletion format: %s , skipping variation" % (var)
+                        '''sys.exit(1)'''
+                        continue
+                    # One entry per deleted base, consumed left to right.
+                    ntDelList = list(ntDel)
+                    for i in range(posNt[0],posNt[1] + 1):
+                        if i > 0:
+                            varsPos.add(str(i) + '->' + ntDelList.pop(0))
+                        else:
+                            assert i < 0
+                            varsNeg.add(str(i) + '->' + ntDelList.pop(0))
+                    assert len(ntDelList) == 0
+                    
+            else:
+                assert ("ins" in var) or ("None" in var)
+                continue
+    
+    scorePos = {} # { position offset : number of alignments } for positive positions
+    for i in range(-len(seq), len(seq)):
+        align_score = 0
+        for var in varsPos:
+            pos, base = var.split('->')
+            pos = int(pos)
+            
+            # NOTE(review): only indices past either end raise IndexError; a
+            # moderately negative pos+i indexes from the end of seq instead --
+            # confirm this wrap-around is intended.
+            try:
+                seq[pos+i]
+            except IndexError:
+                continue
+            
+            if seq[pos+i] == base:
+                align_score += 1
+
+        scorePos[i] = align_score
+    # Offset with the highest score; ties are resolved by dict iteration order.
+    oSetPos = max(scorePos.iteritems(), key=operator.itemgetter(1))[0]
+    print "Positive postitions offset: %d" % oSetPos
+    print "Score: %d out of %d\n" % (scorePos[oSetPos], len(varsPos))
+    
+
+    print "Checking negative position offset: %d" % (oSetPos + 1)
+    align_score = 0
+    # Negative positions are only checked against one offset: oSetPos + 1.
+    oSetNeg = oSetPos + 1
+    for var in varsNeg:
+        pos, base = var.split('->')
+        pos = int(pos)
+        
+        try:
+            seq[pos + oSetNeg]
+        except IndexError:
+            continue
+        
+        if seq[pos + oSetNeg] == base:
+            align_score += 1
+    print "Score: %d out of %d\n\n" % (align_score, len(varsNeg))
+
+    # The four branches differ only in substituting 1.0 for a fraction whose
+    # denominator would be zero.
+    if len(varsNeg) == 0 and len(varsPos) != 0:
+        return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), 1.0, float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg))
+    elif len(varsNeg) != 0 and len(varsPos) == 0:
+        return oSetPos, oSetNeg, 1.0, float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg))
+    elif len(varsNeg) == 0 and len(varsPos) == 0:
+        return oSetPos, oSetNeg, 1.0, 1.0, 1.0
+    else:
+        assert len(varsNeg) != 0 and len(varsPos) != 0
+        return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg))
+        
+
+def create_map(seq):
+    """Map each base's ordinal (0-based, '.' columns not counted) to its
+    string index in the gapped sequence `seq`.
+
+    Every character other than '.' must be one of A/C/G/T (any case).
+    """
+    base_columns = {}
+    for column, bp in enumerate(seq):
+        if bp != '.':
+            assert bp.upper() in "ACGT"
+            # The next ordinal equals the number of bases recorded so far.
+            base_columns[len(base_columns)] = column
+    return base_columns
+
+def splitString(someStr,posList):
+    """Cut `someStr` after each index in `posList` and return the pieces.
+
+    `posList` is modified in place: -1 is inserted at the front and
+    len(someStr) - 1 is appended, so afterwards it holds one boundary more
+    than the number of returned pieces.
+    """
+    posList.insert(0,-1)
+    posList.append(len(someStr) - 1)
+    # Piece k spans (posList[k], posList[k+1]] in string indices.
+    return [someStr[lo + 1:hi + 1]
+            for lo, hi in zip(posList[:-1], posList[1:])]
+
+def extractSeq(faFile):
+    """Return the concatenated, whitespace-stripped lines of `faFile`,
+    leaving out every line that starts with '>'."""
+    return "".join(line.strip()
+                   for line in faFile
+                   if not line.startswith(">"))
+
+def makeVarDict(fname):
+    """Parse a .var file into { ALLELE NAME (upper-cased) : set of variants }.
+
+    `fname` is an iterable of lines (e.g. an open file).  The first line is
+    skipped; each remaining line is "<allele>\\t<var>,<var>,..." and must start
+    with "CYP" (case-insensitive).  A line without a variant column is
+    ignored.  If no line contains 'None', the first allele line is taken to be
+    the reference allele and gets the variant set {'None'} regardless of its
+    own variant column.  When an allele name occurs more than once a warning
+    is printed to stdout and the variant sets are merged.
+    """
+    allLines = [line.strip() for line in fname]
+    records = allLines[1:]
+
+    ref_al_id_present = any('None' in line for line in records)
+
+    alleleVarDict = {}
+    for line_num, line in enumerate(records, 1):
+        assert line.upper().startswith("CYP")
+        fields = line.split("\t")
+        alleleName = fields[0].upper()
+
+        if (not ref_al_id_present) and line_num == 1:
+            varList = ['None']
+        elif len(fields) > 1:
+            varList = fields[1].split(',')
+        else:
+            continue
+
+        # Explicit membership test rather than assert + bare except: the merge
+        # must also happen when asserts are stripped (python -O), and unrelated
+        # errors must not be swallowed.
+        if alleleName in alleleVarDict:
+            sys.stdout.write("Warning, %s allele is already represented\n" % alleleName)
+            alleleVarDict[alleleName] |= set(varList)
+        else:
+            alleleVarDict[alleleName] = set(varList)
+
+    return alleleVarDict
+
+def makeSnp(oldSeq, pos, oldNt, newNt):
+    """Return `oldSeq` with the base at index `pos` replaced by `newNt`.
+
+    Asserts that the current base equals `oldNt` and that the replacement
+    leaves the sequence length unchanged.
+    """
+    assert oldSeq[pos] == oldNt
+    head, tail = oldSeq[:pos], oldSeq[pos+1:]
+    mutated = ''.join([head, newNt, tail])
+    assert len(mutated) == len(oldSeq)
+    return mutated
+
+def makeDel(oldSeq, left, right, toDel):
+    """Return `oldSeq` with indices `left`..`right` (inclusive) replaced by '.'.
+
+    Asserts that the span length matches `toDel`, that the span currently
+    reads `toDel`, and that the overall length is unchanged.
+    """
+    span = len(toDel)
+    assert right - left + 1 == span
+    assert oldSeq[left:right + 1] == toDel
+    masked = ''.join([oldSeq[:left], '.' * span, oldSeq[right + 1:]])
+    assert len(masked) == len(oldSeq)
+    return masked
+    
+def makeIns(oldSeq,left,right,toIns):
+    """Write `toIns` into the run of '.' strictly between indices `left` and `right`.
+
+    Asserts that the run is at least as long as `toIns` and consists only of
+    '.'; any unused columns of the run stay '.' so the length is unchanged.
+    """
+    gap_width = right - left - 1
+    assert gap_width >= len(toIns)
+    assert all(ch == '.' for ch in oldSeq[left + 1:right])
+    filled = toIns + '.' * (gap_width - len(toIns))
+    result = ''.join([oldSeq[:left + 1], filled, oldSeq[right:]])
+    assert len(result) == len(oldSeq)
+    return result
+    
+
+def makeMSF(gene_name, oSetPos, oSetNeg):
+    """Write "cyp_msf/<GENE>_gen.msf" for one gene from its variant list.
+
+    Inputs read: "cyp_var_files/<gene_name>.var" (via makeVarDict),
+    "cyp_fasta/<gene_name>.fasta" (via extractSeq) and, if it can be opened,
+    "cyp_blast_alignment/<gene_name>_blast.align" (via extract_var_from_blast),
+    whose variants are added as allele "<GENE>*REFGRCH38P7".
+
+    `oSetPos` / `oSetNeg` are added to positive / non-positive database
+    positions to turn them into indices into the FASTA sequence; alleles whose
+    name contains 'GRCH38' are used without an offset.
+
+    A backbone is built from the FASTA sequence with a run of '.' after each
+    insertion position (as long as the longest insertion there); each allele's
+    row is derived from it by applying its SNPs, deletions and insertions.
+    Variants that are malformed, out of bounds, or inconsistent with the
+    sequence are reported on stdout and skipped.  Returns None; nothing is
+    written when the variant file holds fewer than two alleles.
+    """
+    cyp_var_file = open("cyp_var_files/%s.var" % gene_name,'r')
+    cyp_var_dict = makeVarDict(cyp_var_file)
+    cyp_var_file.close()
+
+    if len(cyp_var_dict) < 2:
+        print('\tOnly reference allele included, skipping gene')
+        return
+
+    try:
+        blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name)
+        if len(blast_allele_var) > 0:
+            cyp_var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var)
+    except IOError:
+        print('\t%s blast file was skipped.' % gene_name)
+
+    cyp_faFile = open("cyp_fasta/%s.fasta" % gene_name,'r')
+    cyp_seq = extractSeq(cyp_faFile)
+    cyp_faFile.close()
+    preBackbone_seq = ''
+    
+
+    # { allele name : gapped sequence row }
+    msfTable = {}
+
+    # Building backbone structure (augment length with insertions)
+    longestIns = {} # { key = position : value = length }
+    for allele,varList in cyp_var_dict.items():
+        for var in varList:
+            if not "ins" in var:
+                continue
+            pos = var.split('ins')[0].split('_')
+            pos = [int(p) for p in pos]
+            ntIns = var.split('ins')[1]
+            # Accepted forms: "<p>_<p+1>ins..." or "<p>ins...".
+            correctFormat = len(pos) == 2 and pos[1] - pos[0] == 1
+            if not correctFormat:
+                correctFormat = len(pos) == 1
+            try:
+                assert correctFormat
+            except:
+                print >> sys.stdout, "\tIncorrect format for insertion: variation %s on allele %s" % (var, allele)
+                continue
+
+            # convert to position in string
+            if not 'GRCH38' in allele:
+                if pos[0] > 0:
+                    pos = pos[0] + oSetPos
+                else:
+                    pos = pos[0] + oSetNeg
+            else:
+                pos = pos[0]
+                
+            # Make dictionary of longest insertions
+            if not pos in longestIns:
+                longestIns[pos] = len(ntIns)
+            else:
+                if len(ntIns) > longestIns[pos]:
+                    longestIns[pos] = len(ntIns)
+    
+    posInsList = sorted(longestIns.keys())
+    
+    # splitString() adds a sentinel at each end of posInsList in place; the
+    # slice below removes them again.
+    splitSeq = splitString(cyp_seq,posInsList)
+    posInsList = posInsList[1:-1]
+
+    # Append the gap run to the piece that ends at each insertion position.
+    for i in range(len(posInsList)):
+        splitSeq[i] += '.' * longestIns[posInsList[i]]
+
+    # Pieces that are empty or start with '.' are left out of the backbone.
+    for subseq in splitSeq:
+        try:
+            assert len(subseq) > 0 and not subseq.startswith('.')
+            preBackbone_seq += subseq
+        except:
+            continue
+    # pre-backbone built
+
+
+    map_cyp = create_map(preBackbone_seq) # { Index of bp in original seq : Actual index in string }
+    
+
+    for allele,varList in cyp_var_dict.items():
+        for var in varList:
+            isSnp = False
+            isDel = False
+            isIns = False
+        
+            # NOTE: isRef is only assigned in the final else branch below; it
+            # is read only in the matching else branch further down.
+            if ">" in var:
+                isSnp = True
+            elif "del" in var:
+                isDel = True
+            elif "ins" in var:
+                isIns = True
+            else:
+                assert("None" in var)
+                isRef = True
+
+            if isSnp:
+                # The last three characters are "<ref>><alt>"; the rest is the position.
+                pos = int(var[:-3])
+                dbPos = pos
+                ntChange = var[-3:].replace('>','')
+                assert len(ntChange) == 2
+                for nt in ntChange:
+                    assert nt in "ACGT"
+
+                if not 'GRCH38' in allele:
+                    if pos > 0:
+                        pos = pos + oSetPos
+                    else:
+                        pos = pos + oSetNeg
+
+                if pos < 0 or pos > len(cyp_seq) - 1:
+                    print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos)
+                    print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
+                    continue
+                    
+                try:
+                    assert(preBackbone_seq[map_cyp[pos]] == ntChange[0]) # nt at pos in seq must match database
+                except:
+                    print >> sys.stdout, "\tWarning: position %d in sequence contains %s, but expected %s from database" % (dbPos, preBackbone_seq[map_cyp[pos]], ntChange[0])
+                    print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
+                    continue
+                
+                # Adding to msf table
+                if not allele in msfTable:
+                    msfTable[allele] = makeSnp(preBackbone_seq, map_cyp[pos], ntChange[0], ntChange[1])
+                else:
+                    msfTable[allele] = makeSnp(msfTable[allele], map_cyp[pos], ntChange[0], ntChange[1])
+                    
+            elif isDel:
+                pos = var.split('del')[0].split('_')
+                pos = [int(p) for p in pos]
+                if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707])  
+                    pos.append(pos[0])
+                assert len(pos) == 2
+                # NOTE(review): dbPos is the same list object as pos, so after
+                # the in-place offset adjustment below the messages that use
+                # dbPos[i] show shifted values, not the database positions.
+                dbPos = pos
+                ntDel = var.split('del')[1]
+                for nt in ntDel:
+                    assert nt in "ACGT"
+
+                if not 'GRCH38' in allele:
+                    for i in range(len(pos)):
+                        if pos[i] > 0:
+                            pos[i] = pos[i] + oSetPos
+                        else:
+                            pos[i] = pos[i] + oSetNeg
+
+                skipDel = False
+                for i in range(len(pos)):
+                    if pos[i] < 0 or pos[i] > len(cyp_seq) - 1:
+                        print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos[i])
+                        print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
+                        skipDel = True
+
+                if skipDel:
+                    continue
+                        
+            
+                try:
+                    assert pos[1] - pos[0] + 1 == len(ntDel)
+                except:
+                    print >> sys.stdout, "\tIncorrect deletion data with %s on allele %s. Skipping variation." % (var, allele)
+                    continue
+                            
+                try:
+                    assert preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ] == ntDel
+                except:
+                    print >> sys.stdout, "\tWarning, positions %d to %d in sequence contains %s, but expected %s from database" % \
+                          (dbPos[0], dbPos[1], preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ], ntDel)
+                    print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
+                    continue
+
+
+                # Adding to msf table
+                if not allele in msfTable:
+                    msfTable[allele] = makeDel(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntDel)
+                else:
+                    msfTable[allele] = makeDel(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntDel)
+
+                        
+            elif isIns:
+                pos = var.split('ins')[0].split('_')
+                pos = [int(p) for p in pos]
+                if len(pos) == 1:
+                    pos.append(pos[0] + 1)
+                assert len(pos) == 2
+                # NOTE(review): same aliasing as in the deletion branch --
+                # dbPos and pos are one list.
+                dbPos = pos
+                try:
+                    assert pos[1] - pos[0] == 1
+                except AssertionError:
+                    print >> sys.stdout, "\tIncorrect insertion data with %s on allele %s. Skipping variation." % (var, allele)
+                    continue 
+                ntIns = var.split('ins')[1]
+                for nt in ntIns:
+                    assert nt in "ACGT"
+
+                if not 'GRCH38' in allele:
+                    for i in range(len(pos)):
+                        if pos[i] > 0:
+                            pos[i] = pos[i] + oSetPos
+                        else:
+                            pos[i] = pos[i] + oSetNeg
+
+                skipIns = False
+                for i in range(len(pos)):
+                    if pos[i] < 0 or pos[i] > len(cyp_seq) - 1:
+                        print >> sys.stdout, "Warning: position %d out of bounds" % (dbPos[i])
+                        print >> sys.stdout, "\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
+                        skipIns = True
+
+                if skipIns:
+                    continue
+
+
+                # Adding to msf table
+                if not allele in msfTable:
+                    msfTable[allele] = makeIns(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntIns)
+                else:
+                    msfTable[allele] = makeIns(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntIns)
+
+
+            else:
+                # 'None': the allele row is the unmodified backbone.
+                assert isRef
+                assert not allele in msfTable
+                msfTable[allele] = preBackbone_seq
+
+    # Sanity checking
+    seq_len = 0
+    for allele, msf_seq in msfTable.items():
+        if seq_len == 0:
+            seq_len = len(msf_seq)
+        else:
+            assert seq_len == len(msf_seq)
+    assert seq_len > 0
+
+    # Follow MSF style of IMGT/HLA database
+    # The leading "CYP" is dropped from the file name and from allele names.
+    # Layout: blocks of 50 columns; within a block one line per allele with
+    # the name right-aligned in 12 characters, a tab, then groups of 10
+    # columns separated by single spaces; a blank line ends each block.
+    msfFile = open('cyp_msf/%s_gen.msf' % gene_name[3:].upper(),'w')
+    for i in range(0, seq_len, 50):
+        for allele, msf_seq in msfTable.items():
+            output = "%12s" % allele[3:].upper()
+            for j in range(i, i+50, 10):
+                if j >= seq_len:
+                    break
+                if j == i:
+                    output += "\t"
+                else:
+                    output += " "
+                output += msf_seq[j:j+10]
+            print >> msfFile, output
+        print >> msfFile
+
+    msfFile.close()
+
+
+def build_msf_files():
+    """Build an MSF file for every gene in gene_names.
+
+    For each gene, checkNTloc() estimates the position offsets between the
+    variant file "cyp_var_files/<gene>.var" and the sequence
+    "cyp_fasta/<gene>.fasta".  Genes whose overall match fraction is below
+    0.95 are skipped; for the others makeMSF() is called with the offsets.
+    Output goes to the "cyp_msf" directory, which is created if missing.
+    """
+    # Create the directory directly instead of shelling out to `mkdir`, which
+    # depends on an external program and complains when the directory exists.
+    if not os.path.isdir('cyp_msf'):
+        os.makedirs('cyp_msf')
+
+    print('\nBuilding MSF files:')
+    for gene_name in gene_names:
+        fasta_fname = "cyp_fasta/%s.fasta" % gene_name
+        var_fname = "cyp_var_files/%s.var" % gene_name
+        # Only the two offsets and the overall score are needed here.
+        oSetPos, oSetNeg, _, _, tot_score = checkNTloc(fasta_fname, var_fname, gene_name)
+        if not (tot_score >= 0.95):
+            print("\tLess than 95% match, skipping gene.")
+            continue
+
+        makeMSF(gene_name, oSetPos, oSetNeg)
+
+
+'''
+Check MSF files against variants files
+'''
+
+# Module-level list, initially empty.  NOTE: a `global` statement at module
+# level has no effect.
+# NOTE(review): presumably filled by the MSF checking code further down --
+# confirm against its users.
+global incorrect_msf_entries
+incorrect_msf_entries = []
+
+def create_inv_map(seq):
+    """Map each non-gap string index of the gapped sequence `seq` to the
+    base's ordinal (0-based, '.' columns not counted).
+
+    Every character other than '.' must be one of A/C/G/T (any case).
+    """
+    column_to_base = {}
+    for column, bp in enumerate(seq):
+        if bp != '.':
+            assert bp.upper() in "ACGT"
+            # The next ordinal equals the number of bases recorded so far.
+            column_to_base[column] = len(column_to_base)
+    return column_to_base
+
+def readMSF(msf_fname): # { Allele name : MSF sequence }
+    """Read MSF lines into { 'CYP' + allele name : full sequence }.
+
+    `msf_fname` is an iterable of lines.  Each line is stripped and all spaces
+    are removed; empty lines are skipped.  What precedes the first tab is the
+    allele name, what follows it is a sequence fragment; fragments of the
+    same allele are concatenated in the order they appear.
+    """
+    sequences = {}
+    for raw_line in msf_fname:
+        compact = raw_line.strip().replace(' ','')
+        if not compact:
+            continue
+        fields = compact.split('\t')
+        key = 'CYP' + fields[0]
+        sequences[key] = sequences.get(key, '') + fields[1]
+    return sequences
+
+def msf_removeIns(ref_seq, al_seq):
+    """Drop the insertion columns (where `ref_seq` has '.') from both rows.
+
+    Returns (ref_seq without '.', al_seq without those columns).  The columns
+    are first marked with '-' in the allele row and then every '-' is removed.
+    Both inputs must have equal length, and so must both results.
+    """
+    assert len(ref_seq) == len(al_seq)
+    gap_columns = set(col for col, ch in enumerate(ref_seq) if ch == '.')
+
+    stripped_ref = ref_seq.replace('.','')
+    marked_al = ''.join('-' if col in gap_columns else ch
+                        for col, ch in enumerate(al_seq))
+    stripped_al = marked_al.replace('-','')
+
+    assert len(stripped_ref) == len(stripped_al)
+    return stripped_ref, stripped_al
+
+def msfToVarList(ref_seq, al_seq):
+    """Derive the variant strings that turn the reference row into the allele row.
+
+    Both rows are equal-length gapped sequences over "ACGT.".  The result
+    lists, in this order:
+      * insertions "<p>_<p+1>ins<bases>" for allele bases sitting in a run of
+        '.' of the reference (p is the ordinal of the base before the run),
+      * SNPs "<i><ref>><alt>" (i indexes the reference without gap columns),
+      * deletions "<s>del<base>" or "<s>_<e>del<bases>" for runs of '.' in the
+        allele row after the insertion columns were removed.
+    Only runs of '.' that directly follow a base are detected.
+    """
+    assert len(ref_seq) == len(al_seq)
+    assert all(bp in "ACGT." for bp in ref_seq)
+    assert all(bp in "ACGT." for bp in al_seq)
+
+    variants = []
+    column_to_base = create_inv_map(ref_seq)
+    # A base followed by one or more gap characters.
+    gap_run_re = re.compile('[ACGT]\.+')
+
+    # Insertions: gap runs in the reference row.
+    for match in gap_run_re.finditer(ref_seq):
+        anchor = match.start(0)
+        run_len = match.end(0) - anchor - 1
+        inserted = al_seq[anchor + 1: anchor + run_len + 1].replace('.','')
+        if len(inserted) == 0:
+            continue
+        base_no = column_to_base[anchor]
+        variants.append(str(base_no) + '_' + str(base_no + 1) + 'ins' + inserted)
+
+    flat_ref, flat_al = msf_removeIns(ref_seq, al_seq)
+
+    # SNPs: differing, non-deleted columns.
+    for idx in range(len(flat_ref)):
+        al_nt = flat_al[idx]
+        if al_nt != '.' and al_nt != flat_ref[idx]:
+            variants.append(str(idx) + flat_ref[idx] + '>' + al_nt)
+
+    # Deletions: gap runs in the allele row.
+    for match in gap_run_re.finditer(flat_al):
+        anchor = match.start(0)
+        run_len = match.end(0) - anchor - 1
+        removed = flat_ref[anchor + 1 : anchor + run_len + 1]
+        if run_len == 1:
+            assert len(removed) == 1
+            variants.append(str(anchor + 1) + 'del' + removed)
+        else:
+            variants.append(str(anchor + 1) + '_' + str(anchor + run_len) + 'del' + removed)
+
+    return variants
+
+def checkMSFfile(gene_name, msf_fname, var_fname, fasta_filename): # cross-check one gene's MSF alignment against its .var file and FASTA sequence; prints a report, returns nothing
+    oSetPos, oSetNeg, oSet_pos_score, oSet_neg_score, tot_score = checkNTloc(fasta_filename, var_fname, gene_name) # only oSetPos and oSetNeg are used below
+    
+    try:
+        msf_file = open(msf_fname,'r')
+        msf_dict = readMSF(msf_file) # { Allele name : MSF sequence }
+        msf_file.close()
+    except IOError: # MSF file missing or unreadable: skip this gene
+        print("\t%s msf file was skipped.\n" % (gene_name))
+        return
+
+    var_file = open(var_fname,'r')
+    var_dict = makeVarDict(var_file) # used below as { allele name : collection of variant strings }
+    var_file.close()
+
+    try:
+        blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name)
+        if len(blast_allele_var) > 0: # register the blast-derived variants as an extra allele
+            var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var)
+    except IOError:
+        print('\t%s blast file was skipped.' % gene_name)
+    
+    fa_file = open(fasta_filename,'r')
+    oriSeq = extractSeq(fa_file) # indexed and sliced below as the gene's sequence string
+    fa_file.close()
+
+
+    # Find reference allele
+    ref_allele = ''
+    for allele_name in var_dict.keys():
+        if len(var_dict[allele_name]) == 1 and list(var_dict[allele_name])[0] == "None": # the reference is the allele whose only variant is "None"
+            assert ref_allele == '' # more than one such allele is an error
+            ref_allele = allele_name
+    assert not ref_allele == ''
+
+
+    # Check if ref allele seq in msf matches fasta
+    assert ref_allele in msf_dict
+
+    try:
+        assert msf_dict[ref_allele].replace('.','') == oriSeq
+        print("Sequences match for reference allele %s" % ref_allele)
+    except AssertionError:
+        print("Warning: sequences do not match for reference allele %s" % ref_allele)
+        sys.exit(1) # fatal despite the "Warning" wording
+
+
+    # Check all alleles are included
+    try:
+        assert set([k.upper() for k in msf_dict.keys()]).issubset(set([k.upper() for k in var_dict.keys()])) # case-insensitive: every MSF allele must have a .var entry
+    except AssertionError:
+        print("Extra alleles in MSF!\n")
+        print(sorted(msf_dict.keys()))
+        print("\n\n")
+        print(sorted(var_dict.keys()))
+        sys.exit(1)
+
+
+    # Convert from database positions to sequence positions (using offset)
+    for allele, var_list in var_dict.items():
+        oSet_var_list = []
+        for var in var_list:
+            if '>' in var: # snp
+                pos = int(var.split('>')[0][:-1])
+                ntSnp = [var.split('>')[0][-1]] # [reference base, alternative base]
+                ntSnp.append(var.split('>')[1])
+                assert len(ntSnp) == 2
+                if not 'GRCH38' in allele: # alleles with 'GRCH38' in their name keep their positions unshifted
+                    if pos > 0: # positive and non-positive positions use different offsets
+                        pos = pos + oSetPos
+                    else:
+                        pos = pos + oSetNeg
+
+                if pos < 0 or pos > len(oriSeq) - 1: # out of bounds
+                    continue
+                if oriSeq[pos] != ntSnp[0]: # mismatch
+                    print('\tMismatch on variation %s' % var)
+                    continue
+
+                oSet_var = str(pos) + ntSnp[0] + '>' + ntSnp[1]
+                oSet_var_list.append(oSet_var)
+
+            elif 'del' in var: # deletion
+                pos = var.split('del')[0].split('_')
+                pos = [int(p) for p in pos]
+                if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707])  
+                    pos.append(pos[0])
+                assert len(pos) == 2
+                ntDel = var.split('del')[1]
+                for nt in ntDel:
+                    assert nt in "ACGT"
+
+                skipDel = False
+                if not 'GRCH38' in allele:
+                    for i in range(len(pos)):
+                        if pos[i] > 0:
+                            pos[i] = pos[i] + oSetPos
+                        else:
+                            pos[i] = pos[i] + oSetNeg
+                        if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds
+                            skipDel = True
+                if (oriSeq[ pos[0] : pos[1] + 1 ] != ntDel): # mismatch; NOTE(review): tested before skipDel, so an out-of-bounds deletion can be reported as a mismatch
+                    print('\tMismatch on variation %s' % var)
+                    continue
+
+                if skipDel:
+                    continue
+
+                assert pos[1] - pos[0] + 1 == len(ntDel)
+
+                oSet_var = 'del' + ntDel
+                if pos[0] == pos[1]: # single-base deletions are written without a range
+                    oSet_var = str(pos[0]) + oSet_var
+                else:
+                    oSet_var = str(pos[0]) + '_' + str(pos[1]) + oSet_var
+
+                oSet_var_list.append(oSet_var)                        
+
+            elif 'ins' in var: # insertion
+                pos = var.split('ins')[0].split('_')
+                pos = [int(p) for p in pos]
+                if len(pos) == 1: # a single position is expanded to the adjacent pair [p, p + 1]
+                    pos.append(pos[0] + 1)
+                assert len(pos) == 2
+                try:
+                    assert pos[1] - pos[0] == 1
+                except AssertionError:
+                    print('\tIncorrect insertion format on variation %s' % var)
+                    continue
+                ntIns = var.split('ins')[1]
+                for nt in ntIns:
+                    assert nt in "ACGT"
+
+                skipIns = False
+                if not 'GRCH38' in allele:
+                    for i in range(len(pos)):
+                        if pos[i] > 0:
+                            pos[i] = pos[i] + oSetPos
+                        else:
+                            pos[i] = pos[i] + oSetNeg
+                        if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds
+                            skipIns = True
+
+                if skipIns:
+                    continue
+
+                oSet_var = str(pos[0]) + '_' + str(pos[1]) + 'ins' + ntIns
+                oSet_var_list.append(oSet_var)
+
+            else: # only the reference allele's single "None" entry may reach this branch
+                assert allele == ref_allele
+                assert var == 'None'
+                assert len(oSet_var_list) == 0
+                oSet_var_list.append('None')
+
+        var_dict[allele] = set(oSet_var_list) # replace the entry with the converted variants
+
+    # Check variants created from MSF file against variants list
+    num_correct_alleles = 0
+    for allele, msf_seq in msf_dict.items():
+        if allele == ref_allele: # the reference is counted as correct without comparison
+            num_correct_alleles += 1
+            continue
+        msf_var_list = msfToVarList(msf_dict[ref_allele], msf_seq)
+        '''print('\t' + str(var_dict[allele] == set(msf_var_list)) + '\t' + str(allele) + '\t' + str(msf_var_list))'''
+
+        try:
+            assert var_dict[allele] == set(msf_var_list)
+            num_correct_alleles += 1
+        except AssertionError: # record the allele and print both variant sets
+            incorrect_msf_entries.append(allele)
+            print('\n')
+            print('\t\tVar File:\t' + str(var_dict[allele]))
+            print('\t\tMSF File:\t' + str(set(msf_var_list)))
+            print('\t\tDifference:\t' + str(var_dict[allele] - set(msf_var_list)) + '\n') # one-directional: variants in the .var set that the MSF set lacks
+            '''sys.exit(1)'''
+
+    print("\t%d out of %d alleles have correct msf sequences\n" % (num_correct_alleles, len(msf_dict)))
+
+def check_msf_files():
+    # Run checkMSFfile for every gene, then summarise the failing alleles.
+    print("\nChecking MSF files:")
+    for name in gene_names:
+        msf_path = 'cyp_msf/%s_gen.msf' % name[3:].upper()
+        checkMSFfile(name, msf_path, 'cyp_var_files/%s.var' % name, 'cyp_fasta/%s.fasta' % name)
+    print('\n\n%d incorrect msf entries on alleles %s\n' % (len(incorrect_msf_entries), str(incorrect_msf_entries)))
+
+
+"""
+Write allele sequences to fasta for each gene
+"""
+
+def writeGenFasta(gene_name, msf_fname, line_length):
+    # Write every allele in the MSF file as an ungapped FASTA record,
+    # wrapped at line_length columns, to gen_fasta/<GENE>_gen.fasta.
+    try:
+        msf_handle = open(msf_fname,'r')
+        alleles = readMSF(msf_handle)
+        msf_handle.close()
+    except IOError:
+        print("\t%s msf file was skipped." % (gene_name))
+        return
+
+    out = open('gen_fasta/%s_gen.fasta' % gene_name[3:].upper(), 'w')
+    for allele in alleles:
+        bases = alleles[allele].replace('.','')
+        out.write('>' + allele[3:].upper() + ' ' + str(len(bases)) + ' bp' + '\n')
+        chunks = [bases[k:k+line_length] for k in range(0, len(bases), line_length)]
+        out.write('\n'.join(chunks) + '\n')
+    out.close()
+    print('%s_gen.fasta completed' % gene_name)
+
+def build_gen_fasta_files():
+    # Create the gen_fasta directory, then write one FASTA file per gene.
+    os.system('mkdir gen_fasta')
+    print("\nBuilding alleles sequence fasta files:")
+    for name in gene_names:
+        writeGenFasta(name, 'cyp_msf/%s_gen.msf' % name[3:].upper(), 60)
+
+
+"""
+Run script
+"""
+
+def extract_cyp_data(): # script entry point: runs the four pipeline stages below, in this order
+    download_CYP(True)
+    build_msf_files()
+    check_msf_files() # validates each gene's MSF alignment and prints a summary of failing alleles
+    build_gen_fasta_files() # writes the per-gene allele FASTA files under gen_fasta/
+
+####################################################################################################
+## Debugging BLASTN alignment ref alleles
+
+def adjust_blast_vars(blast_vars_list,qry_pos):
+    # Re-base variant strings produced from one BLAST alignment row onto
+    # query coordinates by adding (qry_pos - 1) to every position.
+    # Accepted forms, tested in this order:
+    #   "<pos><ref>><alt>"        substitution
+    #   "<pos>[_<pos>]del<seq>"   deletion
+    #   "<pos>_<pos>ins<seq>"     insertion (the two positions must be adjacent)
+    # Returns a new list; the input list is not modified.
+    if len(blast_vars_list) == 0:
+        return []
+
+    shift = qry_pos - 1
+    shifted_vars = []
+    for var in blast_vars_list:
+        if '>' in var: # SNP: the last three characters are "<ref>><alt>"
+            shifted_vars.append(str(int(var[:-3]) + shift) + var[-3:])
+            continue
+
+        keyword = 'del' if 'del' in var else 'ins'
+        assert keyword in var
+        pieces = var.split(keyword)
+        coords = [int(p) + shift for p in pieces[0].split('_')]
+        if keyword == 'del' and len(coords) == 1:
+            shifted = str(coords[0])
+        else:
+            assert len(coords) == 2
+            if keyword == 'ins':
+                assert coords[1] - coords[0] == 1
+            shifted = str(coords[0]) + '_' + str(coords[1])
+        shifted_vars.append(shifted + keyword + pieces[1])
+
+    return shifted_vars
+
+def extract_var_from_blast(cyp_blast_fname): # parse a saved blastn text report and return the subject-vs-query differences as variant strings
+    blastn_file = open(cyp_blast_fname,'r')
+    all_lines = [line.strip() for line in blastn_file if not (len(line.strip()) == 0 or line.strip().startswith('|'))] # drop blank lines and lines starting with '|' (presumably the alignment match bars)
+    blastn_file.close()
+
+    id_match = [m.group(0) for l in all_lines[0:25] for m in [re.compile('.*(Identities.*).*').search(l)] if m][0] # first line containing "Identities" among the first 25 kept lines (IndexError if none)
+    id_match = id_match.split('%')[0].split(' (')[0].split('= ')[1].split('/') # the "a/b" after "= " -> ['a', 'b']
+    id_match = [int(i) for i in id_match]
+
+    # print(id_match)    
+    assert len(id_match) == 2 and id_match[1] - id_match[0] >= 0
+    if id_match[1] - id_match[0] == 0: # a == b: nothing to report
+        print('\tPerfect match using blastn')
+        return []
+    
+    
+    start = -1
+    end = -1
+    for i in range(len(all_lines)): # Get rid of headers and footers
+        if all_lines[i].startswith('Score ='):
+            assert start == -1 # a second 'Score =' line before 'Lambda' is not supported
+            start = i
+
+        if all_lines[i].startswith('Lambda'):
+            assert start != -1 and end == -1
+            end = i
+            break
+
+    all_lines = all_lines[start + 3 : end] # skip the 'Score =' line and the two kept lines after it; stop before 'Lambda'
+    # print('\n'.join(all_lines))
+
+    blastn_var_list = []
+    for i in range(0,len(all_lines),2): # the remaining lines are consumed in pairs: even index = query row, odd index = subject row
+        qry_seq = '\t'.join(all_lines[i].split())
+        qry_seq_pos = int(qry_seq.split('\t')[1]) # second whitespace-separated field: where this query row starts
+        sbj_seq = '\t'.join(all_lines[i + 1].split())
+        qry_seq = qry_seq.split('\t')[2].replace('-','.').upper() # third field is the sequence; '-' gaps become the '.' that msfToVarList expects
+        sbj_seq = sbj_seq.split('\t')[2].replace('-','.').upper()
+        #print(qry_seq)
+        #print(sbj_seq)
+
+        temp_var_list = msfToVarList(qry_seq, sbj_seq) # query row plays the reference role
+        #print(str(qry_seq_pos) + '\t' + str(temp_var_list) +  '\t' + str(adjust_blast_vars(temp_var_list,qry_seq_pos)))
+        temp_var_list = adjust_blast_vars(temp_var_list,qry_seq_pos) # shift row-relative positions by the row's start position
+        blastn_var_list = blastn_var_list + temp_var_list
+        
+    return blastn_var_list
+
+# extract_var_from_blast('cyp_blast_alignment/cyp2d6_blast.align')
+
+extract_cyp_data() # NOTE(review): unguarded module-level call, so the whole pipeline also runs when this file is imported
diff --git a/hisatgenotype_modules/hisatgenotype_gene_typing.py b/hisatgenotype_modules/hisatgenotype_gene_typing.py
new file mode 100755
index 0000000..f5f02aa
--- /dev/null
+++ b/hisatgenotype_modules/hisatgenotype_gene_typing.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import sys, os
+import math
+import random
+from copy import deepcopy
+
+
+"""
+"""
+def clone_IMGTHLA_database():
+    # Clone the IMGTHLA repository and check out one fixed revision so that
+    # every machine works from the same data.
+    os.system("git clone https://github.com/jrob119/IMGTHLA.git")
+    pinned = "d3b559b34b96ff9e7f0d97476222d8e4cdee63ad" # Revision on November 16, 2016
+    # Alternative pin: "45c377516bdb7f1b926" (Revision on July 14, 2016)
+    os.system("cd IMGTHLA; git checkout %s; cd .." % pinned)
+
diff --git a/hisatgenotype_modules/hisatgenotype_typing_common.py b/hisatgenotype_modules/hisatgenotype_typing_common.py
new file mode 100755
index 0000000..19095f1
--- /dev/null
+++ b/hisatgenotype_modules/hisatgenotype_typing_common.py
@@ -0,0 +1,1549 @@
+#!/usr/bin/env python
+#
+# Copyright 2017, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT-genotype.
+#
+# HISAT-genotype is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT-genotype is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, re
+import math
+import random
+from copy import deepcopy
+from datetime import datetime
+
+
+##################################################
+#   Sequence processing routines
+##################################################
+
+
+"""
+"""
+def reverse_complement(seq):
+    # Return the reverse complement of seq as a string.
+    # A<->T and C<->G are swapped; any other character (N, lowercase
+    # bases, gaps, ...) is copied through unchanged.
+    pairing = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
+    complemented = []
+    for base in seq[::-1]:
+        complemented.append(pairing.get(base, base))
+    return "".join(complemented)
+
+
+"""
+"""
+def read_genome(genome_file): # parse FASTA lines (any iterable of text lines) into (name -> sequence, names, full header texts)
+    chr_dic, chr_names, chr_full_names = {}, [], []
+    chr_name, chr_full_name, sequence = "", "", ""
+    for line in genome_file:
+        if line.startswith(">"):
+            if chr_name and sequence: # flush the previous record; records without sequence are dropped from all three results
+                chr_dic[chr_name] = sequence
+                chr_names.append(chr_name)
+                chr_full_names.append(chr_full_name) # recorded here, together with the name, so both lists stay index-parallel
+            chr_full_name = line.strip()[1:] # whole header text after '>'
+            chr_name = line.strip().split()[0][1:] # first whitespace-delimited token of the header
+            sequence = ""
+        else:
+            sequence += line.strip()
+    if chr_name and sequence: # flush the last record; this is now the only place its full name is appended
+        chr_dic[chr_name] = sequence
+        chr_names.append(chr_name)
+        chr_full_names.append(chr_full_name)
+    return chr_dic, chr_names, chr_full_names
+
+
+##################################################
+#   Alleles, variants, haplotypes, etc.
+##################################################
+
+
+"""
+"""
+def read_allele_sequences(fname):
+    # Parse a FASTA file into {header text after '>': sequence}; on duplicate headers the first record wins.
+    allele_seqs, allele_name, chunks = {}, "", []
+    with open(fname) as fasta_file: # closed on exit, also when parsing raises
+        for line in fasta_file:
+            if not line.startswith(">"):
+                chunks.append(line.strip())
+                continue
+            if allele_name != "": # a new header ends the previous record
+                allele_seqs.setdefault(allele_name, "".join(chunks))
+            allele_name, chunks = line.strip()[1:], []
+    if allele_name != "": # last record has no following header
+        allele_seqs.setdefault(allele_name, "".join(chunks))
+    return allele_seqs
+
+
+"""
+"""
+def read_variants(fname):
+    # Parse whitespace-separated "<var_id> <type> <allele> <left> <data>" lines into
+    # {allele: [[left, type, data, var_id], ...]}; data is converted to int for deletions only.
+    allele_vars = {}
+    with open(fname) as var_file: # closed on exit, also when parsing raises
+        for line in var_file:
+            var_id, var_type, allele_name, left, data = line.strip().split()
+            if var_type == "deletion":
+                data = int(data)
+            allele_vars.setdefault(allele_name, []).append([int(left), var_type, data, var_id])
+    return allele_vars
+
+
+"""
+"""
+def read_haplotypes(fname):
+    # Parse whitespace-separated "<hap_id> <allele> <left> <right> <var,var,...>" lines into
+    # {allele: [[left, right, [var ids]], ...]}; the haplotype id is read but not kept.
+    allele_haplotypes = {}
+    with open(fname) as haplotype_file: # closed on exit, also when parsing raises
+        for line in haplotype_file:
+            _, allele_name, left, right, var_ids = line.strip().split()
+            entry = [int(left), int(right), var_ids.split(',')]
+            allele_haplotypes.setdefault(allele_name, []).append(entry)
+    return allele_haplotypes
+
+
+"""
+"""
+def read_links(fname): # parse "<var_id>\t<allele names>" lines into [[var_id, allele names], ...]; the second field is kept as one string
+    links = []
+    with open(fname) as link_file: # closed on exit, also when parsing raises
+        for var_id, allele_names in (line.strip().split('\t') for line in link_file): # unpacking enforces exactly two fields
+            links.append([var_id, allele_names])
+    return links
+
+
+"""
+Compare two variants
+"""
+def compare_vars(a, b):
+    # cmp-style ordering of two variants given as [pos, type, data, ...]:
+    # negative if a sorts first, positive if b does, 0 when they tie.
+    # Order: position, then type ('I' before 'S' before anything else),
+    # then the data field.
+    a_pos, a_type, a_data = a[:3]
+    b_pos, b_type, b_data = b[:3]
+
+    if a_pos != b_pos:
+        return a_pos - b_pos
+
+    if a_type != b_type:
+        if 'I' in (a_type, b_type):
+            return -1 if a_type == 'I' else 1
+        return -1 if a_type == 'S' else 1
+
+    if a_data < b_data:
+        return -1
+    if a_data > b_data:
+        return 1
+    return 0
+
+
+"""
+"""
+def lower_bound(Var_list, pos):
+    # Binary search over Var_list, whose entries start with a position and
+    # must be sorted by it. Returns the index of the first entry whose
+    # position is >= pos, or len(Var_list) when there is none.
+    low, high = 0, len(Var_list)
+    while low < high:
+        # Floor division keeps m an integer on Python 2 and Python 3 alike;
+        # plain '/' would produce a float index under Python 3.
+        m = (low + high) // 2
+        if Var_list[m][0] < pos:
+            low = m + 1
+        else:
+            # Var_list[m] may be the answer, so keep it inside the range.
+            # Narrowing here instead of scanning backwards keeps long runs
+            # of equal positions logarithmic.
+            high = m
+    return low
+
+
+
+"""
+"""
+def check_files(fnames):
+    # Report whether every path in fnames exists on disk. An empty list
+    # yields True, and checking stops at the first missing path.
+    exists = os.path.exists
+    return all(exists(path) for path in fnames)
+
+
+##################################################
+#   Database related routines
+##################################################
+
+    
+"""
+Download GRCh38 human reference and HISAT2 indexes
+"""
+def download_genome_and_index():
+    # Nothing to do when the grch38 index directory, genome.fa and
+    # genome.fa.fai are all present in the working directory.
+    if check_files(["grch38", "genome.fa", "genome.fa.fai"]):
+        return
+    os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz")
+    os.system("hisat2-inspect grch38/genome > genome.fa")
+    os.system("samtools faidx genome.fa")
+
+
+"""
+"""
+def clone_hisatgenotype_database(): # clone the hisatgenotype_db repository by running git through the shell
+    os.system("git clone https://github.com/infphilo/hisatgenotype_db.git")
+    
+    # NOTE: no revision is checked out after cloning; the two disabled lines below refer to the IMGTHLA directory, not this repository.
+    # revision = "d3b559b34b96ff9e7f0d97476222d8e4cdee63ad" # Revision on November 16, 2016
+    # os.system("cd IMGTHLA; git checkout %s; cd .." % revision)
+
+
+"""
+"""
+def extract_database_if_not_exists(base,
+                                   locus_list,
+                                   inter_gap = 30,
+                                   intra_gap = 50,
+                                   partial = True,
+                                   verbose = False):
+    # Run hisatgenotype_extract_vars.py for `base` unless all of its output
+    # files already exist; exit with status 1 if they are still missing
+    # afterwards. locus_list may be empty, in which case no locus filter is passed.
+    fnames = [base + "_backbone.fa",
+              base + "_sequences.fa",
+              base + ".locus",
+              base + ".snp",
+              base + ".index.snp",
+              base + ".haplotype",
+              base + ".link",
+              base + ".partial"]
+    if check_files(fnames):
+        return
+
+    extract_cmd = ["hisatgenotype_extract_vars.py"]
+    extract_cmd += ["--base", base]
+    if len(locus_list) > 0:
+        extract_cmd += ["--locus-list", ','.join(locus_list)]
+    if not partial:
+        extract_cmd += ["--no-partial"]
+    if base == "codis": # codis: whole haplotypes and left-shifting, no gap options
+        extract_cmd += ["--whole-haplotype", "--leftshift"]
+    else:
+        extract_cmd += ["--inter-gap", str(inter_gap),
+                        "--intra-gap", str(intra_gap)]
+    if base == "hla":
+        extract_cmd += ["--min-var-freq", "0.1"]
+
+    # DK - debugging purposes
+    # extract_cmd += ["--ext-seq", "300"]
+    if verbose:
+        print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
+    with open(os.devnull, 'w') as devnull: # one discard handle for both streams, closed once the tool has exited
+        subprocess.Popen(extract_cmd, stdout=devnull, stderr=devnull).communicate()
+
+    if not check_files(fnames): # the tool's output is discarded, so success is judged by the files alone
+        print >> sys.stderr, "Error: hisatgenotype_extract_vars failed!"
+        sys.exit(1)
+
+        
+"""
+"""
+def build_index_if_not_exists(base,             # prefix of the <base>_backbone.fa / <base>_sequences.fa inputs
+                              aligner,          # "hisat2"; anything else must be "bowtie2"
+                              index_type,       # "graph" (hisat2 only) or "linear"
+                              threads = 1,      # passed as -p to hisat2-build, graph index only
+                              verbose = False): # echo the graph build command to stderr
+    if aligner == "hisat2":
+        # Build HISAT2 graph indexes based on the above information (skipped when all eight .ht2 files exist)
+        if index_type == "graph":
+            hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base, i+1) for i in range(8)]
+            if not check_files(hisat2_graph_index_fnames):
+                build_cmd = ["hisat2-build",
+                             "-p", str(threads),
+                             "--snp", "%s.index.snp" % base,
+                             "--haplotype", "%s.haplotype" % base,
+                             "%s_backbone.fa" % base,
+                             "%s.graph" % base]
+                if verbose:
+                    print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
+                with open(os.devnull, 'w') as devnull: # discard handle, closed once the build has exited
+                    subprocess.Popen(build_cmd, stdout=devnull, stderr=devnull).communicate()
+                if not check_files(hisat2_graph_index_fnames):
+                    print >> sys.stderr, "Error: indexing HLA failed!  Perhaps, you may have forgotten to build hisat2 executables?"
+                    sys.exit(1)
+        # Build HISAT2 linear indexes based on the above information (backbone plus allele sequences)
+        else:
+            assert index_type == "linear"
+            hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base, i+1) for i in range(8)]
+            if not check_files(hisat2_linear_index_fnames):
+                build_cmd = ["hisat2-build",
+                             "%s_backbone.fa,%s_sequences.fa" % (base, base),
+                             "%s.linear" % base]
+                with open(os.devnull, 'w') as devnull: # discard handle, closed once the build has exited
+                    subprocess.Popen(build_cmd, stdout=devnull, stderr=devnull).communicate()
+                if not check_files(hisat2_linear_index_fnames):
+                    print >> sys.stderr, "Error: indexing HLA failed!"
+                    sys.exit(1)
+    else:
+        # Build Bowtie2 indexes based on the above information (four forward and two reverse .bt2 files)
+        assert aligner == "bowtie2" and index_type == "linear"
+        bowtie2_index_fnames = ["%s.%d.bt2" % (base, i+1) for i in range(4)]
+        bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base, i+1) for i in range(2)]
+        if not check_files(bowtie2_index_fnames): # was the undefined name tcheck_files, which raised NameError here
+            build_cmd = ["bowtie2-build",
+                         "%s_backbone.fa,%s_sequences.fa" % (base, base),
+                         base]
+            with open(os.devnull, 'w') as devnull: # only stdout is discarded; stderr stays visible, as before
+                subprocess.Popen(build_cmd, stdout=devnull).communicate()
+            if not check_files(bowtie2_index_fnames):
+                print >> sys.stderr, "Error: indexing HLA failed!"
+                sys.exit(1)
+
+                    
+
+##################################################
+#   Read simulation and alignment
+##################################################
+
+
+"""
+Simulate reads from alleles with headers (>) filled with mapping information.
+  For an example, see hisat2_test_HLA_genotyping.py.
+"""
+def simulate_reads(seq_dic,                       # seq_dic["A"]["A*24:36N"] = "ACGTCCG ..."
+                   base_fname,                    # hla, codis, cyp, or so on
+                   allele_list,                   # ["A*32:29", "B*07:02:01"]
+                   Vars,                          # Vars["A"]["hv326"] = ["single", 604, "C"]
+                   Links,
+                   simulate_interval = 1,
+                   read_len = 100,
+                   frag_len = 250,
+                   perbase_errorrate = 0.0,
+                   perbase_snprate = 0.0,
+                   skip_fragment_regions = []):
+    reads_1, reads_2 = [], []
+    num_pairs = []
+    for allele_names in allele_list:
+        gene = allele_names[0].split('*')[0]
+        num_pairs.append([])
+
+        # Introduce SNPs into allele sequences
+        def introduce_snps(seq):
+            seq = list(seq)
+            for i in range(len(seq)):
+                if random.random() * 100 < perbase_snprate:
+                    if seq[i] == 'A':
+                        alt_bases = ['C', 'G', 'T']
+                    elif seq[i] == 'C':
+                        alt_bases = ['A', 'G', 'T']
+                    elif seq[i] == 'G':
+                        alt_bases = ['A', 'C', 'T']
+                    else:
+                        assert seq[i] == 'T'
+                        alt_bases = ['A', 'C', 'G']
+                    random.shuffle(alt_bases)
+                    alt_base = alt_bases[0]
+                    seq[i] = alt_base
+            seq = ''.join(seq)
+            return seq
+
+        # Simulate reads from two alleles
+        def simulate_reads_impl(seq,
+                                seq_map,
+                                ex_seq_map,
+                                ex_seq,
+                                ex_desc,
+                                simulate_interval = 1,
+                                read_len = 100,
+                                frag_len = 250,
+                                perbase_errorrate = 0.0,
+                                skip_fragment_regions = []):
+            # Introduce sequencing errors
+            def introduce_seq_err(read_seq, pos):
+                read_seq = list(read_seq)
+                for i in range(read_len):
+                    map_pos = seq_map[pos + i]
+                    if ex_desc[map_pos] != "":
+                        continue
+                    if random.random() * 100 < perbase_errorrate:
+                        if read_seq[i] == 'A':
+                            alt_bases = ['C', 'G', 'T']
+                        elif read_seq[i] == 'C':
+                            alt_bases = ['A', 'G', 'T']
+                        elif read_seq[i] == 'G':
+                            alt_bases = ['A', 'C', 'T']
+                        else:
+                            assert read_seq[i] == 'T'
+                            alt_bases = ['A', 'C', 'G']
+                        random.shuffle(alt_bases)
+                        alt_base = alt_bases[0]
+                        read_seq[i] = alt_base
+                read_seq = ''.join(read_seq)
+                return read_seq                            
+                            
+            # Get read alignment, e.g., 260|R_483_61M5D38M23D1M_46|S|hv154,3|S|hv162,10|D|hv185,38|D|hv266
+            def get_info(read_seq, pos):
+                info = "%d_" % (seq_map[pos] + 1)
+                total_match, match, sub_match = 0, 0, 0
+                var_str = ""
+                ins_len, ins_var = 0, ""
+                for i in range(pos, pos + read_len):
+                    map_i = ex_seq_map[i]
+                    assert ex_seq[map_i] != 'D'
+                    total_match += 1
+                    match += 1
+                    if ex_seq[map_i] == 'I':
+                        if ins_var != "":
+                            assert ins_var == ex_desc[map_i]
+                        ins_var = ex_desc[map_i]
+                        ins_len += 1
+                    elif ins_var != "":
+                        if var_str != "":
+                            var_str += ','
+                        var_str += ("%s|I|%s" % (sub_match, ins_var))
+                        ins_len, ins_var = 0, ""
+                        sub_match = 0
+                    if ex_seq[map_i] != 'I':
+                        if ex_desc[map_i] != "" or read_seq[i-pos] != ex_seq[map_i]:
+                            if var_str != "":
+                                var_str += ','
+                            var_str += ("%d|S|%s" % (sub_match, ex_desc[map_i] if ex_desc[map_i] != "" else "unknown"))
+                            sub_match = 0
+                        else:
+                            sub_match += 1
+                    if i + 1 < pos + read_len and ex_seq[map_i+1] == 'D':
+                        assert match > 0
+                        info += ("%dM" % match)
+                        match = 0
+                        del_len = 1
+                        while map_i + 1 + del_len < len(ex_seq):
+                            if ex_seq[map_i + 1 + del_len] != 'D':
+                                break
+                            del_len += 1
+                        info += ("%dD" % del_len)
+                        if var_str != "":
+                            var_str += ','
+                        var_str += ("%s|D|%s" % (sub_match, ex_desc[map_i + 1]))
+                        sub_match = 0
+                assert match > 0
+                info += ("%dM" % match)
+                assert total_match == read_len
+                if var_str:
+                    info += "_"
+                    info += var_str                
+                return info
+                
+            comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
+            reads_1, reads_2 = [], []
+            for i in range(0, len(seq) - frag_len + 1, simulate_interval):
+                if len(skip_fragment_regions) > 0:
+                    skip = False
+                    for skip_left, skip_right in skip_fragment_regions:
+                        if i <= skip_right and i + frag_len > skip_left:
+                            skip = True
+                            break
+                    if skip:
+                        continue
+                        
+                pos1 = i
+                seq1 = seq[pos1:pos1+read_len]
+                if perbase_errorrate > 0.0:
+                    seq1 = introduce_seq_err(seq1, pos1)
+                info1 = get_info(seq1, pos1)
+                reads_1.append([seq1, info1])
+                
+                pos2 = i + frag_len - read_len
+                seq2 = seq[pos2:pos2+read_len]
+                if perbase_errorrate > 0.0:
+                    seq2 = introduce_seq_err(seq2, pos2)                
+                info2 = get_info(seq2, pos2)
+                tmp_read_2 = reversed(seq2)
+                read_2 = ""
+                for s in tmp_read_2:
+                    if s in comp_table:
+                        read_2 += comp_table[s]
+                    else:
+                        read_2 += s
+                reads_2.append([read_2, info2])
+            return reads_1, reads_2
+
+        # for each allele in a list of alleles such as ['A*32:29', 'B*07:02:01']
+        for allele_name in allele_names:
+            allele_seq = seq_dic[gene][allele_name]
+            backbone_seq = seq_dic[gene]["%s*BACKBONE" % gene]
+            allele_ex_seq = list(backbone_seq)
+            allele_ex_desc = [''] * len(allele_ex_seq)
+            allele_seq_map = [i for i in range(len(allele_seq))]
+            allele_ex_seq_map = [i for i in range(len(allele_seq))]
+
+            if perbase_snprate > 0:
+                HLA_seq = introduce_snps(allele_seq)
+
+            # Extract variants included in each allele
+            var_ids = []
+            for var_id, allele_list in Links.items():
+                if allele_name in allele_list:
+                    var_ids.append(var_id)
+
+            def var_cmp(a, b):
+                assert a.startswith("hv") and b.startswith("hv")
+                return int(a[2:]) - int(b[2:])
+            var_ids = sorted(var_ids, cmp=var_cmp)
+
+            # Build annotated sequence for the allele w.r.t backbone sequence
+            add_pos = 0
+            for var_id in var_ids:
+                var_type, var_pos, var_data = Vars[gene][var_id]
+                var_pos += add_pos
+                if var_type == "single":
+                    allele_ex_seq[var_pos] = var_data
+                    allele_ex_desc[var_pos] = var_id
+                elif var_type == "deletion":
+                    del_len = int(var_data)
+                    assert var_pos + del_len <= len(allele_ex_seq)
+                    allele_ex_seq[var_pos:var_pos+del_len] = ['D'] * del_len
+                    allele_ex_desc[var_pos:var_pos+del_len] = [var_id] * del_len
+                else:
+                    assert var_type == "insertion"
+                    ins_len = len(var_data)
+                    allele_ex_seq = allele_ex_seq[:var_pos] + (['I'] * ins_len) + allele_ex_seq[var_pos:]
+                    allele_ex_desc = allele_ex_desc[:var_pos] + ([var_id] * ins_len) + allele_ex_desc[var_pos:]
+                    add_pos += ins_len
+            allele_ex_seq = ''.join(allele_ex_seq)
+            assert len(backbone_seq) + add_pos == len(allele_ex_seq)            
+
+            # Build mapping from the allele to the annotated sequence
+            prev_j, minus_pos = 0, 0
+            for i in range(len(allele_seq)):
+                for j in range(prev_j, len(allele_ex_seq)):
+                    if allele_ex_seq[j] != 'D':
+                        if allele_ex_seq[j] == 'I':
+                            minus_pos += 1
+                        break
+                allele_seq_map[i] = j - minus_pos
+                allele_ex_seq_map[i] = j
+                prev_j = j + 1
+
+            # DK - debugging purposes
+            """
+            for t in range(0, len(allele_ex_seq), 100):
+                print t, allele_ex_seq[t:t+100]
+                print t, '-'.join(allele_ex_desc[t:t+100])
+                print t, allele_seq_map[t:t+100]
+            print "allele_seq length:", len(allele_seq)
+            print len(allele_ex_seq), "vs.", len(seq_dic[gene]["A*BACKBONE"]), "vs.", len(allele_seq_map)
+            print allele_ex_seq[1943:1946]
+            print allele_ex_desc[1943:1946]
+            sys.exit(1)
+            """
+            
+            tmp_reads_1, tmp_reads_2 = simulate_reads_impl(allele_seq,
+                                                           allele_seq_map,
+                                                           allele_ex_seq_map,
+                                                           allele_ex_seq,
+                                                           allele_ex_desc,
+                                                           simulate_interval,
+                                                           read_len,
+                                                           frag_len,
+                                                           perbase_errorrate,
+                                                           skip_fragment_regions)
+            reads_1 += tmp_reads_1
+            reads_2 += tmp_reads_2
+            num_pairs[-1].append(len(tmp_reads_1))
+
+    # Write reads into a FASTA file
+    def write_reads(reads, idx):
+        read_file = open('%s_input_%d.fa' % (base_fname, idx), 'w')
+        for read_i in range(len(reads)):
+            query_name = "%d|%s_%s" % (read_i + 1, "LR"[idx-1], reads[read_i][1])
+            if len(query_name) > 254:
+                query_name = query_name[:254]
+            print >> read_file, ">%s" % query_name
+            print >> read_file, reads[read_i][0]
+        read_file.close()
+    write_reads(reads_1, 1)
+    write_reads(reads_2, 2)
+
+    return num_pairs
+
+
+"""
+Align reads, and sort the alignments into a BAM file
+"""
+def align_reads(aligner,
+                simulation,
+                index_name,
+                index_type,
+                base_fname,
+                read_fname,
+                fastq,
+                threads,
+                out_fname,
+                verbose):
+    """Run the aligner and pipe its output through "samtools view -bS".
+
+    aligner     -- "hisat2" or "bowtie2"; any other value fails an assertion
+    simulation  -- when false, hisat2 is run with --no-unal
+    index_name  -- passed to the aligner with -x
+    index_type  -- for hisat2, "linear" adds "-k 10" and any other value adds
+                   "--max-altstried 64 --haplotype"; only "graph" triggers
+                   "samtools sort" and "samtools index"
+    base_fname  -- "codis" adds --enable-codis (hisat2, non-linear index only)
+    read_fname  -- list of one (-U) or two (-1/-2) read file names
+    fastq       -- when false, -f is passed to the aligner
+    threads     -- passed to the aligner with -p
+    out_fname   -- name of the sorted BAM file; out_fname + ".unsorted" holds
+                   the intermediate BAM and is removed before returning
+    verbose     -- when >= 1, the aligner command line is printed to stderr
+
+    NOTE(review): when index_type is not "graph" nothing is written to
+    out_fname itself; only the temporary ".unsorted" file is created and then
+    removed. Confirm that callers do not expect a BAM file in that case.
+    """
+    if aligner == "hisat2":
+        aligner_cmd = [aligner, "--mm"]
+        if not simulation:
+            aligner_cmd += ["--no-unal"]
+        DNA = True
+        if DNA:
+            aligner_cmd += ["--no-spliced-alignment"] # no spliced alignment
+            aligner_cmd += ["-X", "1000"] # max fragment length
+        if index_type == "linear":
+            aligner_cmd += ["-k", "10"]
+        else:
+            aligner_cmd += ["--max-altstried", "64"]
+            aligner_cmd += ["--haplotype"]
+            if base_fname == "codis":
+                aligner_cmd += ["--enable-codis"]
+    elif aligner == "bowtie2":
+        aligner_cmd = [aligner,
+                       "--no-unal",
+                       "-k", "10"]
+    else:
+        assert False
+    aligner_cmd += ["-x", index_name]
+    assert len(read_fname) in [1,2]
+    aligner_cmd += ["-p", str(threads)]
+    if not fastq:
+        aligner_cmd += ["-f"]
+    if len(read_fname) == 1:
+        aligner_cmd += ["-U", read_fname[0]]
+    else:
+        aligner_cmd += ["-1", "%s" % read_fname[0],
+                        "-2", "%s" % read_fname[1]]
+    if verbose >= 1:
+        print >> sys.stderr, ' '.join(aligner_cmd)
+
+    unsorted_fname = out_fname + ".unsorted"
+
+    # One shared /dev/null handle for every child's stderr, closed on exit
+    devnull = open("/dev/null", 'w')
+    try:
+        align_proc = subprocess.Popen(aligner_cmd,
+                                      stdout=subprocess.PIPE,
+                                      stderr=devnull)
+
+        sambam_cmd = ["samtools",
+                      "view",
+                      "-bS",
+                      "-"]
+        unsorted_file = open(unsorted_fname, 'w')
+        try:
+            sambam_proc = subprocess.Popen(sambam_cmd,
+                                           stdin=align_proc.stdout,
+                                           stdout=unsorted_file,
+                                           stderr=devnull)
+            # Drop the parent's copy of the pipe so that the aligner receives
+            # SIGPIPE if samtools exits early
+            align_proc.stdout.close()
+            sambam_proc.communicate()
+        finally:
+            unsorted_file.close()
+        # Reap the aligner so that it does not linger as a zombie process
+        align_proc.wait()
+
+        if index_type == "graph":
+            bamsort_cmd = ["samtools",
+                           "sort",
+                           unsorted_fname,
+                           "-o", out_fname]
+            bamsort_proc = subprocess.Popen(bamsort_cmd,
+                                            stderr=devnull)
+            bamsort_proc.communicate()
+
+            bamindex_cmd = ["samtools",
+                            "index",
+                            out_fname]
+            bamindex_proc = subprocess.Popen(bamindex_cmd,
+                                             stderr=devnull)
+            bamindex_proc.communicate()
+    finally:
+        devnull.close()
+
+    # Remove the intermediate file without going through a shell, so that file
+    # names containing spaces or shell metacharacters are handled correctly
+    if os.path.exists(unsorted_fname):
+        os.remove(unsorted_fname)
+
+
+"""
+HISAT-genotype's mpileup
+"""
+def get_mpileup(alignview_cmd,
+                ref_seq,
+                base_locus,
+                vars,
+                allow_discordant):
+    """Build a per-position pile-up of read bases from alignment text.
+
+    alignview_cmd    -- command (argument list) whose stdout yields one
+                        whitespace-separated alignment record per line, with
+                        flag, position, CIGAR and read sequence in columns
+                        2, 4, 6 and 10
+    ref_seq          -- reference sequence; the pile-up has one entry per base
+    base_locus       -- offset subtracted (together with 1) from each record's
+                        position to obtain a 0-based index into ref_seq
+    vars             -- dict mapping variant id to [type, position, data]
+    allow_discordant -- when false, records without flag bit 0x2 are skipped
+
+    Returns a list with one [nt_set, nt_dic] entry per reference position.
+    nt_set lists the bases among "ACGT" that pass the support thresholds
+    below; nt_dic maps each observed symbol (a base or 'D' for a deleted
+    position) to [count, var_id], where var_id is "" unless the symbol matches
+    a known variant at that position.
+    """
+    ref_seq_len = len(ref_seq)
+    mpileup = []
+    for i in range(ref_seq_len):
+        mpileup.append([[], {}])
+
+    cigar_re = re.compile(r'\d+\w')
+    devnull = open("/dev/null", 'w')
+    try:
+        proc = subprocess.Popen(alignview_cmd,
+                                stdout=subprocess.PIPE,
+                                stderr=devnull)
+
+        for line in proc.stdout:
+            line = line.strip()
+            cols = line.split()
+            _, flag, _, pos, _, cigar_str = cols[:6]
+            read_seq = cols[9]
+            flag, pos = int(flag), int(pos)
+            # Unaligned?
+            if flag & 0x4 != 0:
+                continue
+            pos -= (base_locus + 1)
+            if pos < 0:
+                continue
+
+            # Concordantly mapped?
+            concordant = (flag & 0x2 != 0)
+            if not allow_discordant and not concordant:
+                continue
+
+            # read_pos walks along the read, right_pos along the reference
+            read_pos, right_pos = 0, pos
+            cigars = cigar_re.findall(cigar_str)
+            cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
+            for cigar_op, length in cigars:
+                if cigar_op in "MD":
+                    for j in range(length):
+                        if cigar_op == 'M':
+                            read_nt = read_seq[read_pos + j]
+                        else:
+                            read_nt = 'D'
+                        # Positions beyond the end of the reference are ignored
+                        if right_pos + j < len(mpileup):
+                            counts = mpileup[right_pos + j][1]
+                            if read_nt not in counts:
+                                counts[read_nt] = 1
+                            else:
+                                counts[read_nt] += 1
+
+                if cigar_op in "MND":
+                    right_pos += length
+
+                if cigar_op in "MIS":
+                    read_pos += length
+
+        # All output has been consumed; release the pipe and reap the child
+        proc.stdout.close()
+        proc.wait()
+    finally:
+        devnull.close()
+
+    # Choose representative bases or 'D'
+    #   a position needs at least 20 observations; a base is kept when it
+    #   accounts for at least 20% of them or is seen at least 7 times
+    for i in range(len(mpileup)):
+        nt_dic = mpileup[i][1]
+        num_nt = sum(nt_dic.values())
+        nt_set = []
+        if num_nt >= 20:
+            for nt, count in nt_dic.items():
+                if nt not in "ACGT":
+                    continue
+                if count >= num_nt * 0.2 or count >= 7:
+                    nt_set.append(nt)
+        mpileup[i][0] = nt_set
+
+    # Sort variants
+    var_list = [[] for i in range(len(mpileup))]
+    for var_id, value in vars.items():
+        var_type, var_pos, var_data = value
+        assert var_pos < len(var_list)
+        var_list[var_pos].append([var_id, var_type, var_data])
+
+    # Assign known or unknown variants
+    #   skip_i is the last position covered by the most recently matched
+    #   deletion, so that its interior positions reuse the same variant id
+    skip_i, prev_del_var_id = -1, ""
+    for i in range(len(mpileup)):
+        nt_dic = mpileup[i][1]
+        ref_nt = ref_seq[i]
+        new_nt_dic = {}
+        for nt, count in nt_dic.items():
+            var_id = ""
+            if nt == 'D':
+                if i <= skip_i:
+                    assert prev_del_var_id != ""
+                    var_id = prev_del_var_id
+                else:
+                    for var_id_, var_type, var_data in var_list[i]:
+                        if var_type != "deletion":
+                            continue
+                        del_len = int(var_data)
+                        # A known deletion matches only if every position it
+                        # spans has at least one 'D' observation
+                        del_exist = True
+                        for j in range(i + 1, i + del_len):
+                            assert j < len(mpileup)
+                            nt_dic2 = mpileup[j][1]
+                            if 'D' not in nt_dic2:
+                                del_exist = False
+                                break
+                        if del_exist:
+                            var_id = var_id_
+                            prev_del_var_id = var_id
+                            skip_i = i + del_len - 1
+                            break
+            elif nt != 'N' and nt != ref_nt:
+                assert nt in "ACGT"
+                for var_id_, var_type, var_data in var_list[i]:
+                    if var_type != "single":
+                        continue
+                    if nt == var_data:
+                        var_id = var_id_
+                        break
+            new_nt_dic[nt] = [count, var_id]
+
+        mpileup[i][1] = new_nt_dic
+
+    return mpileup
+
+
+"""
+"""
+def get_pair_interdist(alignview_cmd,
+                       simulation,
+                       verbose):
+    """Return the median gap between the two mates of aligned read pairs.
+
+    alignview_cmd -- command (argument list) whose stdout yields one
+                     whitespace-separated alignment record per line; the
+                     output is piped through "sort -k 1,1 -s" so that records
+                     sharing a read id are adjacent
+    simulation    -- when true, the read id is truncated at the first '|'
+    verbose       -- unused in this function
+
+    Only records that carry an NH field equal to 1 and a YT field equal to
+    "CP" are used. For every read id with exactly two such records the gap is
+    (left end of the right-hand mate) - (right end of the left-hand mate) - 1.
+    Returns the element at index len/2 of the sorted gaps, or -1 when no pair
+    qualifies.
+    """
+    dist_list = []
+
+    def add_pair_dist(reads):
+        # Record the gap for a read id that has exactly two alignments
+        if len(reads) != 2:
+            return
+        left1, right1 = reads[0]
+        left2, right2 = reads[1]
+        if left1 <= left2:
+            dist = left2 - right1 - 1
+        else:
+            dist = left1 - right2 - 1
+        dist_list.append(dist)
+
+    cigar_re = re.compile(r'\d+\w')
+    devnull = open("/dev/null", 'w')
+    try:
+        bamview_proc = subprocess.Popen(alignview_cmd,
+                                        stdout=subprocess.PIPE,
+                                        stderr=devnull)
+        sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
+        alignview_proc = subprocess.Popen(sort_read_cmd,
+                                          stdin=bamview_proc.stdout,
+                                          stdout=subprocess.PIPE,
+                                          stderr=devnull)
+        # Drop the parent's copy of the pipe between the two children
+        bamview_proc.stdout.close()
+
+        prev_read_id = None
+        reads = []
+        for line in alignview_proc.stdout:
+            line = line.strip()
+            cols = line.split()
+            read_id, flag, _, pos, _, cigar_str = cols[:6]
+            flag, pos = int(flag), int(pos)
+            # Unaligned?
+            if flag & 0x4 != 0:
+                continue
+
+            if simulation:
+                read_id = read_id.split('|')[0]
+
+            # Optional fields look like "NH:i:1" / "YT:Z:CP"; the value starts
+            # at offset 5. A record without an NH field keeps sys.maxint and
+            # is therefore skipped.
+            NH, YT = sys.maxint, ""
+            for i in range(11, len(cols)):
+                col = cols[i]
+                if col.startswith("NH"):
+                    NH = int(col[5:])
+                elif col.startswith("YT"):
+                    YT = col[5:]
+            if NH > 1 or YT != "CP":
+                continue
+
+            # A new read id closes the previous pair
+            if prev_read_id != None and read_id != prev_read_id:
+                add_pair_dist(reads)
+                reads = []
+
+            # Reference span of this alignment: M, N and D consume reference
+            left_pos = right_pos = pos
+            cigars = cigar_re.findall(cigar_str)
+            for cigar in cigars:
+                cigar_op, length = cigar[-1], int(cigar[:-1])
+                if cigar_op in "MND":
+                    right_pos += length
+
+            reads.append([left_pos, right_pos - 1])
+
+            prev_read_id = read_id
+
+        # The final pair has no following read id to trigger the flush above
+        add_pair_dist(reads)
+
+        alignview_proc.stdout.close()
+        alignview_proc.wait()
+        bamview_proc.wait()
+    finally:
+        devnull.close()
+
+    dist_list = sorted(dist_list)
+    if len(dist_list) > 0:
+        dist_median = dist_list[len(dist_list) // 2]
+    else:
+        dist_median = -1
+
+    return dist_median
+
+
+##################################################
+#   Statistical routines
+##################################################
+
+
+"""
+"""
+def prob_diff(prob1, prob2):
+    # Accumulated absolute difference over the alleles of prob1; an allele
+    # missing from prob2 contributes its full prob1 value, and alleles present
+    # only in prob2 are not counted
+    total = 0.0
+    for name, value in prob1.items():
+        if name not in prob2:
+            total += value
+            continue
+        total += abs(value - prob2[name])
+    return total
+
+
+"""
+"""
+def Gene_prob_cmp(a, b):
+    # Comparison function for [name, probability] pairs:
+    #   higher probability first, then names in ascending order
+    name_a, prob_a = a[0], a[1]
+    name_b, prob_b = b[0], b[1]
+    if prob_a != prob_b:
+        return 1 if prob_a < prob_b else -1
+    # Two entries with the same probability must differ by name
+    assert name_a != name_b
+    return -1 if name_a < name_b else 1
+
+
+"""
+"""
+def single_abundance(Gene_cmpt, Gene_length, exonic = False):
+    """Estimate allele abundances from compatibility counts with EM.
+
+    Gene_cmpt   -- dict mapping a '-'-joined string of allele names to a count
+    Gene_length -- dict mapping allele name to a length; used only when exonic
+                   is false, to divide each allele's mass by its length
+    exonic      -- when true, masses are normalized by their plain sum
+
+    Returns a list of [allele, probability] pairs sorted with Gene_prob_cmp.
+    """
+    def normalize(prob):
+        # Scale the masses in place by their sum
+        total = sum(prob.values())
+        for allele, mass in prob.items():
+            prob[allele] = mass / total
+
+    def normalize_len(prob, length):
+        # Same as normalize(), but each mass is first divided by allele length
+        total = 0
+        for allele, mass in prob.items():
+            assert allele in length
+            total += (mass / length[allele])
+        for allele, mass in prob.items():
+            assert allele in length
+            prob[allele] = mass / length[allele] / total
+
+    # Initial estimate: every count is split evenly among its alleles
+    Gene_prob = {}
+    for cmpt, count in Gene_cmpt.items():
+        alleles = cmpt.split('-')
+        for allele in alleles:
+            if allele not in Gene_prob:
+                Gene_prob[allele] = 0.0
+            Gene_prob[allele] += (float(count) / len(alleles))
+    if exonic:
+        normalize(Gene_prob)
+    else:
+        normalize_len(Gene_prob, Gene_length)
+
+    def next_prob(Gene_cmpt, Gene_prob, Gene_length):
+        # One EM step: redistribute every count among its alleles in
+        # proportion to their current probabilities. Alleles absent from
+        # Gene_prob are ignored, and a compatible set whose alleles carry no
+        # mass contributes nothing (so its alleles may be missing from the
+        # result).
+        Gene_prob_next = {}
+        for cmpt, count in Gene_cmpt.items():
+            alleles = cmpt.split('-')
+            alleles_prob = 0.0
+            for allele in alleles:
+                if allele not in Gene_prob:
+                    continue
+                alleles_prob += Gene_prob[allele]
+            if alleles_prob <= 0.0:
+                continue
+            for allele in alleles:
+                if allele not in Gene_prob:
+                    continue
+                if allele not in Gene_prob_next:
+                    Gene_prob_next[allele] = 0.0
+                Gene_prob_next[allele] += (float(count) * Gene_prob[allele] / alleles_prob)
+        if exonic:
+            normalize(Gene_prob_next)
+        else:
+            normalize_len(Gene_prob_next, Gene_length)
+        return Gene_prob_next
+
+    def select_alleles(Gene_prob):
+        # Keep the alleles whose probability is at least a tenth of the maximum
+        if len(Gene_prob) == 0:
+            return Gene_prob
+        Gene_prob2 = {}
+        max_prob = max(Gene_prob.values())
+        for allele, prob in Gene_prob.items():
+            if prob >= max_prob / 10.0:
+                Gene_prob2[allele] = prob
+        return Gene_prob2
+
+    fast_EM = True
+    diff, num_iter = 1.0, 0
+    while diff > 0.0001 and num_iter < 1000:
+        Gene_prob_next = next_prob(Gene_cmpt, Gene_prob, Gene_length)
+        if fast_EM:
+            # Accelerated version of EM - SQUAREM iteration
+            #    Varadhan, R. & Roland, C. Scand. J. Stat. 35, 335-353 (2008)
+            #    Also, this algorithm is used in Sailfish - http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html
+            Gene_prob_next2 = next_prob(Gene_cmpt, Gene_prob_next, Gene_length)
+            sum_squared_r, sum_squared_v = 0.0, 0.0
+            p_r, p_v = {}, {}
+            for a in Gene_prob.keys():
+                # next_prob() can drop an allele whose compatible sets all
+                # have zero mass; such an allele has an updated mass of 0.0
+                next_a = Gene_prob_next.get(a, 0.0)
+                p_r[a] = next_a - Gene_prob[a]
+                sum_squared_r += (p_r[a] * p_r[a])
+                p_v[a] = Gene_prob_next2.get(a, 0.0) - next_a - p_r[a]
+                sum_squared_v += (p_v[a] * p_v[a])
+            if sum_squared_v > 0.0:
+                gamma = -math.sqrt(sum_squared_r / sum_squared_v)
+                for a in Gene_prob.keys():
+                    # Extrapolated estimate, clipped so that it stays non-negative
+                    Gene_prob_next2[a] = max(0.0, Gene_prob[a] - 2 * gamma * p_r[a] + gamma * gamma * p_v[a])
+                Gene_prob_next = next_prob(Gene_cmpt, Gene_prob_next2, Gene_length)
+
+        diff = prob_diff(Gene_prob, Gene_prob_next)
+        Gene_prob = Gene_prob_next
+
+        # Accelerate convergence
+        if num_iter >= 10:
+            Gene_prob = select_alleles(Gene_prob)
+
+        # DK - debugging purposes
+        if num_iter % 10 == 0 and False:
+            print >> sys.stderr, "iter", num_iter
+            for allele, prob in Gene_prob.items():
+                if prob >= 0.01:
+                    print >> sys.stderr, "\t", num_iter, allele, prob
+
+        num_iter += 1
+
+    Gene_prob = select_alleles(Gene_prob)
+    if exonic:
+        normalize(Gene_prob)
+    else:
+        normalize_len(Gene_prob, Gene_length)
+    Gene_prob = [[allele, prob] for allele, prob in Gene_prob.items()]
+    Gene_prob = sorted(Gene_prob, cmp=Gene_prob_cmp)
+    return Gene_prob
+
+
+##################################################
+#   Realignment, alternative alignments
+##################################################
+
+
+"""
+Identify alternative haplotypes
+   insertions are not considered...
+
+   INPUT: see the function's parameters below
+   OUTPUT: 529-hv8-hv22-606: set(['529-hv13-570', '529-hv4-hv18-590', '529-hv2-hv16-582'])
+          529-hv3-hv17-598: set(['529-hv6-hv21-hv26-610'])
+"""
+def get_alternatives(ref_seq,     # GATAACTAGATACATGAGATAGATTTGATAGATAGATAGATACATACATACATACATACATACAGGATAGATAACTAGG...
+                     allele_vars, # {'VWA*20(22)': ['hv231', 'hv245'], "VWA*16(18')": ['hv235', 'hv250', 'hv256'], ...}
+                     Vars,        # {'hv241': ['deletion', 529, '52'], 'hv240': ['deletion', 529, '48'], ... }
+                     Var_list,    # [[529, 'hv230'], [529, 'hv231'], [529, 'hv232'], [529, 'hv233'], ...]
+                     verbose):
+    """Find, for each deletion, haplotypes that spell the same sequence.
+
+    A haplotype is handled as a list [left, var_id, ..., right] and is stored
+    in the result as the '-'-joined string "left-var_id-...-right".
+
+    Returns (haplotype_alts_left, haplotype_alts_right): two dicts mapping a
+    haplotype string to the set of haplotype strings recorded as its
+    alternatives, found by extending to the left and to the right of each
+    deletion respectively. Every recorded relation is stored in both
+    directions. When verbose is true the haplotypes are also printed.
+    """
+    haplotype_alts_left, haplotype_alts_right = {}, {}
+    # Pairs "varA-varB" of variants that are adjacent in some allele's list
+    second_order_haplotypes = set()
+    for allele_name, vars in allele_vars.items():
+        for v in range(len(vars) - 1):
+            ht = vars[v] + "-" + vars[v+1]
+            second_order_haplotypes.add(ht)
+
+    # Variants keyed by the position used when extending to the left:
+    #   the last deleted base for a deletion, position + 1 for an insertion
+    rev_Var_list = []
+    for _, var_id in Var_list:
+        var_type, var_pos, var_data = Vars[var_id]
+        if var_type == "deletion":
+            var_pos = var_pos + int(var_data) - 1
+        elif var_type == "insertion":
+            var_pos += 1
+        rev_Var_list.append([var_pos, var_id])
+    rev_Var_list = sorted(rev_Var_list, cmp=lambda a, b: a[0] - b[0])
+
+    # Extend a haplotype by one position to the left or to the right.
+    #   Returns a list of [extended_haplotype, base] candidates: the reference
+    #   base at the new position plus one entry per usable variant located
+    #   there; an empty list when the new position is outside ref_seq.
+    #   Variants in exclude_list are never used.
+    def nextbases(haplotype,
+                  left = True,
+                  exclude_list = []):
+        if left:
+            pos = int(haplotype[0]) - 1
+        else:
+            pos = haplotype[-1] + 1
+        if pos < 0 or pos >= len(ref_seq):
+            return []
+
+        if left:
+            bases = [[[pos] + haplotype[1:], ref_seq[pos]]]
+            # Left-most variant already in the haplotype, if any
+            prev_id = None
+            if len(haplotype) > 2:
+                prev_id = haplotype[1]        
+
+            # NOTE(review): lower_bound is defined outside this block; it is
+            # presumably a binary search on the first element of each entry
+            var_i = lower_bound(rev_Var_list, pos + 1)
+            for var_j in reversed(range(0, var_i)):
+                _, var_id = rev_Var_list[var_j]
+                var_type, var_pos, var_data = Vars[var_id]
+                if var_type == "deletion":
+                    if var_pos == 0:
+                        continue
+                    var_pos = var_pos + int(var_data) - 1
+                # Only variants whose (end) position is exactly pos are used
+                if var_pos > pos:
+                    continue
+                if var_pos < pos:
+                    break
+                if var_id in exclude_list:
+                    continue
+                # The new variant must be seen next to prev_id in some allele
+                if prev_id:
+                    second_ht = var_id + "-" + prev_id
+                    if second_ht not in second_order_haplotypes:
+                        continue
+
+                if var_type == "single":
+                    bases.append([[var_pos, var_id] + haplotype[1:], var_data])
+                elif var_type == "deletion":
+                    # A deletion contributes no base itself: continue from the
+                    # start of the deleted stretch
+                    bases2 = nextbases([var_pos - int(var_data) + 1, var_id] + haplotype[1:],
+                                       left,
+                                       exclude_list)
+                    bases += bases2
+                else:
+                    # Insertions are not extended through
+                    assert var_type == "insertion"
+        else:
+            bases = [[haplotype[:-1] + [pos], ref_seq[pos]]]
+            # Right-most variant already in the haplotype, if any
+            prev_id = None
+            if len(haplotype) > 2:
+                prev_id = haplotype[-2]       
+
+            var_i = lower_bound(Var_list, pos)
+            for var_j in range(var_i, len(Var_list)):
+                _, var_id = Var_list[var_j]
+                var_type, var_pos, var_data = Vars[var_id]
+                # Only variants starting exactly at pos are used
+                if var_pos < pos:
+                    continue
+                if var_pos > pos:
+                    break
+                if var_id in exclude_list:
+                    continue
+                if prev_id:
+                    second_ht = prev_id + "-" + var_id
+                    if second_ht not in second_order_haplotypes:
+                        continue
+
+                if var_type == "single":
+                    bases.append([haplotype[:-1] + [var_id, var_pos], var_data])
+                elif var_type == "deletion":
+                    # Continue from the last deleted base
+                    bases2 = nextbases(haplotype[:-1] + [var_id, var_pos + int(var_data) - 1],
+                                       left,
+                                       exclude_list)
+                    bases += bases2
+                else:
+                    assert var_type == "insertion"
+
+        return bases
+
+    # Spell out the sequence of a haplotype given as [left, var_id..., right]
+    #   (left and right may be strings, hence the int() conversions)
+    def get_haplotype_seq(haplotype):
+        seq = ""
+        pos = int(haplotype[0])
+        for i in range(1, len(haplotype) - 1):
+            var_id = haplotype[i]
+            var_type, var_pos, var_data = Vars[var_id]
+            # Reference bases between the previous position and this variant
+            if pos < var_pos:
+                seq += ref_seq[pos:var_pos]
+            if var_type == "single":
+                seq += var_data
+                pos = var_pos + 1
+            elif var_type == "deletion":
+                pos = var_pos + int(var_data)
+            else:
+                assert var_type == "insertion"
+                seq += var_data
+                pos = var_pos
+            
+        last_pos = int(haplotype[-1]) + 1
+        assert pos <= last_pos
+        if pos < last_pos:
+            seq += ref_seq[pos:last_pos]                
+        return seq
+
+    # Extend haplotype and haplotype_alt in parallel, one base at a time, for
+    #   as long as both can be extended with the same base; the alternative is
+    #   not allowed to use var_orig_id. When no further common extension
+    #   exists (and at least one step was taken), the pair is recorded.
+    def get_alternative_recur(var_orig_id,
+                              haplotype,
+                              haplotype_alt,
+                              left = True,
+                              dep = 0):
+        bases1 = nextbases(haplotype,
+                           left)
+        bases2 = nextbases(haplotype_alt,
+                           left,
+                           [var_orig_id]) # exclude
+
+        found = False
+        for base1 in bases1:
+            next_haplotype, bp = base1
+            for base2 in bases2:
+                next_haplotype_alt, bp2 = base2
+                if bp != bp2:
+                    continue
+
+                # Todo: implement a routine to handle haplotypes ending with the same coordinate
+                if left:
+                    left1, left2 = int(next_haplotype[0]), int(next_haplotype_alt[0])
+                    if left1 == left2:
+                        continue
+                else:
+                    right1, right2 = int(next_haplotype[-1]), int(next_haplotype_alt[-1])
+                    if right1 == right2:
+                        continue
+
+                found = True
+                get_alternative_recur(var_orig_id,
+                                      next_haplotype,
+                                      next_haplotype_alt,
+                                      left,
+                                      dep + 1)            
+  
+        if dep > 0:
+            if not found:
+                # Convert [left, var_id..., right] into "left-var_id-...-right"
+                def to_haplotype_str(haplotype):
+                    if len(haplotype) <= 2:
+                        haplotype = "%d-%d" % (haplotype[0], haplotype[1])
+                    else:
+                        haplotype = "%d-%s-%d" % (haplotype[0], '-'.join(haplotype[1:-1]), haplotype[-1])
+                    return haplotype
+
+                haplotype, haplotype_alt = to_haplotype_str(haplotype), to_haplotype_str(haplotype_alt)
+                haplotype_alts = haplotype_alts_left if left else haplotype_alts_right
+                # Record the relation in both directions
+                if haplotype not in haplotype_alts:
+                    haplotype_alts[haplotype] = set()
+                haplotype_alts[haplotype].add(haplotype_alt)
+
+                if haplotype_alt not in haplotype_alts:
+                    haplotype_alts[haplotype_alt] = set()
+                haplotype_alts[haplotype_alt].add(haplotype)
+
+    # Search alternative haplotypes in both left and right directions
+    #   only deletions that neither start at position 0 nor reach the end of
+    #   ref_seq are used as starting points
+    for var_i in range(len(Var_list)):
+        _, var_id = Var_list[var_i]
+        var_type, var_pos, var_data = Vars[var_id]
+        if var_pos == 0:
+            continue
+        if var_type != "deletion":
+            continue
+        del_len = int(var_data)
+        if var_pos + del_len >= len(ref_seq):
+            continue
+
+        # Left direction
+        #   the alternative starts as an empty reference span (left = right + 1)
+        #   placed immediately after the deletion
+        get_alternative_recur(var_id,
+                              [var_pos, var_id, var_pos + del_len - 1],
+                              [var_pos + del_len, var_pos + del_len - 1])
+
+        # Right direction    
+        #   the alternative starts as an empty reference span placed
+        #   immediately before the deletion
+        get_alternative_recur(var_id,
+                              [var_pos, var_id, var_pos + del_len - 1],
+                              [var_pos, var_pos - 1],
+                              False)
+
+    # Print alternative haplotypes / Sanity check
+    #   the sequence comparison runs regardless of verbose
+    def print_haplotype_alts(haplotype_alts):
+        for haplotype, haplotype_set in haplotype_alts.items():
+            if verbose: print "\t%s:" % haplotype, haplotype_set
+            haplotype_seq = get_haplotype_seq(haplotype.split('-'))
+            for haplotype_alt in haplotype_set:
+                haplotype_alt_seq = get_haplotype_seq(haplotype_alt.split('-'))
+                assert haplotype_seq == haplotype_alt_seq            
+
+    if verbose: print "number of left haplotypes:", len(haplotype_alts_left)
+    print_haplotype_alts(haplotype_alts_left)
+    if verbose: print "number of right haplotypes:", len(haplotype_alts_right)
+    print_haplotype_alts(haplotype_alts_right)
+
+    return haplotype_alts_left, haplotype_alts_right
+
+
+"""
+Identify ambiguous differences that may account for other alleles,
+  given a list of differences (cmp_list) between a read and a potential allele   
+"""
+def identify_ambigious_diffs(ref_seq,
+                             Vars,
+                             Alts_left,
+                             Alts_right,
+                             Alts_left_list,
+                             Alts_right_list,
+                             cmp_list,
+                             verbose,
+                             debug = False):
+    cmp_left, cmp_right = 0, len(cmp_list) - 1
+    left, right = cmp_list[0][1], cmp_list[-1][1] + cmp_list[-1][2] - 1
+    left_alt_set, right_alt_set = set(), set()
+
+    def get_haplotype_and_seq(cmp_list):
+        ht, seq = [], ""
+        for i in range(len(cmp_list)):
+            cmp_i = cmp_list[i]
+            type, pos, length = cmp_i[:3]
+            if len(cmp_i) <= 3:
+                var_id = ""
+            else:
+                var_id = cmp_i[3]
+            if type == "match":
+                seq += ref_seq[pos:pos+length]
+            elif type == "mismatch":
+                seq += ref_seq[pos]
+            elif type == "insertion":
+                None
+                # seq += data
+            else:
+                assert type == "deletion"
+
+            if var_id != "" and var_id != "unknown":
+                ht.append(var_id)
+        return ht, seq
+
+    # Left direction
+    found = False
+    for i in reversed(range(len(cmp_list))):
+        i_found = False
+        cmp_i = cmp_list[i]
+        type, cur_left, length = cmp_i[:3]
+        var_id = cmp_i[3] if type in ["mismatch", "deletion"] else ""
+
+        # DK - debugging purpose
+        if type in ["mismatch", "deletion", "insertion"]:
+            if not var_id.startswith("hv"):
+                continue
+        
+        if type in ["match", "deletion"]:
+            cur_right = cur_left + length - 1
+        else:
+            cur_right = cur_left
+
+        cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[:i+1])
+        if len(cur_ht) == 0:
+            cur_ht_str = str(left)
+        else:
+            cur_ht_str = "%d-%s" % (left, '-'.join(cur_ht))
+        ht_i = lower_bound(Alts_left_list, cur_right + 1)
+        for ht_j in reversed(range(0, min(ht_i + 1, len(Alts_left_list)))):
+            ht_pos, ht = Alts_left_list[ht_j]
+            if ht_pos < cur_left:
+                break            
+            if ht_pos > cur_right:
+                continue
+            if len(cur_ht) > 0:
+                if ht.find('-'.join(cur_ht)) == -1:
+                    continue
+
+            ht = ht.split('-')[:-1]
+            if len(cur_ht) + 1 == len(ht):
+                ht_pos = int(ht[0])
+            else:
+                var_id2 = ht[len(ht) - len(cur_ht) - 1]
+                ht_type, ht_pos, ht_data = Vars[var_id2]
+                if ht_type == "deletion":
+                    ht_pos = ht_pos + int(ht_data) - 1
+                    
+            if left < ht_pos:
+                continue
+
+            i_found = True
+
+            if debug:
+                print cmp_list[:i+1]
+                print "\t", cur_ht, "vs", Alts_left_list[ht_j], ht_pos
+
+            _, rep_ht = Alts_left_list[ht_j]
+
+            if debug:
+                print "DK1:", cmp_i, cmp_list
+                print "DK2:", rep_ht, Alts_left[rep_ht]
+                print "DK3:", left, right, ht_pos
+
+            for alt_ht_str in Alts_left[rep_ht]:
+                alt_ht = alt_ht_str.split('-')
+                alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1])
+                assert alt_ht_right <= cur_right
+                seq_pos = cur_right - alt_ht_right
+                cur_pos = alt_ht_right
+                part_alt_ht = []
+                alt_ht = alt_ht[1:-1]
+                for var_id_ in reversed(alt_ht):
+                    var_type_, var_pos_, var_data_ = Vars[var_id_]
+                    if var_type_ == "deletion":
+                        del_len = int(var_data_)
+                        var_pos_ = var_pos_ + del_len - 1
+                    assert var_pos_ <= cur_pos
+                    next_seq_pos = seq_pos + (cur_pos - var_pos_)
+                    if next_seq_pos >= len(cur_seq):
+                        break
+                    if var_type_ == "single":
+                        next_seq_pos += 1
+                        next_cur_pos = var_pos_ - 1
+                    elif var_type_ == "deletion":
+                        next_cur_pos = var_pos_ - del_len
+                    else:
+                        assert var_type_ == "insertion"
+                        assert False
+
+                    part_alt_ht.insert(0, var_id_)
+                    if next_seq_pos >= len(cur_seq):
+                        break
+                    seq_pos, cur_pos = next_seq_pos, next_cur_pos
+
+                if len(part_alt_ht) > 0:
+                    seq_left = len(cur_seq) - seq_pos - 1
+                    part_alt_ht_str = ""
+                    if found:
+                        var_id_list = []
+                        for j in range(i + 1, cmp_left):
+                            cmp_j = cmp_list[j]
+                            if cmp_j[0] in ["mismatch", "deletion", "insertion"]:
+                                var_id_ = cmp_j[3]
+                                if var_id_.startswith("hv"):
+                                    var_id_list.append(var_id_)
+                        if len(var_id_list) > 0:
+                            part_alt_ht_str = '-' + '-'.join(var_id_list)
+                    part_alt_ht_str = ("%d-%s" % (cur_pos - seq_left, '-'.join(part_alt_ht))) + part_alt_ht_str
+                    left_alt_set.add(part_alt_ht_str)
+                        
+                if debug:
+                    print "\t\t", cur_left, alt_ht_str
+
+        if i_found:
+            if not found:
+                cmp_left = i + 1
+                left_alt_set.add(cur_ht_str)
+            found = True
+
+    if not found:
+        left_alt_set.add(str(left))
+
+    # Right direction
+    found = False
+    for i in range(0, len(cmp_list)):
+        i_found = False
+        cmp_i = cmp_list[i]
+        type, cur_left, length = cmp_i[:3]
+        var_id = cmp_i[3] if type in ["mismatch", "deletion"] else ""
+
+        # DK - debugging purpose
+        if type in ["mismatch", "deletion", "insertion"]:
+            if not var_id.startswith("hv"):
+                continue
+
+        if type in ["match", "deletion"]:
+            cur_right = cur_left + length - 1
+        else:
+            cur_right = cur_left
+
+        cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[i:])
+        if len(cur_ht) == 0:
+            cur_ht_str = str(right)
+        else:
+            cur_ht_str = "%s-%d" % ('-'.join(cur_ht), right)
+
+        ht_i = lower_bound(Alts_right_list, cur_left)
+        for ht_j in range(ht_i, len(Alts_right_list)):
+            ht_pos, ht = Alts_right_list[ht_j]
+            if ht_pos > cur_right:
+                break
+            if ht_pos < cur_left:
+                continue
+
+            if len(cur_ht) > 0:
+                if ht.find('-'.join(cur_ht)) == -1:
+                    continue
+
+            ht = ht.split('-')[1:]
+            if len(cur_ht) + 1 == len(ht):
+                ht_pos = int(ht[-1])
+            else:
+                var_id2 = ht[len(cur_ht)]
+                _, ht_pos, _ = Vars[var_id2]
+
+            if right > ht_pos:
+                continue
+
+            i_found = True
+            _, rep_ht = Alts_right_list[ht_j]
+
+            if debug:
+                print "DK1:", cmp_i, cmp_list
+                print "DK2:", rep_ht, Alts_right[rep_ht]
+                print "DK3:", left, right, ht_pos
+
+            for alt_ht_str in Alts_right[rep_ht]:
+                alt_ht = alt_ht_str.split('-')
+                alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1])
+                assert cur_left <= alt_ht_left
+                seq_pos = alt_ht_left - cur_left
+                cur_pos = alt_ht_left
+                part_alt_ht = []
+                alt_ht = alt_ht[1:-1]
+                for var_id_ in alt_ht:
+                    var_type_, var_pos_, var_data_ = Vars[var_id_]
+                    assert var_pos_ >= cur_pos
+                    next_seq_pos = seq_pos + (var_pos_ - cur_pos)
+                    if next_seq_pos >= len(cur_seq):
+                        break
+                    
+                    if var_type_ == "single":
+                        next_seq_pos += 1
+                        next_cur_pos = var_pos_ + 1
+                    elif var_type_ == "deletion":
+                        next_cur_pos = var_pos_ + int(var_data_)
+                    else:
+                        assert var_type_ == "insertion"
+                        assert False
+
+                    part_alt_ht.append(var_id_)
+                    if next_seq_pos >= len(cur_seq):
+                        break
+                    seq_pos, cur_pos = next_seq_pos, next_cur_pos
+
+                if len(part_alt_ht) > 0:
+                    seq_left = len(cur_seq) - seq_pos - 1
+                    assert seq_left >= 0
+                    part_alt_ht_str = ""
+                    if found:
+                        var_id_list = []
+                        for j in range(cmp_right + 1, i):
+                            cmp_j = cmp_list[j]
+                            if cmp_j[0] in ["mismatch", "deletion", "insertion"]:
+                                var_id_ = cmp_j[3]
+                                if var_id_.startswith("hv"):
+                                    var_id_list.append(var_id_)
+                        if len(var_id_list) > 0:
+                            part_alt_ht_str = '-'.join(var_id_list) + '-'
+                    part_alt_ht_str += ("%s-%d" % ('-'.join(part_alt_ht), cur_pos + seq_left))
+                    right_alt_set.add(part_alt_ht_str)
+                        
+        if i_found:            
+            if not found:
+                cmp_right = i - 1
+                right_alt_set.add(cur_ht_str)
+            found = True
+
+    if not found:
+        right_alt_set.add(str(right))
+
+    if cmp_right < cmp_left:
+        cmp_left = 0
+        left_alt_set = set([str(left)])
+
+    # Sanity check
+    ht_set_ = set()
+    for ht in left_alt_set:
+        ht = '-'.join(ht.split('-')[1:])
+        if ht == "":
+            continue
+        if ht in ht_set_:
+            print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_
+
+            # DK - debugging purposes
+            print "DK: cmp_list_range: [%d, %d]" % (cmp_left, cmp_right)
+            print "DK: cmp_list:", cmp_list
+            print "DK: left_alt_set:", left_alt_set, "right_alt_set:", right_alt_set
+            
+            assert False
+        ht_set_.add(ht)
+    for ht in right_alt_set:
+        ht = '-'.join(ht.split('-')[:-1])
+        if ht == "":
+            continue
+        if ht in ht_set_:
+            print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_
+            assert False
+        ht_set_.add(ht)
+
+    if debug:
+        print "cmp_list_range: [%d, %d]" % (cmp_left, cmp_right)
+        print "left  alt set:", left_alt_set
+        print "right alt set:", right_alt_set
+    
+    return cmp_left, cmp_right, list(left_alt_set), list(right_alt_set)
+
diff --git a/hisatgenotype_scripts/compare_HLA_Omixon.py b/hisatgenotype_scripts/compare_HLA_Omixon.py
new file mode 100755
index 0000000..ad79c19
--- /dev/null
+++ b/hisatgenotype_scripts/compare_HLA_Omixon.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+
+import sys, os
+from argparse import ArgumentParser, FileType
+use_message = '''
+'''
+
+def compare(hisatgenotype_fname, omixon_fname):
+    """Report how often HISAT-genotype recovers Omixon's HLA calls.
+
+    hisatgenotype_fname -- tab-separated lines of sample, allele and,
+        optionally, abundance and covered variants.  Alleles look like
+        GENE*FIELD:FIELD...; each sample's alleles are kept in file order
+        and the first two per gene are treated as its top two.
+    omixon_fname -- tab-separated lines of sample, allele1, allele2.  Only
+        the first line seen for a given sample and gene is used.
+
+    For each gene in hla_list and each sample found in both files, every
+    Omixon allele is looked up among the HISAT-genotype alleles, comparing
+    the first two colon-separated fields as plain strings.  Per gene, two
+    tallies of samples with 0, 1 or 2 recovered Omixon alleles are written
+    to stderr: "Top two" (found among the first two HISAT-genotype alleles)
+    and "Top ten" (found anywhere in the HISAT-genotype list).  Blank input
+    lines are skipped.  Nothing is returned.
+    """
+
+    hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]
+
+    # Read HISAT-genotype predicted HLA alleles for the CAAPA genomes
+    hisat_hla = {}
+    with open(hisatgenotype_fname) as hisat_file:
+        for line in hisat_file:
+            line = line.strip()
+            if line == "":
+                continue
+            fields = line.split('\t')
+            if len(fields) == 2:
+                sample, allele = fields
+                abundance = 0.0
+            elif len(fields) == 3:
+                sample, allele, abundance = fields
+            else:
+                # The fourth column (covered variants) is accepted but unused
+                assert len(fields) == 4
+                sample, allele, abundance = fields[:3]
+            abundance = float(abundance)
+            gene, allele = allele.split('*')
+            hisat_hla.setdefault(sample, {}).setdefault(gene, []).append([allele, abundance])
+
+    # Read Omixon predicted HLA alleles for the CAAPA genomes
+    omixon_hla = {}
+    with open(omixon_fname) as omixon_file:
+        for line in omixon_file:
+            line = line.strip()
+            if line == "":
+                continue
+            sample, allele1, allele2 = line.split('\t')
+            gene1, allele1 = allele1.split('*')
+            gene2, allele2 = allele2.split('*')
+            assert gene1 == gene2
+
+            omixon_gene = omixon_hla.setdefault(sample, {}).setdefault(gene1, [])
+            # Keep only the first allele pair seen for this sample and gene
+            if len(omixon_gene) >= 2:
+                continue
+            omixon_gene.extend([allele1, allele2])
+
+    # Tally, gene by gene, how many Omixon alleles HISAT-genotype recovered
+    for gene in hla_list:
+        count, count_10 = [0, 0, 0], [0, 0, 0]
+        print >> sys.stderr, gene
+        for sample in omixon_hla.keys():
+            if sample not in hisat_hla:
+                continue
+            hisat_sample = hisat_hla[sample]
+            omixon_sample = omixon_hla[sample]
+            if gene not in omixon_sample or gene not in hisat_sample:
+                continue
+            hisat_gene = hisat_sample[gene]
+            omixon_gene = omixon_sample[gene]
+            num_match, num_match_10 = 0, 0
+            for omixon_allele in omixon_gene:
+                omixon_fields = omixon_allele.split(':')[:2]
+                for hisat_allele_idx in range(len(hisat_gene)):
+                    hisat_fields = hisat_gene[hisat_allele_idx][0].split(':')[:2]
+                    # Compare only the leading fields that both alleles have
+                    # (at most two), as plain strings
+                    num_fields = min(len(omixon_fields), len(hisat_fields))
+                    if omixon_fields[:num_fields] != hisat_fields[:num_fields]:
+                        continue
+                    # Alleles are scanned in file order, so this is the first
+                    # HISAT-genotype allele that matches the Omixon one
+                    if hisat_allele_idx < 2:
+                        num_match += 1
+                    num_match_10 += 1
+                    break
+
+            # Each sample adds one entry to each tally; its Omixon list holds
+            # two alleles, so both match counts are 0, 1 or 2
+            assert num_match < len(count)
+            count[num_match] += 1
+            count_10[num_match_10] += 1
+
+        # Nothing to report for a gene with no sample present in both files
+        if sum(count) <= 0:
+            continue
+
+        # Percentage = recovered alleles out of two per tallied sample
+        print >> sys.stderr, "\tTop two\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count[0], count[1], count[2], (count[1] + count[2] * 2) / float(sum(count) * 2) * 100.0)
+        print >> sys.stderr, "\tTop ten\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count_10[0], count_10[1], count_10[2], (count_10[1] + count_10[2] * 2) / float(sum(count_10) * 2) * 100.0)
+        
+
+if __name__ == "__main__":
+    parser = ArgumentParser(
+        description='Compare HISAT-genotype and Omixon HLA typing results')
+    parser.add_argument('hisatgenotype_fname',
+                        nargs='?',
+                        type=str,
+                        help='hisatgenotype file name (e.g. cp_hla.txt)')
+    parser.add_argument('omixon_fname',
+                        nargs='?',
+                        type=str,
+                        help='omixon file name (e.g. omixon_caapa_hla.txt)')
+    args = parser.parse_args()
+    # Both positionals are declared optional (nargs='?'), so a missing one is None
+    if args.hisatgenotype_fname is None or args.omixon_fname is None:
+        parser.error("both hisatgenotype_fname and omixon_fname are required")
+    compare(args.hisatgenotype_fname, args.omixon_fname)
+
diff --git a/hisatgenotype_scripts/extract_Omixon_HLA.py b/hisatgenotype_scripts/extract_Omixon_HLA.py
new file mode 100755
index 0000000..23aaa04
--- /dev/null
+++ b/hisatgenotype_scripts/extract_Omixon_HLA.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2016, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, glob
+
+if __name__ == '__main__':
+    # Checks the HLA alleles called by Omixon (HLAresults/*.gz) against the
+    # allele names in the IMGT/HLA FASTA files (IMGTHLA/fasta/), both relative
+    # to the current directory.  Every retained call is printed to stdout as
+    # "genome<TAB>allele1<TAB>allele2"; the summaries go to stderr.
+    hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]
+
+    def read_fasta_alleles(suffix):
+        # Map gene -> set of allele names, taking the second token of each
+        # ">" header line in IMGTHLA/fasta/<gene>_<suffix>.fasta
+        alleles_by_gene = {}
+        for hla in hla_list:
+            with open("IMGTHLA/fasta/%s_%s.fasta" % (hla, suffix)) as fasta_file:
+                for line in fasta_file:
+                    if line.startswith(">"):
+                        allele = line.split()[1]
+                        alleles_by_gene.setdefault(allele.split('*')[0], set()).add(allele)
+        return alleles_by_gene
+
+    def update_allele(allele):
+        # Substitute two specific DRB1 names; other names pass through
+        if allele == "DRB1*08:01:03":
+            allele = "DRB1*08:01:01"
+        elif allele == "DRB1*11:11:02":
+            allele = "DRB1*11:11:01"
+        return allele
+
+    # gen_alleles is not consulted below; the *_gen.fasta files are still read
+    # so that a missing file fails exactly as it did before
+    gen_alleles = read_fasta_alleles("gen")
+    nuc_alleles = read_fasta_alleles("nuc")
+
+    print >> sys.stderr, "IMGTHLA"
+    for gene, alleles in nuc_alleles.items():
+        print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles))
+
+    # Read HLA alleles from Omixon data
+    omixon_alleles = {}
+    with open(os.devnull, 'w') as devnull:
+        for fname in glob.glob("HLAresults/*.gz"):
+            genome = fname.split("/")[1].split("_HLA")[0]
+            proc = subprocess.Popen(["gzip", "-cd", fname], stdout=subprocess.PIPE, stderr=devnull)
+            allele_count = {}
+            prev_allele2 = ""
+            for line in proc.stdout:
+                if not line.startswith("HLA"):
+                    continue
+
+                fields = line.strip().split()
+                if len(fields) > 6:
+                    allele1, allele2 = fields[0][4:-1], fields[6][4:-1]
+                else:
+                    allele1 = allele2 = fields[0][4:-1]
+
+                gene = allele1.split("*")[0]
+                if gene not in hla_list:
+                    continue
+                if gene not in omixon_alleles:
+                    omixon_alleles[gene] = set()
+                # Keep at most the first ten calls per gene from each file
+                if allele_count.setdefault(gene, 0) >= 10:
+                    continue
+
+                # An empty second allele repeats the previous call's second allele
+                if allele2 == "":
+                    allele2 = prev_allele2
+                assert allele1 != "" and allele2 != ""
+
+                allele1, allele2 = update_allele(allele1), update_allele(allele2)
+                allele_count[gene] += 1
+                omixon_alleles[gene].add(allele1)
+                omixon_alleles[gene].add(allele2)
+                prev_allele2 = allele2
+
+                print "%s\t%s\t%s" % (genome, allele1, allele2)
+
+            # Reap gzip so finished children do not accumulate across files
+            proc.stdout.close()
+            proc.wait()
+
+    print >> sys.stderr, "Omixon"
+    for gene, alleles in omixon_alleles.items():
+        print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles))
+        # A gene with Omixon calls but no IMGT/HLA entry has no known alleles
+        known_alleles = nuc_alleles.get(gene, set())
+        for allele in alleles:
+            # Present if it equals, or is a substring of, a known allele name
+            if allele not in known_alleles and not any(allele in known for known in known_alleles):
+                print >> sys.stderr, "\t\t%s is missing" % allele
+
+            
diff --git a/hisatgenotype_scripts/get_haplotype_ILMN_StrandSeq.py b/hisatgenotype_scripts/get_haplotype_ILMN_StrandSeq.py
new file mode 100755
index 0000000..ff67d42
--- /dev/null
+++ b/hisatgenotype_scripts/get_haplotype_ILMN_StrandSeq.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2016, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, re
+import inspect
+import random
+import glob
+from argparse import ArgumentParser, FileType
+
+
+"""
+"""
+def get_haplotypes(sra_run_info,
+                   alignment):
+    """Print, per sequencing run, the read start positions on each strand.
+
+    sra_run_info -- tab-separated file; column 1 is the run ID, column 5 the
+        genome.  A run ID may occur only once.
+    alignment -- SAM text; the run ID is the read name up to the first '.'.
+        A run ends where the run ID changes between consecutive mapped reads.
+
+    A run is printed (genome, run, forward positions, reverse positions, all
+    SAM POS minus one) when it has reads on both strands and more than one
+    distinct position on at least one strand.
+    """
+    run_to_genome = {}
+    with open(sra_run_info) as run_info_file:
+        for line in run_info_file:
+            fields = line.strip().split('\t')
+            genome, run = fields[4], fields[0]
+            assert run not in run_to_genome
+            run_to_genome[run] = genome
+
+    def report(run, plus, minus):
+        if len(plus) > 0 and len(minus) > 0 and (len(plus) > 1 or len(minus) > 1):
+            print run_to_genome[run], run, plus, minus
+
+    prev_run = ""
+    plus, minus = set(), set()
+    with open(alignment) as alignment_file:
+        for line in alignment_file:
+            if line.startswith('@'):    # SAM header line, not a read
+                continue
+            read_id, flag, _, pos = line.strip().split('\t')[:4]
+            flag = int(flag)
+            if flag & 0x4 != 0:         # unmapped read
+                continue
+            run = read_id.split('.')[0]
+            # Flush the finished run BEFORE recording this read; otherwise the
+            # first read of a new run is credited to the old run and then lost
+            if prev_run != "" and prev_run != run:
+                report(prev_run, plus, minus)
+                plus, minus = set(), set()
+            prev_run = run
+            (plus if flag & 0x10 == 0 else minus).add(int(pos) - 1)
+    if prev_run != "":                  # last run; "" means no mapped read at all
+        report(prev_run, plus, minus)
+
+
+"""
+"""
+if __name__ == '__main__':
+    parser = ArgumentParser(
+        description='get haplotypes from StrandSeq reads')
+    parser.add_argument("sra_run_info",
+                        nargs='?',
+                        type=str,
+                        help="SRA Run Info filename")
+    parser.add_argument("alignment",
+                        nargs='?',
+                        type=str,
+                        help="SAM file name")
+    args = parser.parse_args()
+    # Both positionals are declared optional (nargs='?'), so a missing one is None
+    if args.sra_run_info is None or args.alignment is None:
+        parser.error("both sra_run_info and alignment are required")
+    get_haplotypes(args.sra_run_info, args.alignment)
+
diff --git a/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py b/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py
new file mode 100755
index 0000000..ad45f8d
--- /dev/null
+++ b/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2015, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, re
+import inspect
+import random
+from argparse import ArgumentParser, FileType
+
+# Gold standard (experimentally verified, a lot of literature, ...): genome -> gene -> its two alleles, each given as two colon-separated fields
+gold_allele_info = {
+    "NA12877" : {"A" : ["03:01", "02:01"], "B" : ["15:01", "44:02"], "C" : ["05:01", "03:04"], "DQA1" : ["03:03", "03:01"], "DQB1" : ["03:02", "03:01"], "DRB1" : ["04:03", "04:03"]},
+    "NA12878" : {"A" : ["01:01", "11:01"], "B" : ["08:01", "56:01"], "C" : ["01:02", "07:01"], "DQA1" : ["05:01", "01:01"], "DQB1" : ["02:01", "05:01"], "DRB1" : ["03:01", "01:01"]},
+    "NA12879" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]},
+    "NA12880" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]},
+    "NA12881" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["05:01", "01:02"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["04:03", "01:01"]},
+    "NA12882" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["04:03", "01:01"]},
+    "NA12883" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["01:02", "05:01"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["01:01", "04:03"]},
+    "NA12884" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:03"]},
+    "NA12885" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["05:01", "07:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["03:01", "02:01"], "DRB1" : ["03:01", "04:03"]},
+    "NA12886" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["07:01", "05:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["02:01", "03:01"], "DRB1" : ["03:01", "04:03"]},
+    "NA12887" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]},
+    "NA12888" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["07:01", "03:04"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]},
+    "NA12889" : {"A" : ["03:01", "03:01"], "B" : ["07:02", "44:02"], "C" : ["05:01", "07:02"], "DQA1" : ["03:03", "01:02"], "DQB1" : ["03:01", "06:02"], "DRB1" : ["15:01", "04:03"]},
+    "NA12890" : {"A" : ["03:01", "02:01"], "B" : ["44:03", "15:01"], "C" : ["16:01", "03:04"], "DQA1" : ["03:01", "02:01"], "DQB1" : ["03:02", "02:02"], "DRB1" : ["04:03", "07:01"]},
+    "NA12891" : {"A" : ["24:02", "01:01"], "B" : ["08:01", "07:02"], "C" : ["07:02", "07:01"], "DQA1" : ["05:01", "01:02"], "DQB1" : ["06:02", "02:01"], "DRB1" : ["03:01", "15:01"]},
+    "NA12892" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "04:01"], "DQA1" : ["01:01", "01:01"], "DQB1" : ["05:01", "05:01"], "DRB1" : ["01:01", "01:02"]},
+    "NA12893" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:03"]}
+    }
+
+# CEPH pedigree (17 family members): per member the gender and, where given, father, mother, spouse and children
+pedigree = {
+    "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]},
+    "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]},
+    "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]},
+
+    "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]},
+    "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]},
+    "NA12878" : {"gender" : "F", "father" : "NA12891", "mother" : "NA12892", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]},
+
+    "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
+    "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
+    }
+
+
+"""
+"""
+def test_HLA_genotyping(reference_type,
+                        hla_list,
+                        aligners,
+                        query_genomes,
+                        exclude_allele_list,
+                        num_mismatch,
+                        verbose):
+    """Run hisat2_test_HLA_genotyping.py per genome and gene and score it.
+
+    Alleles on the script's "SingleModel" output lines, cut to two fields,
+    are checked against the two gold alleles of gold_allele_info; the final
+    "successes/tests (percent)" line is written to stderr.
+    """
+    # Directory of this script, where the test script is expected to live
+    ex_path = os.path.dirname(os.path.realpath(inspect.getsourcefile(test_HLA_genotyping)))
+    cmd_aligners = ['.'.join(aligner) for aligner in aligners]
+    test_hla_script = os.path.join(ex_path, "hisat2_test_HLA_genotyping.py")
+
+    if not os.path.exists("illumina/HLA"):
+        print >> sys.stderr, "Error: illumina/HLA data is needed (please send an email to infphilo at gmail.com for getting the data)"
+        sys.exit(1)
+    num_test, num_success = 0, 0
+    for genome in sorted(gold_allele_info.keys()):
+        if genome not in query_genomes:
+            continue
+        genes = gold_allele_info[genome]
+        read_fname_1, read_fname_2 = "illumina/HLA/%s.fished_1.fq" % genome, "illumina/HLA/%s.fished_2.fq" % genome
+        if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2):
+            continue
+        print >> sys.stderr, genome
+        for gene in sorted(genes.keys()):
+            if gene not in hla_list:
+                continue
+            alleles = genes[gene]
+            print >> sys.stderr, "\t%s - %s" % (gene, ' / '.join(alleles))
+            test_hla_cmd = [test_hla_script,
+                            "--reference-type", reference_type,
+                            "--hla-list", gene,
+                            "--aligner-list", ','.join(cmd_aligners),
+                            "--reads", "%s,%s" % (read_fname_1, read_fname_2),
+                            "--best-alleles",
+                            "--exclude-allele-list", ','.join(exclude_allele_list),
+                            "--num-mismatch", str(num_mismatch)]
+            if verbose:
+                print >> sys.stderr, ' '.join(test_hla_cmd)
+            with open(os.devnull, 'w') as devnull:
+                proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=devnull)
+                num_test += 2
+                test_alleles = set()
+                for line in proc.stdout:
+                    print "\t\t", line,
+                    fields = line.split()
+                    if len(fields) < 2 or fields[0] != "SingleModel":  # blank/short line or other model
+                        continue
+                    test_alleles.add(':'.join(fields[1].split('*')[1].split(':')[:2]))
+                proc.communicate()
+            num_success += sum(1 for allele in alleles if allele in test_alleles)
+    # No test may have run (e.g. no requested genome has read files): avoid dividing by zero
+    success_rate = num_success * 100.0 / num_test if num_test > 0 else 0.0
+    print >> sys.stderr, "%d/%d (%.2f%%)" % (num_success, num_test, success_rate)
+
+
+"""
+"""
+if __name__ == '__main__':
+    parser = ArgumentParser(
+        description='test HLA genotyping for Platinum Genomes')
+    parser.add_argument("--reference-type",
+                        dest="reference_type",
+                        type=str,
+                        default="gene",
+                        help="Reference type: gene, chromosome, and genome (default: gene)")
+    parser.add_argument("--hla-list",
+                        dest="hla_list",
+                        type=str,
+                        default="A,B,C,DQA1,DQB1,DRB1",
+                        help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)")
+    parser.add_argument("--aligner-list",
+                        dest="aligners",
+                        type=str,
+                        default="hisat2.graph",
+                        help="A comma-separated list of aligners (default: hisat2.graph)")
+    # Sorted so that the default and the help text are stable between runs
+    genomes_default = ','.join(sorted(gold_allele_info.keys()))
+    parser.add_argument("--genome-list",
+                        dest="genome_list",
+                        type=str,
+                        default=genomes_default,
+                        help="A comma-separated list of genomes (default: %s)" % genomes_default)
+    parser.add_argument("--exclude-allele-list",
+                        dest="exclude_allele_list",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of alleles to be excluded")
+    parser.add_argument("--num-mismatch",
+                        dest="num_mismatch",
+                        type=int,
+                        default=0,
+                        help="Maximum number of mismatches per read alignment to be considered (default: 0)")
+    parser.add_argument('-v', '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        help='also print some statistics to stderr')
+
+    args = parser.parse_args()
+
+    if args.reference_type not in ["gene", "chromosome", "genome"]:
+        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
+        sys.exit(1)
+    args.hla_list = args.hla_list.split(',')
+    if args.aligners == "":
+        print >> sys.stderr, "Error: --aligner-list must be non-empty."
+        sys.exit(1)
+    args.aligners = [aligner.split('.') for aligner in args.aligners.split(',')]
+    args.genome_list = args.genome_list.split(',')
+    # Drop empty items so that the default "" yields [] rather than ['']
+    args.exclude_allele_list = [allele for allele in args.exclude_allele_list.split(',') if allele != ""]
+
+    test_HLA_genotyping(args.reference_type,
+                        args.hla_list,
+                        args.aligners,
+                        args.genome_list,
+                        args.exclude_allele_list,
+                        args.num_mismatch,
+                        args.verbose)
diff --git a/hisatgenotype_scripts/hisatgenotype_locus_samples.py b/hisatgenotype_scripts/hisatgenotype_locus_samples.py
new file mode 100755
index 0000000..8cb08e3
--- /dev/null
+++ b/hisatgenotype_scripts/hisatgenotype_locus_samples.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2015, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+import sys, os, subprocess, re, threading
+import inspect
+import random
+import glob
+from argparse import ArgumentParser, FileType
+import hisatgenotype_typing_common as typing_common
+
+
+"""
+"""
+class myThread(threading.Thread):
+    def __init__(self,
+                 lock, 
+                 paths,
+                 reference_type,
+                 region_list,
+                 num_editdist,
+                 max_sample,
+                 assembly,
+                 out_dir,
+                 verbose):
+        threading.Thread.__init__(self)
+        self.lock = lock
+        self.paths = paths
+        self.reference_type = reference_type
+        self.region_list = region_list
+        self.num_editdist = num_editdist
+        self.max_sample = max_sample
+        self.assembly = assembly
+        self.out_dir = out_dir
+        self.verbose = verbose
+
+    def run(self):
+        global work_idx
+        while True:
+            self.lock.acquire()
+            my_work_idx = work_idx
+            work_idx += 1
+            self.lock.release()
+            if my_work_idx >= len(self.paths) or \
+               my_work_idx >= self.max_sample:
+                return
+            worker(self.lock,
+                   self.paths[my_work_idx],
+                   self.reference_type,
+                   self.region_list,
+                   self.num_editdist,
+                   self.assembly,
+                   self.out_dir,
+                   self.verbose)
+
+            
+"""
+"""
+work_idx = 0  # next unclaimed index into the shared paths list; read and incremented under the lock in myThread.run()
+def worker(lock,
+           path,
+           reference_type,
+           region_list,
+           num_editdist,
+           assembly,
+           out_dir,
+           verbose):
+    fq_name = path.split('/')[-1]
+    read_dir = '/'.join(path.split('/')[:-1])
+    genome = fq_name.split('.')[0]
+    if not fq_name.endswith("extracted.1.fq.gz"):
+        return
+    read_basename = fq_name[:fq_name.find("extracted.1.fq.gz")]
+    read_fname_1, read_fname_2 = "%s/%sextracted.1.fq.gz" % \
+                                 (read_dir, read_basename), "%s/%sextracted.2.fq.gz" % (read_dir, read_basename)
+
+    if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2):
+        return
+    lock.acquire()
+    print >> sys.stderr, genome
+    lock.release()
+
+    for family, loci in region_list.items():
+        test_hla_cmd = ["hisatgenotype_locus.py",
+                        "--base", family]
+        if len(loci) > 0:
+            test_hla_cmd += ["--locus", ','.join(loci)]
+        test_hla_cmd += ["--num-editdist", str(num_editdist)]
+        test_hla_cmd += ["-1", read_fname_1, "-2", read_fname_2]
+        test_hla_cmd += ["--assembly-base"]
+        if out_dir != "":
+            test_hla_cmd += ["%s/%s" % (out_dir, genome)]
+        else:
+            test_hla_cmd += [genome]
+        if assembly:
+            test_hla_cmd += ["--assembly"]
+
+        if verbose:
+            lock.acquire()
+            print >> sys.stderr, ' '.join(test_hla_cmd)
+            lock.release()
+
+        proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        test_alleles = set()
+        output_list = []
+        for line in proc.stdout:
+            line = line.strip()
+            if line.find("abundance") == -1:
+                continue
+
+            rank, _, allele, _, abundance = line.split()        
+            output_list.append([allele, abundance[:-2]])
+
+    lock.acquire()
+    for output in output_list:
+        allele, abundance = output
+        print >> sys.stdout, "%s\t%s\t%s" % (genome, allele, abundance)
+    sys.stdout.flush()
+    lock.release()
+
+
+"""
+"""
+def genotyping(read_dir,
+               reference_type,
+               region_list,
+               num_editdist,
+               nthreads,
+               max_sample,
+               assembly,
+               out_dir,
+               verbose):
+    for database_name in region_list:
+        # Extract variants, backbone sequence, and other sequeces
+        typing_common.extract_database_if_not_exists(database_name,
+                                                     [])            # locus_list
+        # Build HISAT2's graph index
+        typing_common.build_index_if_not_exists(database_name,
+                                                "hisat2",
+                                                "graph",
+                                                1,            # threads
+                                                verbose)
+    
+    if not os.path.exists(read_dir):
+        print >> sys.stderr, "Error: %s does not exist." % read_dir
+        sys.exit(1)
+
+    if out_dir != "" and not os.path.exists(out_dir):
+        os.mkdir(out_dir)        
+
+    # fastq files
+    fq_fnames = glob.glob("%s/*.extracted.1.fq.gz" % read_dir)
+
+    lock = threading.Lock()
+    threads = []
+    for t in range(nthreads):
+        thread = myThread(lock,
+                          fq_fnames,
+                          reference_type,
+                          region_list,
+                          num_editdist,
+                          max_sample,
+                          assembly,
+                          out_dir,
+                          verbose)
+        thread.start()
+        threads.append(thread)
+
+    for thread in threads:
+        thread.join()
+
+
+"""
+"""
+if __name__ == '__main__':
+    parser = ArgumentParser(
+        description='genotyping on many samples')
+    parser.add_argument("--reference-type",
+                        dest="reference_type",
+                        type=str,
+                        default="gene",
+                        help="Reference type: gene, chromosome, and genome (default: gene)")
+    parser.add_argument("--region-list",
+                        dest="region_list",
+                        type=str,
+                        default="",
+                        help="A comma-separated list of regions (default: empty)")
+    parser.add_argument('--read-dir',
+                        dest="read_dir",
+                        type=str,
+                        default="",
+                        help='read directory (e.g. read_input)')
+    parser.add_argument("--num-editdist",
+                        dest="num_editdist",
+                        type=int,
+                        default=2,
+                        help="Maximum number of mismatches per read alignment to be considered (default: 2)")
+    parser.add_argument("-p", "--threads",
+                        dest="threads",
+                        type=int,
+                        default=1,
+                        help="Number of threads")
+    parser.add_argument('--assembly',
+                        dest='assembly',
+                        action='store_true',
+                        help='Perform assembly')
+    parser.add_argument("--max-sample",
+                        dest="max_sample",
+                        type=int,
+                        default=sys.maxint,
+                        help="Number of samples to be analyzed (default: sys.maxint)")
+    parser.add_argument("--out-dir",
+                        dest="out_dir",
+                        type=str,
+                        default="",
+                        help='Output directory (default: (empty))')
+    parser.add_argument('-v', '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        help='also print some statistics to stderr')
+
+    args = parser.parse_args()
+
+    if args.read_dir == "":
+        print >> sys.stderr, "Error: please specify --read-dir."
+        sys.exit(1)
+
+    if not args.reference_type in ["gene", "chromosome", "genome"]:
+        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
+        sys.exit(1)
+
+    region_list = {}
+    if args.region_list != "":
+        for region in args.region_list.split(','):
+            region = region.split('.')
+            if len(region) < 1 or len(region) > 2:
+                print >> sys.stderr, "Error: --region-list is incorrectly formatted."
+                sys.exit(1)
+                
+            family = region[0].lower()
+            if len(region) == 2:
+                locus_name = region[1].upper()
+            if family not in region_list:
+                region_list[family] = set()
+            if len(region) == 2:
+                region_list[family].add(locus_name)
+
+    genotyping(args.read_dir,
+               args.reference_type,
+               region_list,
+               args.num_editdist,
+               args.threads,
+               args.max_sample,
+               args.assembly,
+               args.out_dir,
+               args.verbose)
+
diff --git a/old_hisat2_test_HLA_genotyping.py b/old_hisat2_test_HLA_genotyping.py
deleted file mode 100755
index 3bdd6cf..0000000
--- a/old_hisat2_test_HLA_genotyping.py
+++ /dev/null
@@ -1,1341 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2015, Daehwan Kim <infphilo at gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import inspect, random
-import math
-from argparse import ArgumentParser, FileType
-
-
-"""
-"""
-def test_HLA_genotyping(reference_type,
-                        hla_list,
-                        partial,
-                        aligners,
-                        read_fname,
-                        alignment_fname,
-                        threads,
-                        simulate_interval,
-                        enable_coverage,
-                        best_alleles,
-                        exclude_allele_list,
-                        num_mismatch,
-                        verbose,
-                        daehwan_debug):
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(test_HLA_genotyping))
-    ex_path = os.path.dirname(curr_script)
-
-    # Clone a git repository, IMGTHLA
-    if not os.path.exists("IMGTHLA"):
-        os.system("git clone https://github.com/jrob119/IMGTHLA.git")
-
-    simulation = (read_fname == [] and alignment_fname == "")
-
-    def check_files(fnames):
-        for fname in fnames:
-            if not os.path.exists(fname):
-                return False
-        return True
-
-    # Download HISAT2 index
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-    if not check_files(HISAT2_fnames):
-        os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz")
-        hisat2_inspect = os.path.join(ex_path, "hisat2-inspect")
-        os.system("%s grch38/genome > genome.fa" % hisat2_inspect)
-        os.system("samtools faidx genome.fa")
-
-    # Check if the pre-existing files (hla*) are compatible with the current parameter setting
-    if os.path.exists("hla.ref"):
-        left = 0
-        HLA_genes = set()
-        BACKBONE = False
-        for line in open("hla.ref"):
-            HLA_name = line.strip().split()[0]
-            if HLA_name.find("BACKBONE") != -1:
-                BACKBONE = True
-            HLA_gene = HLA_name.split('*')[0]
-            HLA_genes.add(HLA_gene)
-        delete_hla_files = False
-        if reference_type == "gene":
-            if not BACKBONE:
-                delete_hla_files = True
-        elif reference_type in ["chromosome", "genome"]:
-            if BACKBONE:
-                delete_hla_files = True
-        else:
-            assert False
-        if not set(hla_list).issubset(HLA_genes):
-            delete_hla_files = True
-        if delete_hla_files:
-            os.system("rm hla*")
-    
-    # Extract HLA variants, backbone sequence, and other sequeces
-    HLA_fnames = ["hla_backbone.fa",
-                  "hla_sequences.fa",
-                  "hla.ref",
-                  "hla.snp",
-                  "hla.haplotype",
-                  "hla.link"]
-
-    if not check_files(HLA_fnames):
-        extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py")
-        extract_cmd = [extract_hla_script,
-                       "--reference-type", reference_type,
-                       "--hla-list", ','.join(hla_list)]
-        if partial:
-            extract_cmd += ["--partial"]
-        extract_cmd += ["--inter-gap", "30",
-                        "--intra-gap", "50"]
-        if verbose:
-            print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-        proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()
-        if not check_files(HLA_fnames):
-            print >> sys.stderr, "Error: extract_HLA_vars failed!"
-            sys.exit(1)
-
-    # Build HISAT2 graph indexes based on the above information
-    HLA_hisat2_graph_index_fnames = ["hla.graph.%d.ht2" % (i+1) for i in range(8)]
-    if not check_files(HLA_hisat2_graph_index_fnames):
-        hisat2_build = os.path.join(ex_path, "hisat2-build")
-        build_cmd = [hisat2_build,
-                     "-p", str(threads),
-                     "--snp", "hla.snp",
-                     "--haplotype", "hla.haplotype",
-                     "hla_backbone.fa",
-                     "hla.graph"]
-        if verbose:
-            print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
-        proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()        
-        if not check_files(HLA_hisat2_graph_index_fnames):
-            print >> sys.stderr, "Error: indexing HLA failed!  Perhaps, you may have forgotten to build hisat2 executables?"
-            sys.exit(1)
-
-    # Read partial alleles from hla.data (temporary)
-    partial_alleles = set()
-    for line in open("IMGTHLA/hla.dat"):
-        if not line.startswith("DE"):
-            continue
-        allele_name = line.split()[1][4:-1]
-        gene = allele_name.split('*')[0]
-        if line.find("partial") != -1:
-            partial_alleles.add(allele_name)
-
-    # Read HLA alleles (names and sequences)
-    refHLAs, refHLA_loci = {}, {}
-    for line in open("hla.ref"):
-        HLA_name, chr, left, right, length, exon_str = line.strip().split()
-        HLA_gene = HLA_name.split('*')[0]
-        assert not HLA_gene in refHLAs
-        refHLAs[HLA_gene] = HLA_name
-        left, right = int(left), int(right)
-        exons = []
-        for exon in exon_str.split(','):
-            exon_left, exon_right = exon.split('-')
-            exons.append([int(exon_left), int(exon_right)])
-        refHLA_loci[HLA_gene] = [HLA_name, chr, left, right, exons]
-    HLAs = {}
-    def read_HLA_alleles(fname, HLAs):
-        for line in open(fname):
-            if line.startswith(">"):
-                HLA_name = line.strip().split()[0][1:]
-                HLA_gene = HLA_name.split('*')[0]
-                if not HLA_gene in HLAs:
-                    HLAs[HLA_gene] = {}
-                if not HLA_name in HLAs[HLA_gene]:
-                    HLAs[HLA_gene][HLA_name] = ""
-            else:
-                HLAs[HLA_gene][HLA_name] += line.strip()
-        return HLAs
-    if reference_type == "gene":
-        read_HLA_alleles("hla_backbone.fa", HLAs)
-    read_HLA_alleles("hla_sequences.fa", HLAs)
-
-    # HLA gene alleles
-    HLA_names = {}
-    for HLA_gene, data in HLAs.items():
-        HLA_names[HLA_gene] = list(data.keys())
-
-    # HLA gene allele lengths
-    HLA_lengths = {}
-    for HLA_gene, HLA_alleles in HLAs.items():
-        HLA_lengths[HLA_gene] = {}
-        for allele_name, seq in HLA_alleles.items():
-            HLA_lengths[HLA_gene][allele_name] = len(seq)
-
-    # Read HLA variants, and link information
-    Vars, Var_list = {}, {}
-    for line in open("hla.snp"):
-        var_id, var_type, allele, pos, data = line.strip().split('\t')
-        pos = int(pos)
-        if reference_type != "gene":
-            allele, dist = None, 0
-            for tmp_gene, values in refHLA_loci.items():
-                allele_name, chr, left, right, exons = values
-                if allele == None:
-                    allele = allele_name
-                    dist = abs(pos - left)
-                else:
-                    if dist > abs(pos - left):
-                        allele = allele_name
-                        dist = abs(pos - left)
-            
-        gene = allele.split('*')[0]
-        if not gene in Vars:
-            Vars[gene] = {}
-            assert not gene in Var_list
-            Var_list[gene] = []
-            
-        assert not var_id in Vars[gene]
-        left = 0
-        if reference_type != "gene":
-            _, _, left, _, _ = refHLA_loci[gene]
-        Vars[gene][var_id] = [var_type, pos - left, data]
-        Var_list[gene].append([pos - left, var_id])
-        
-    for gene, in_var_list in Var_list.items():
-        Var_list[gene] = sorted(in_var_list)
-    def lower_bound(Var_list, pos):
-        low, high = 0, len(Var_list)
-        while low < high:
-            m = (low + high) / 2
-            m_pos = Var_list[m][0]
-            if m_pos < pos:
-                low = m + 1
-            elif m_pos > pos:
-                high = m
-            else:
-                assert m_pos == pos
-                while m > 0:
-                    if Var_list[m-1][0] < pos:
-                        break
-                    m -= 1
-                return m
-        return low        
-            
-    Links = {}
-    for line in open("hla.link"):
-        var_id, alleles = line.strip().split('\t')
-        alleles = alleles.split()
-        assert not var_id in Links
-        Links[var_id] = alleles
-
-    # Scoring schemes from Sangtae Kim (Illumina)'s implementation
-    max_qual_value = 100
-    match_score, mismatch_score = [0] * max_qual_value, [0] * max_qual_value
-    for qual in range(max_qual_value):
-        error_rate = 0.1 ** (qual / 10.0)
-        match_score[qual] = math.log(1.000000000001 - error_rate);
-        mismatch_score[qual] = math.log(error_rate / 3.0);
-        
-    # Test HLA genotyping
-    test_list = []
-    if simulation:
-        basic_test, pair_test = True, False
-        if daehwan_debug:
-            if "basic_test" in daehwan_debug:
-                basic_test, pair_test = True, False
-            else:
-                basic_test, pair_test = False, True
-
-        test_passed = {}
-        test_list = []
-        genes = list(set(hla_list) & set(HLA_names.keys()))
-        if basic_test:
-            for gene in genes:
-                HLA_gene_alleles = HLA_names[gene]
-                for HLA_name in HLA_gene_alleles:
-                    if HLA_name.find("BACKBONE") != -1:
-                        continue
-                    test_list.append([[HLA_name]])
-        if pair_test:
-            test_size = 500
-            allele_count = 2
-            for test_i in range(test_size):
-                test_pairs = []
-                for gene in genes:
-                    HLA_gene_alleles = []
-                    for allele in HLA_names[gene]:
-                        if allele.find("BACKBONE") != -1:
-                            continue
-                        HLA_gene_alleles.append(allele)
-                    nums = [i for i in range(len(HLA_gene_alleles))]
-                    random.shuffle(nums)
-                    test_pairs.append(sorted([HLA_gene_alleles[nums[i]] for i in range(allele_count)]))
-                test_list.append(test_pairs)
-    else:
-        test_list = [hla_list]
-
-    for test_i in range(len(test_list)):
-        if "test_id" in daehwan_debug:
-            daehwan_test_ids = daehwan_debug["test_id"].split('-')
-            if str(test_i + 1) not in daehwan_test_ids:
-                continue
-
-        print >> sys.stderr, "Test %d" % (test_i + 1)
-        test_HLA_list = test_list[test_i]
-
-        # daehwan - for debugging purposes
-        # test_HLA_list = [["A*11:50Q", "A*11:01:01:01", "A*01:01:01:01"]]
-        for test_HLA_names in test_HLA_list:
-            if simulation:
-                for test_HLA_name in test_HLA_names:
-                    gene = test_HLA_name.split('*')[0]
-                    test_HLA_seq = HLAs[gene][test_HLA_name]
-                    seq_type = "partial" if test_HLA_name in partial_alleles else "full"
-                    print >> sys.stderr, "\t%s - %d bp (%s sequence)" % (test_HLA_name, len(test_HLA_seq), seq_type)
-            else:
-                print >> sys.stderr, "\t%s" % (test_HLA_names)
-                
-            
-        if simulation:
-            HLA_reads_1, HLA_reads_2 = [], []
-            for test_HLA_names in test_HLA_list:
-                gene = test_HLA_names[0].split('*')[0]
-                ref_allele = refHLAs[gene]
-                ref_seq = HLAs[gene][ref_allele]
-
-                # Simulate reads from two HLA alleles
-                def simulate_reads(seq, simulate_interval = 1, frag_len = 250, read_len = 100):
-                    comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
-                    reads_1, reads_2 = [], []
-                    for i in range(0, len(seq) - frag_len + 1, simulate_interval):
-                        reads_1.append(seq[i:i+read_len])
-                        tmp_read_2 = reversed(seq[i+frag_len-read_len:i+frag_len])
-                        read_2 = ""
-                        for s in tmp_read_2:
-                            if s in comp_table:
-                                read_2 += comp_table[s]
-                            else:
-                                read_2 += s
-                        reads_2.append(read_2)
-                    return reads_1, reads_2
-
-                for test_HLA_name in test_HLA_names:
-                    HLA_seq = HLAs[gene][test_HLA_name]
-                    tmp_reads_1, tmp_reads_2 = simulate_reads(HLA_seq, simulate_interval)
-                    HLA_reads_1 += tmp_reads_1
-                    HLA_reads_2 += tmp_reads_2
-
-            # Write reads into a fasta read file
-            def write_reads(reads, idx):
-                read_file = open('hla_input_%d.fa' % idx, 'w')
-                for read_i in range(len(reads)):
-                    print >> read_file, ">%d" % (read_i + 1)
-                    print >> read_file, reads[read_i]
-                read_file.close()
-            write_reads(HLA_reads_1, 1)
-            write_reads(HLA_reads_2, 2)
-
-        for aligner, index_type in aligners:
-            if index_type == "graph":
-                print >> sys.stderr, "\n\t\t%s %s on %s" % (aligner, index_type, reference_type)
-            else:
-                print >> sys.stderr, "\n\t\t%s %s" % (aligner, index_type)
-
-            if alignment_fname == "":
-                # Align reads, and sort the alignments into a BAM file
-                if aligner == "hisat2":
-                    hisat2 = os.path.join(ex_path, "hisat2")
-                    aligner_cmd = [hisat2,
-                                   "--no-unal",
-                                   "--mm"]
-                    if index_type == "linear":
-                        aligner_cmd += ["-k", "10"]
-                    aligner_cmd += ["-x", "hla.%s" % index_type]
-                elif aligner == "bowtie2":
-                    aligner_cmd = [aligner,
-                                   "--no-unal",
-                                   "-k", "10",
-                                   "-x", "hla"]
-                else:
-                    assert False
-                if simulation:
-                    if "test_id" in daehwan_debug:
-                        aligner_cmd += ["-f", "hla_input_1.fa"]
-                    else:
-                        aligner_cmd += ["-f",
-                                        "-1", "hla_input_1.fa",
-                                        "-2", "hla_input_2.fa"]
-                else:
-                    assert len(read_fname) in [1,2]
-                    aligner_cmd += ["-p", str(threads)]
-                    if len(read_fname) == 1:
-                        aligner_cmd += ["-U", read_fname[0]]
-                    else:
-                        aligner_cmd += ["-1", "%s" % read_fname[0],
-                                        "-2", "%s" % read_fname[1]]
-
-                if verbose:
-                    print >> sys.stderr, ' '.join(aligner_cmd)
-                align_proc = subprocess.Popen(aligner_cmd,
-                                              stdout=subprocess.PIPE,
-                                              stderr=open("/dev/null", 'w'))
-
-                sambam_cmd = ["samtools",
-                              "view",
-                              "-bS",
-                              "-"]
-                if simulation:
-                    output_fname_base = "hla_input"
-                else:
-                    output_fname_base = read_fname[0].split('/')[1]
-                    output_fname_base = output_fname_base.split('.')[0]                    
-                    
-                sambam_proc = subprocess.Popen(sambam_cmd,
-                                               stdin=align_proc.stdout,
-                                               stdout=open("%s_unsorted.bam" % output_fname_base, 'w'),
-                                               stderr=open("/dev/null", 'w'))
-                sambam_proc.communicate()
-                if index_type == "graph":
-                    bamsort_cmd = ["samtools",
-                                   "sort",
-                                   "%s_unsorted.bam" % output_fname_base,
-                                   "-o", "%s.bam" % output_fname_base]
-                    bamsort_proc = subprocess.Popen(bamsort_cmd,
-                                                    stderr=open("/dev/null", 'w'))
-                    bamsort_proc.communicate()
-
-                    bamindex_cmd = ["samtools",
-                                    "index",
-                                    "%s.bam" % output_fname_base]
-                    bamindex_proc = subprocess.Popen(bamindex_cmd,
-                                                     stderr=open("/dev/null", 'w'))
-                    bamindex_proc.communicate()
-
-                    os.system("rm %s_unsorted.bam" % output_fname_base)            
-                else:
-                    os.system("mv %s.bam %s.bam" % (output_fname_base, output_fname_base))
-
-                alignment_fname = "%s.bam" % output_fname_base
-
-            for test_HLA_names in test_HLA_list:
-                if simulation:
-                    gene = test_HLA_names[0].split('*')[0]
-                else:
-                    gene = test_HLA_names
-                ref_allele = refHLAs[gene]
-                ref_seq = HLAs[gene][ref_allele]
-                ref_exons = refHLA_loci[gene][-1]
-
-                # Read alignments
-                alignview_cmd = ["samtools",
-                                 "view"]
-                if alignment_fname == "":
-                    alignview_cmd += [alignment_fname]
-                else:
-                    if not os.path.exists(alignment_fname + ".bai"):
-                        os.system("samtools index %s" % alignment_fname)
-                    alignview_cmd += [alignment_fname]
-                base_locus = 0
-                if index_type == "graph":
-                    if reference_type == "gene":
-                        alignview_cmd += ["%s" % ref_allele]
-                    else:
-                        assert reference_type in ["chromosome", "genome"]
-                        _, chr, left, right, _ = refHLA_loci[gene]
-                        base_locus = left
-                        alignview_cmd += ["%s:%d-%d" % (chr, left + 1, right + 1)]
-
-                    bamview_proc = subprocess.Popen(alignview_cmd,
-                                                    stdout=subprocess.PIPE,
-                                                    stderr=open("/dev/null", 'w'))
-
-                    sort_read_cmd = ["sort", "-k", "1", "-n"]
-                    alignview_proc = subprocess.Popen(sort_read_cmd,
-                                                      stdin=bamview_proc.stdout,
-                                                      stdout=subprocess.PIPE,
-                                                      stderr=open("/dev/null", 'w'))
-                else:
-                    alignview_proc = subprocess.Popen(alignview_cmd,
-                                                 stdout=subprocess.PIPE,
-                                                 stderr=open("/dev/null", 'w'))
-
-                # Count alleles
-                HLA_counts, HLA_cmpt = {}, {}
-                coverage = [0 for i in range(len(ref_seq) + 1)]
-                num_reads, total_read_len = 0, 0
-                prev_read_id = None
-                prev_exon = False
-                if index_type == "graph":
-                    # Cigar regular expression
-                    cigar_re = re.compile('\d+\w')
-                    for line in alignview_proc.stdout:
-                        cols = line.strip().split()
-                        read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
-                        read_seq, qual = cols[9], cols[10]
-                        num_reads += 1
-                        total_read_len += len(read_seq)
-                        flag, pos = int(flag), int(pos)
-                        pos -= (base_locus + 1)
-                        if pos < 0:
-                            continue
-
-                        if flag & 0x4 != 0:
-                            continue
-
-                        NM, Zs, MD = "", "", ""
-                        for i in range(11, len(cols)):
-                            col = cols[i]
-                            if col.startswith("Zs"):
-                                Zs = col[5:]
-                            elif col.startswith("MD"):
-                                MD = col[5:]
-                            elif col.startswith("NM"):
-                                NM = int(col[5:])
-
-                        if NM > num_mismatch:
-                            continue
-
-                        # daehwan - for debugging purposes
-                        debug = False
-                        if read_id in ["2339"] and False:
-                            debug = True
-                            print "read_id: %s)" % read_id, pos, cigar_str, "NM:", NM, MD, Zs
-                            print "            ", read_seq
-
-                        vars = []
-                        if Zs:
-                            vars = Zs.split(',')
-
-                        assert MD != ""
-                        MD_str_pos, MD_len = 0, 0
-                        read_pos, left_pos = 0, pos
-                        right_pos = left_pos
-                        cigars = cigar_re.findall(cigar_str)
-                        cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-                        cmp_list = []
-                        for i in range(len(cigars)):
-                            cigar_op, length = cigars[i]
-                            if cigar_op == 'M':
-                                # Update coverage
-                                if enable_coverage:
-                                    if right_pos + length < len(coverage):
-                                        coverage[right_pos] += 1
-                                        coverage[right_pos + length] -= 1
-                                    elif right_pos < len(coverage):
-                                        coverage[right_pos] += 1
-                                        coverage[-1] -= 1
-
-                                first = True
-                                MD_len_used = 0
-                                while True:
-                                    if not first or MD_len == 0:
-                                        if MD[MD_str_pos].isdigit():
-                                            num = int(MD[MD_str_pos])
-                                            MD_str_pos += 1
-                                            while MD_str_pos < len(MD):
-                                                if MD[MD_str_pos].isdigit():
-                                                    num = num * 10 + int(MD[MD_str_pos])
-                                                    MD_str_pos += 1
-                                                else:
-                                                    break
-                                            MD_len += num
-                                    # Insertion or full match followed
-                                    if MD_len >= length:
-                                        MD_len -= length
-                                        cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
-                                        break
-                                    first = False
-                                    read_base = read_seq[read_pos + MD_len]
-                                    MD_ref_base = MD[MD_str_pos]
-                                    MD_str_pos += 1
-                                    # assert MD_ref_base in "ACGT"
-                                    cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])
-                                    cmp_list.append(["mismatch", right_pos + MD_len, 1])
-                                    MD_len_used = MD_len + 1
-                                    MD_len += 1
-                                    # Full match
-                                    if MD_len == length:
-                                        MD_len = 0
-                                        break
-                            elif cigar_op == 'I':
-                                cmp_list.append(["insertion", right_pos, length])
-                            elif cigar_op == 'D':
-                                if MD[MD_str_pos] == '0':
-                                    MD_str_pos += 1
-                                assert MD[MD_str_pos] == '^'
-                                MD_str_pos += 1
-                                while MD_str_pos < len(MD):
-                                    if not MD[MD_str_pos] in "ACGT":
-                                        break
-                                    MD_str_pos += 1
-                                cmp_list.append(["deletion", right_pos, length])
-                            elif cigar_op == 'S':
-                                cmp_list.append(["soft", right_pos, length])
-                            else:                    
-                                assert cigar_op == 'N'
-                                cmp_list.append(["intron", right_pos, length])
-
-                            if cigar_op in "MND":
-                                right_pos += length
-
-                            if cigar_op in "MIS":
-                                read_pos += length
-
-                        exon = False
-                        for exon in ref_exons:
-                            exon_left, exon_right = exon
-                            if right_pos <= exon_left or pos > exon_right:
-                                continue
-                            else:
-                                exon = True
-                                break
-
-                        if right_pos > len(ref_seq):
-                            continue
-
-                        def add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, exon = True):
-                            max_count = max(HLA_count_per_read.values())
-                            cur_cmpt = set()
-                            for allele, count in HLA_count_per_read.items():
-                                if count < max_count:
-                                    continue
-                                if allele in exclude_allele_list:
-                                    continue                                
-                                cur_cmpt.add(allele)                    
-                                if not allele in HLA_counts:
-                                    HLA_counts[allele] = 1
-                                else:
-                                    HLA_counts[allele] += 1
-
-                            if len(cur_cmpt) == 0:
-                                return
-                            
-                            # daehwan - for debugging purposes                            
-                            alleles = ["", ""]
-                            # alleles = ["B*40:304", "B*40:02:01"]
-                            allele1_found, allele2_found = False, False
-                            for allele, count in HLA_count_per_read.items():
-                                if count < max_count:
-                                    continue
-                                if allele == alleles[0]:
-                                    allele1_found = True
-                                elif allele == alleles[1]:
-                                    allele2_found = True
-                            if allele1_found != allele2_found:
-                                print alleles[0], HLA_count_per_read[alleles[0]]
-                                print alleles[1], HLA_count_per_read[alleles[1]]
-                                if allele1_found:
-                                    print ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, HLA_count_per_read[alleles[1]]))
-                                else:
-                                    print ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, HLA_count_per_read[alleles[0]]))
-                                print read_seq
-
-                            cur_cmpt = sorted(list(cur_cmpt))
-                            cur_cmpt = '-'.join(cur_cmpt)
-                            add = 1
-                            if partial and not exon:
-                                add *= 0.2
-                            if not cur_cmpt in HLA_cmpt:
-                                HLA_cmpt[cur_cmpt] = add
-                            else:
-                                HLA_cmpt[cur_cmpt] += add
-
-                        if read_id != prev_read_id:
-                            if prev_read_id != None:
-                                add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, prev_exon)
-                                
-                            HLA_count_per_read = {}
-                            for HLA_name in HLA_names[gene]:
-                                if HLA_name.find("BACKBONE") != -1:
-                                    continue
-                                HLA_count_per_read[HLA_name] = 0
-
-                        def add_count(var_id, add):
-                            assert var_id in Links
-                            alleles = Links[var_id]
-                            for allele in alleles:
-                                if allele.find("BACKBONE") != -1:
-                                    continue
-                                HLA_count_per_read[allele] += add
-                                # daehwan - for debugging purposes
-                                if debug:
-                                    if allele in ["DQA1*05:05:01:01", "DQA1*05:05:01:02"]:
-                                        print allele, add, var_id
-
-                        # Decide which allele(s) a read most likely came from
-                        # also sanity check - read length, cigar string, and MD string
-                        for var_id, data in Vars[gene].items():
-                            var_type, var_pos, var_data = data
-                            if var_type != "deletion":
-                                continue
-                            if left_pos >= var_pos and right_pos <= var_pos + int(var_data):
-                                add_count(var_id, -1)                            
-                        ref_pos, read_pos, cmp_cigar_str, cmp_MD = left_pos, 0, "", ""
-                        cigar_match_len, MD_match_len = 0, 0            
-                        for cmp in cmp_list:
-                            type = cmp[0]
-                            length = cmp[2]
-                            if type == "match":
-                                var_idx = lower_bound(Var_list[gene], ref_pos)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if ref_pos + length <= var_pos:
-                                        break
-                                    if ref_pos <= var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "insertion":
-                                            if ref_pos < var_pos and ref_pos + length > var_pos + len(var_data):
-                                                add_count(var_id, -1)
-                                                # daehwan - for debugging purposes
-                                                if debug:
-                                                    print cmp, var_id, Links[var_id]
-                                        elif var_type == "deletion":
-                                            del_len = int(var_data)
-                                            if ref_pos < var_pos and ref_pos + length > var_pos + del_len:
-                                                # daehwan - for debugging purposes
-                                                if debug:
-                                                    print cmp, var_id, Links[var_id], -1, Vars[gene][var_id]
-                                                # Check if this might be one of the two tandem repeats (the same left coordinate)
-                                                cmp_left, cmp_right = cmp[1], cmp[1] + cmp[2]
-                                                test1_seq1 = ref_seq[cmp_left:cmp_right]
-                                                test1_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos + del_len:cmp_right + del_len]
-                                                # Check if this happens due to small repeats (the same right coordinate - e.g. 19 times of TTTC in DQA1*05:05:01:02)
-                                                cmp_left -= read_pos
-                                                cmp_right += (len(read_seq) - read_pos - cmp[2])
-                                                test2_seq1 = ref_seq[cmp_left+int(var_data):cmp_right]
-                                                test2_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos+int(var_data):cmp_right]
-                                                if test1_seq1 != test1_seq2 and test2_seq1 != test2_seq2:
-                                                    add_count(var_id, -1)
-                                        else:
-                                            if debug:
-                                                print cmp, var_id, Links[var_id], -1
-                                            add_count(var_id, -1)
-                                    var_idx += 1
-
-                                read_pos += length
-                                ref_pos += length
-                                cigar_match_len += length
-                                MD_match_len += length
-                            elif type == "mismatch":
-                                read_base = read_seq[read_pos]
-                                var_idx = lower_bound(Var_list[gene], ref_pos)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if ref_pos < var_pos:
-                                        break
-                                    if ref_pos == var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "single":
-                                            if var_data == read_base:
-                                                # daehwan - for debugging purposes
-                                                if debug:
-                                                    print cmp, var_id, 1, var_data, read_base, Links[var_id]
-
-                                                # daehwan - for debugging purposes
-                                                if False:
-                                                    read_qual = ord(qual[read_pos])
-                                                    add_count(var_id, (read_qual - 60) / 60.0)
-                                                else:
-                                                    add_count(var_id, 1)
-                                            # daehwan - check out if this routine is appropriate
-                                            # else:
-                                            #    add_count(var_id, -1)
-                                    var_idx += 1
-
-                                cmp_MD += ("%d%s" % (MD_match_len, ref_seq[ref_pos]))
-                                MD_match_len = 0
-                                cigar_match_len += 1
-                                read_pos += 1
-                                ref_pos += 1
-                            elif type == "insertion":
-                                ins_seq = read_seq[read_pos:read_pos+length]
-                                var_idx = lower_bound(Var_list[gene], ref_pos)
-                                # daehwan - for debugging purposes
-                                if debug:
-                                    print left_pos, cigar_str, MD, vars
-                                    print ref_pos, ins_seq, Var_list[gene][var_idx], Vars[gene][Var_list[gene][var_idx][1]]
-                                    # sys.exit(1)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if ref_pos < var_pos:
-                                        break
-                                    if ref_pos == var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "insertion":                                
-                                            if var_data == ins_seq:
-                                                # daehwan - for debugging purposes
-                                                if debug:
-                                                    print cmp, var_id, 1, Links[var_id]
-                                                add_count(var_id, 1)
-                                    var_idx += 1
-
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                read_pos += length
-                                cmp_cigar_str += ("%dI" % length)
-                            elif type == "deletion":
-                                del_len = length
-                                # Deletions can be shifted bidirectionally
-                                temp_ref_pos = ref_pos
-                                while temp_ref_pos > 0:
-                                    last_bp = ref_seq[temp_ref_pos + del_len - 1]
-                                    prev_bp = ref_seq[temp_ref_pos - 1]
-                                    if last_bp != prev_bp:
-                                        break
-                                    temp_ref_pos -= 1
-                                var_idx = lower_bound(Var_list[gene], temp_ref_pos)
-                                while var_idx < len(Var_list[gene]):
-                                    var_pos, var_id = Var_list[gene][var_idx]
-                                    if temp_ref_pos < var_pos:
-                                        first_bp = ref_seq[temp_ref_pos]
-                                        next_bp = ref_seq[temp_ref_pos + del_len]
-                                        if first_bp == next_bp:
-                                            temp_ref_pos += 1
-                                            continue
-                                        else:
-                                            break
-                                    if temp_ref_pos == var_pos:
-                                        var_type, _, var_data = Vars[gene][var_id]
-                                        if var_type == "deletion":
-                                            var_len = int(var_data)
-                                            if var_len == length:
-                                                if debug:
-                                                    print cmp, var_id, 1, Links[var_id]
-                                                    print ref_seq[var_pos - 10:var_pos], ref_seq[var_pos:var_pos+int(var_data)], ref_seq[var_pos+int(var_data):var_pos+int(var_data)+10]
-                                                add_count(var_id, 1)
-                                    var_idx += 1
-
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                cmp_MD += ("%d" % MD_match_len)
-                                MD_match_len = 0
-                                cmp_cigar_str += ("%dD" % length)
-                                cmp_MD += ("^%s" % ref_seq[ref_pos:ref_pos+length])
-                                ref_pos += length
-                            elif type == "soft":
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                read_pos += length
-                                cmp_cigar_str += ("%dS" % length)
-                            else:
-                                assert type == "intron"
-                                if cigar_match_len > 0:
-                                    cmp_cigar_str += ("%dM" % cigar_match_len)
-                                    cigar_match_len = 0
-                                cmp_cigar_str += ("%dN" % length)
-                                ref_pos += length                    
-                        if cigar_match_len > 0:
-                            cmp_cigar_str += ("%dM" % cigar_match_len)
-                        cmp_MD += ("%d" % MD_match_len)
-                        if read_pos != len(read_seq) or \
-                                cmp_cigar_str != cigar_str or \
-                                cmp_MD != MD:
-                            print >> sys.stderr, "Error:", cigar_str, MD
-                            print >> sys.stderr, "\tcomputed:", cmp_cigar_str, cmp_MD
-                            print >> sys.stderr, "\tcmp list:", cmp_list
-                            assert False            
-
-                        prev_read_id = read_id
-                        prev_exon = exon
-
-                    if num_reads <= 0:
-                        continue
-
-                    if prev_read_id != None:
-                        add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read)
-
-                    # Coverage
-                    # it is not used by the default
-                    if enable_coverage:
-                        assert num_reads > 0
-                        read_len = int(total_read_len / float(num_reads))
-                        coverage_sum = 0
-                        for i in range(len(coverage)):
-                            if i > 0:
-                                coverage[i] += coverage[i-1]
-                            coverage_sum += coverage[i]
-                        coverage_avg = coverage_sum / float(len(coverage))
-                        assert len(ref_seq) < len(coverage)
-                        for i in range(len(ref_seq)):
-                            coverage_threshold = 1.0 * coverage_avg
-                            if i < read_len:
-                                coverage_threshold *= ((i+1) / float(read_len))
-                            elif i + read_len > len(ref_seq):
-                                coverage_threshold *= ((len(ref_seq) - i) / float(read_len))
-                            if coverage[i] >= coverage_threshold:
-                                continue
-                            pseudo_num_reads = (coverage_threshold - coverage[i]) / read_len
-                            var_idx = lower_bound(Var_list[gene], i + 1)
-                            if var_idx >= len(Var_list[gene]):
-                                var_idx = len(Var_list[gene]) - 1
-                            cur_cmpt = set()
-                            while var_idx >= 0:
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                var_type, _, var_data = Vars[gene][var_id]
-                                if var_type == "deletion":
-                                    del_len = int(var_data)
-                                    if i < var_pos:
-                                        break
-                                    if i + read_len < var_pos + int(var_data):
-                                        assert var_id in Links
-                                        cur_cmpt = cur_cmpt.union(set(Links[var_id]))
-                                var_idx -= 1
-                            if cur_cmpt:
-                                cur_cmpt = '-'.join(list(cur_cmpt))
-                                if not cur_cmpt in HLA_cmpt:
-                                    HLA_cmpt[cur_cmpt] = 0
-                                HLA_cmpt[cur_cmpt] += pseudo_num_reads
-                else:
-                    assert index_type == "linear"
-                    def add_alleles(alleles):
-                        if not allele in HLA_counts:
-                            HLA_counts[allele] = 1
-                        else:
-                            HLA_counts[allele] += 1
-
-                        cur_cmpt = sorted(list(alleles))
-                        cur_cmpt = '-'.join(cur_cmpt)
-                        if not cur_cmpt in HLA_cmpt:
-                            HLA_cmpt[cur_cmpt] = 1
-                        else:
-                            HLA_cmpt[cur_cmpt] += 1
-
-                    prev_read_id, prev_AS = None, None
-                    alleles = set()
-                    for line in alignview_proc.stdout:
-                        cols = line[:-1].split()
-                        read_id, flag, allele = cols[:3]
-                        flag = int(flag)
-                        if flag & 0x4 != 0:
-                            continue
-                        if not allele.startswith(gene):
-                            continue
-                        if allele.find("BACKBONE") != -1:
-                            continue
-
-                        AS = None
-                        for i in range(11, len(cols)):
-                            col = cols[i]
-                            if col.startswith("AS"):
-                                AS = int(col[5:])
-                        assert AS != None
-                        if read_id != prev_read_id:
-                            if alleles:
-                                if aligner == "hisat2" or \
-                                        (aligner == "bowtie2" and len(alleles) < 10):
-                                    add_alleles(alleles)
-                                alleles = set()
-                            prev_AS = None
-                        if prev_AS != None and AS < prev_AS:
-                            continue
-                        prev_read_id = read_id
-                        prev_AS = AS
-                        alleles.add(allele)
-
-                    if alleles:
-                        add_alleles(alleles)
-
-                HLA_counts = [[allele, count] for allele, count in HLA_counts.items()]
-                def HLA_count_cmp(a, b):
-                    if a[1] != b[1]:
-                        return b[1] - a[1]
-                    assert a[0] != b[0]
-                    if a[0] < b[0]:
-                        return -1
-                    else:
-                        return 1
-                HLA_counts = sorted(HLA_counts, cmp=HLA_count_cmp)
-                for count_i in range(len(HLA_counts)):
-                    count = HLA_counts[count_i]
-                    if simulation:
-                        found = False
-                        for test_HLA_name in test_HLA_names:
-                            if count[0] == test_HLA_name:
-                                print >> sys.stderr, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_HLA_name, count[1])
-                                found = True
-                                """
-                                if count_i > 0 and HLA_counts[0][1] > count[1]:
-                                    print >> sys.stderr, "Warning: %s ranked first (count: %d)" % (HLA_counts[0][0], HLA_counts[0][1])
-                                    assert False
-                                else:
-                                    test_passed += 1
-                                """
-                        if count_i < 5 and not found:
-                            print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                    else:
-                        print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                        if count_i >= 9:
-                            break
-                print >> sys.stderr
-
-                def normalize(prob):
-                    total = sum(prob.values())
-                    for allele, mass in prob.items():
-                        prob[allele] = mass / total
-
-                def normalize2(prob, length):
-                    total = 0
-                    for allele, mass in prob.items():
-                        assert allele in length
-                        total += (mass / length[allele])
-                    for allele, mass in prob.items():
-                        assert allele in length
-                        prob[allele] = mass / length[allele] / total
-
-                def prob_diff(prob1, prob2):
-                    diff = 0.0
-                    for allele in prob1.keys():
-                        if allele in prob2:
-                            diff += abs(prob1[allele] - prob2[allele])
-                        else:
-                            diff += prob1[allele]
-                    return diff
-
-                def HLA_prob_cmp(a, b):
-                    if a[1] != b[1]:
-                        if a[1] < b[1]:
-                            return 1
-                        else:
-                            return -1
-                    assert a[0] != b[0]
-                    if a[0] < b[0]:
-                        return -1
-                    else:
-                        return 1
-
-                HLA_prob, HLA_prob_next = {}, {}
-                for cmpt, count in HLA_cmpt.items():
-                    alleles = cmpt.split('-')
-                    for allele in alleles:
-                        if allele not in HLA_prob:
-                            HLA_prob[allele] = 0.0
-                        HLA_prob[allele] += (float(count) / len(alleles))
-
-                assert gene in HLA_lengths
-                HLA_length = HLA_lengths[gene]
-                # normalize2(HLA_prob, HLA_length)
-                normalize(HLA_prob)
-                def next_prob(HLA_cmpt, HLA_prob, HLA_length):
-                    HLA_prob_next = {}
-                    for cmpt, count in HLA_cmpt.items():
-                        alleles = cmpt.split('-')
-                        alleles_prob = 0.0
-                        for allele in alleles:
-                            assert allele in HLA_prob
-                            alleles_prob += HLA_prob[allele]
-                        for allele in alleles:
-                            if allele not in HLA_prob_next:
-                                HLA_prob_next[allele] = 0.0
-                            HLA_prob_next[allele] += (float(count) * HLA_prob[allele] / alleles_prob)
-                    # normalize2(HLA_prob_next, HLA_length)
-                    normalize(HLA_prob_next)
-                    return HLA_prob_next
-
-                diff, iter = 1.0, 0
-                while diff > 0.0001 and iter < 1000:
-                    HLA_prob_next = next_prob(HLA_cmpt, HLA_prob, HLA_length)
-                    diff = prob_diff(HLA_prob, HLA_prob_next)
-                    HLA_prob = HLA_prob_next
-                    iter += 1
-                for allele, prob in HLA_prob.items():
-                    allele_len = len(HLAs[gene][allele])
-                    HLA_prob[allele] /= float(allele_len)
-                normalize(HLA_prob)
-                HLA_prob = [[allele, prob] for allele, prob in HLA_prob.items()]
-
-                HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-                success = [False for i in range(len(test_HLA_names))]
-                found_list = [False for i in range(len(test_HLA_names))]
-                for prob_i in range(len(HLA_prob)):
-                    prob = HLA_prob[prob_i]
-                    found = False
-                    if simulation:
-                        for name_i in range(len(test_HLA_names)):
-                            test_HLA_name = test_HLA_names[name_i]
-                            if prob[0] == test_HLA_name:
-                                rank_i = prob_i
-                                while rank_i > 0:
-                                    if prob == HLA_prob[rank_i - 1][1]:
-                                        rank_i -= 1
-                                    else:
-                                        break
-                                print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_HLA_name, prob[1] * 100.0)
-                                if rank_i < len(success):
-                                    success[rank_i] = True
-                                found_list[name_i] = True
-                                found = True                        
-                        if not False in found_list:
-                            break
-                    if not found:
-                        print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, prob[0], prob[1] * 100.0)
-                        if best_alleles and prob_i < 2:
-                            print >> sys.stdout, "SingleModel %s (abundance: %.2f%%)" % (prob[0], prob[1] * 100.0)
-                    if not simulation and prob_i >= 9:
-                        break
-                print >> sys.stderr
-
-                # daehwan - for debugging purposes
-                if False and (len(test_HLA_names) == 2 or not simulation):
-                    HLA_prob, HLA_prob_next = {}, {}
-                    for cmpt, count in HLA_cmpt.items():
-                        alleles = cmpt.split('-')
-                        for allele1 in alleles:
-                            for allele2 in HLA_names[gene]:
-                                if allele1 < allele2:
-                                    allele_pair = "%s-%s" % (allele1, allele2)
-                                else:
-                                    allele_pair = "%s-%s" % (allele2, allele1)
-                                if not allele_pair in HLA_prob:
-                                    HLA_prob[allele_pair] = 0.0
-                                HLA_prob[allele_pair] += (float(count) / len(alleles))
-
-                    if len(HLA_prob) <= 0:
-                        continue
-
-                    # Choose top allele pairs
-                    def choose_top_alleles(HLA_prob):
-                        HLA_prob_list = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-                        HLA_prob_list = sorted(HLA_prob_list, cmp=HLA_prob_cmp)
-                        HLA_prob = {}
-                        best_prob = HLA_prob_list[0][1]
-                        for i in range(len(HLA_prob_list)):
-                            allele_pair, prob = HLA_prob_list[i]
-                            if prob * 2 <= best_prob:
-                                break                        
-                            HLA_prob[allele_pair] = prob
-                        normalize(HLA_prob)
-                        return HLA_prob
-                    HLA_prob = choose_top_alleles(HLA_prob)
-
-                    def next_prob(HLA_cmpt, HLA_prob):
-                        HLA_prob_next = {}
-                        for cmpt, count in HLA_cmpt.items():
-                            alleles = cmpt.split('-')
-                            prob = 0.0
-                            for allele in alleles:
-                                for allele_pair in HLA_prob.keys():
-                                    if allele in allele_pair:
-                                        prob += HLA_prob[allele_pair]
-                            for allele in alleles:
-                                for allele_pair in HLA_prob.keys():
-                                    if not allele in allele_pair:
-                                        continue
-                                    if allele_pair not in HLA_prob_next:
-                                        HLA_prob_next[allele_pair] = 0.0
-                                    HLA_prob_next[allele_pair] += (float(count) * HLA_prob[allele_pair] / prob)
-                        normalize(HLA_prob_next)
-                        return HLA_prob_next
-
-                    diff, iter = 1.0, 0
-                    while diff > 0.0001 and iter < 1000:
-                        HLA_prob_next = next_prob(HLA_cmpt, HLA_prob)
-                        diff = prob_diff(HLA_prob, HLA_prob_next)
-                        HLA_prob = HLA_prob_next
-                        HLA_prob = choose_top_alleles(HLA_prob)
-                        iter += 1
-
-                    HLA_prob = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-                    HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-
-                    success = [False]
-                    for prob_i in range(len(HLA_prob)):
-                        allele_pair, prob = HLA_prob[prob_i]
-                        allele1, allele2 = allele_pair.split('-')
-                        if best_alleles and prob_i < 1:
-                            print >> sys.stdout, "PairModel %s (abundance: %.2f%%)" % (allele_pair, prob * 100.0)
-                        if simulation:
-                            if allele1 in test_HLA_names and allele2 in test_HLA_names:
-                                rank_i = prob_i
-                                while rank_i > 0:
-                                    if HLA_prob[rank_i-1][1] == prob:                                        
-                                        rank_i -= 1
-                                    else:
-                                        break
-                                print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, allele_pair, prob * 100.0)
-                                if rank_i == 0:
-                                    success[0] = True
-                                break
-                        print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, allele_pair, prob * 100.0)
-                        if not simulation and prob_i >= 9:
-                            break
-                    print >> sys.stderr
-                    
-                    # Li's method
-                    li_hla = os.path.join(ex_path, "li_hla/hla")
-                    if os.path.exists(li_hla):
-                        li_hla_cmd = [li_hla,
-                                      "hla",
-                                      alignment_fname,
-                                      "-b", "%s*BACKBONE" % gene]
-                        li_hla_proc = subprocess.Popen(li_hla_cmd,
-                                                       stdout=subprocess.PIPE,
-                                                       stderr=open("/dev/null", 'w'))
-
-                        # read in the result of Li's hla
-                        for line in li_hla_proc.stdout:
-                            allele1, allele2, score = line.strip().split()
-                            score = float(score)
-                            if simulation:
-                                if allele1 in test_HLA_names and allele2 in test_HLA_names:
-                                    print >> sys.stderr, "\t\t\t*** 1 ranked %s-%s (score: %.2f)" % (allele1, allele2, score)
-                                    success[0] = True
-                                else:
-                                    print >> sys.stderr, "\t\t\tLiModel fails"
-                            if best_alleles:
-                                print >> sys.stdout, "LiModel %s-%s (score: %.2f)" % (allele1, allele2, score)
-                        li_hla_proc.communicate()
-
-                if simulation and not False in success:
-                    aligner_type = "%s %s" % (aligner, index_type)
-                    if not aligner_type in test_passed:
-                        test_passed[aligner_type] = 1
-                    else:
-                        test_passed[aligner_type] += 1
-
-                if simulation:
-                    print >> sys.stderr, "\t\tPassed so far: %d/%d (abundance: %.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1)))
-            os.system("rm %s %s.bai" % (alignment_fname, alignment_fname))
-            alignment_fname = ""
-
-
-    if simulation:
-        for aligner_type, passed in test_passed.items():
-            print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
-    
-        
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='test HLA genotyping')
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
-                        type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome (default: gene)")
-    parser.add_argument("--hla-list",
-                        dest="hla_list",
-                        type=str,
-                        default="A,B,C,DQA1,DQB1,DRB1",
-                        help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)")
-    parser.add_argument('--partial',
-                        dest='partial',
-                        action='store_true',
-                        help='Include partial alleles (e.g. A_nuc.fasta)')
-    parser.add_argument("--aligner-list",
-                        dest="aligners",
-                        type=str,
-                        default="hisat2.graph,hisat2.linear,bowtie2.linear",
-                        help="A comma-separated list of aligners (default: hisat2.graph,hisat2.linear,bowtie2.linear)")
-    parser.add_argument("--reads",
-                        dest="read_fname",
-                        type=str,
-                        default="",
-                        help="Fastq read file name")
-    parser.add_argument("--alignment",
-                        dest="alignment_fname",
-                        type=str,
-                        default="",
-                        help="BAM file name")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--simulate-interval",
-                        dest="simulate_interval",
-                        type=int,
-                        default=1,
-                        help="Reads simulated at every these base pairs (default: 1)")
-    parser.add_argument("--coverage",
-                        dest="coverage",
-                        action='store_true',
-                        help="Experimental purpose (assign reads based on coverage)")
-    parser.add_argument("--best-alleles",
-                        dest="best_alleles",
-                        action='store_true',
-                        help="")
-    parser.add_argument("--exclude-allele-list",
-                        dest="exclude_allele_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of allleles to be excluded")
-    parser.add_argument("--num-mismatch",
-                        dest="num_mismatch",
-                        type=int,
-                        default=0,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 0)")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument("--debug",
-                        dest="debug",
-                        type=str,
-                        default="",
-                        help="e.g., test_id:10,read_id:10000,basic_test")
-
-    args = parser.parse_args()
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
-        sys.exit(1)
-    args.hla_list = args.hla_list.split(',')
-    if args.aligners == "":
-        print >> sys.stderr, "Error: --aligners must be non-empty."
-        sys.exit(1)    
-    args.aligners = args.aligners.split(',')
-    for i in range(len(args.aligners)):
-        args.aligners[i] = args.aligners[i].split('.')
-    if args.read_fname:
-        args.read_fname = args.read_fname.split(',')
-    else:
-        args.read_fname = []
-    if args.alignment_fname != "" and \
-            not os.path.exists(args.alignment_fname):
-        print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname
-        sys.exit(1)
-    args.exclude_allele_list = args.exclude_allele_list.split(',')
-    debug = {}
-    if args.debug != "":
-        for item in args.debug.split(','):
-            if ':' in item:
-                key, value = item.split(':')
-                debug[key] = value
-            else:
-                debug[item] = 1
-
-    random.seed(1)
-    test_HLA_genotyping(args.reference_type,
-                        args.hla_list,
-                        args.partial,
-                        args.aligners,
-                        args.read_fname,
-                        args.alignment_fname,
-                        args.threads,
-                        args.simulate_interval,
-                        args.coverage,
-                        args.best_alleles,
-                        args.exclude_allele_list,
-                        args.num_mismatch,
-                        args.verbose,
-                        debug)
diff --git a/opts.h b/opts.h
index afdd5a3..d457fff 100644
--- a/opts.h
+++ b/opts.h
@@ -172,12 +172,18 @@ enum {
     ARG_TRANSCRIPTOME_MAPPING_ONLY,
     ARG_TRANSCRIPTOME_ASSEMBLY,
     ARG_TRANSCRIPTOME_ASSEMBLY_CUFFLINKS,
+    ARG_AVOID_PSEUDOGENE,
 #ifdef USE_SRA
     ARG_SRA_ACC,
 #endif
     ARG_REMOVE_CHRNAME,
     ARG_ADD_CHRNAME,
     ARG_MAX_ALTSTRIED,
+    ARG_HAPLOTYPE,
+    ARG_CODIS,
+    ARG_NO_TEMPLATELEN_ADJUSTMENT,
+    ARG_SUMMARY_FILE,
+    ARG_NEW_SUMMARY
 };
 
 #endif
diff --git a/pe.cpp b/pe.cpp
index 8eea8c3..8845898 100644
--- a/pe.cpp
+++ b/pe.cpp
@@ -17,6 +17,7 @@
  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <algorithm>
 #include "assert_helpers.h"
 #include "pe.h"
 
diff --git a/processor_support.h b/processor_support.h
index f68ee65..e731e00 100644
--- a/processor_support.h
+++ b/processor_support.h
@@ -17,7 +17,7 @@
 #   include <cpuid.h>
 #elif defined(_MSC_VER)
 // __MSC_VER defined by Microsoft compiler
-#define USING MSC_COMPILER
+#define USING_MSC_COMPILER
 #endif
 
 struct regs_t {unsigned int EAX, EBX, ECX, EDX;};
@@ -43,13 +43,16 @@ public:
     regs_t regs;
 
     try {
-#if ( defined(USING_INTEL_COMPILER) || defined(USING_MSC_COMPILER) )
+#if defined(USING_MSC_COMPILER) 
+		__cpuid((int *) &regs, 0); // test if __cpuid() works, if not catch the exception
+		__cpuid((int *) &regs, 0x1); // POPCNT bit is bit 23 in ECX
+#elif defined(USING_INTEL_COMPILER)
         __cpuid((void *) &regs,0); // test if __cpuid() works, if not catch the exception
         __cpuid((void *) &regs,0x1); // POPCNT bit is bit 23 in ECX
 #elif defined(USING_GCC_COMPILER)
         __get_cpuid(0x1, &regs.EAX, &regs.EBX, &regs.ECX, &regs.EDX);
 #else
-        std::cerr << “ERROR: please define __cpuid() for this build.\n”; 
+        std::cerr << "ERROR: please define __cpuid() for this build.\n"; 
         assert(0);
 #endif
         if( !( (regs.ECX & BIT(20)) && (regs.ECX & BIT(23)) ) ) return false;
diff --git a/qual.h b/qual.h
index 089080b..7c542bb 100644
--- a/qual.h
+++ b/qual.h
@@ -21,6 +21,7 @@
 #define QUAL_H_
 
 #include <stdexcept>
+#include <algorithm>
 #include "search_globals.h"
 #include "sstring.h"
 
diff --git a/spliced_aligner.h b/spliced_aligner.h
index 21ef2ff..997e5f2 100644
--- a/spliced_aligner.h
+++ b/spliced_aligner.h
@@ -152,11 +152,8 @@ void SplicedAligner<index_t, local_index_t>::hybridSearch(
                          this->_minsc[rdi],
                          rnd,
                          INDEX_MAX,
-                         (index_t)tpol.minIntronLen(),
-                         (index_t)tpol.maxIntronLen(),
-                         tpol.minAnchorLen(),
-                         tpol.minAnchorLen_noncan(),
-                         gpol.maxAltsTried(),
+                         tpol,
+                         gpol,
                          leftext,
                          rightext);
     }
@@ -243,6 +240,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
     const Read& rd = *(this->_rds[rdi]);
     index_t rdlen = (index_t)rd.length();
     if(hit.score() < this->_minsc[rdi]) return maxsc;
+    if(dep >= 128) return maxsc;
     
     // if it's already examined, just return
     if(hitoff == hit.rdoff() - hit.trim5() && hitlen == hit.len() + hit.trim5() + hit.trim3()) {
@@ -259,7 +257,6 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
         this->_spliceSites.expand();
     }
     EList<Coord>& coords = this->_coords[dep];
-    EList<GenomeHit<index_t> >& local_genomeHits = this->_local_genomeHits[dep];
     EList<SpliceSite>& spliceSites = this->_spliceSites[dep];
     
     // daehwan - for debugging purposes
@@ -285,11 +282,11 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
             bool another_spliced = false;
             if(!ssdb.empty()) {
                 int64_t best_score = hit.score();
-                local_genomeHits.clear();
+                this->_local_genomeHits[dep].clear();
                 this->_anchors_added.clear();
                 
-                local_genomeHits.expand();
-                local_genomeHits.back() = hit;
+                this->_local_genomeHits[dep].expand();
+                this->_local_genomeHits[dep].back() = hit;
                 this->_anchors_added.push_back(0);
                 
                 index_t fragoff = 0, fraglen = 0, left = 0, right = 0;
@@ -352,11 +349,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                        this->_minsc[rdi],
                                        rnd,
                                        (index_t)this->_minK_local,
-                                       (index_t)tpol.minIntronLen(),
-                                       (index_t)tpol.maxIntronLen(),
-                                       tpol.minAnchorLen(),
-                                       tpol.minAnchorLen_noncan(),
-                                       gpol.maxAltsTried(),
+                                       tpol,
+                                       gpol,
                                        leftext,
                                        rightext);
                         if(tempHit.len() <= 0)
@@ -400,8 +394,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                 another_spliced = true;
                                 if(tempHit.score() > best_score)
                                     best_score = tempHit.score();
-                                local_genomeHits.expand();
-                                local_genomeHits.back() = tempHit;
+                                this->_local_genomeHits[dep].expand();
+                                this->_local_genomeHits[dep].back() = tempHit;
                                 this->_anchors_added.push_back(1);
                                 index_t temp_fragoff = 0, temp_fraglen = 0, temp_left = 0;
                                 tempHit.getLeft(temp_fragoff, temp_fraglen, temp_left);
@@ -412,19 +406,19 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                     }
                 }
                 
-                size_t num_local_genomeHits = local_genomeHits.size();
+                size_t num_local_genomeHits = this->_local_genomeHits[dep].size();
                 for(size_t i = 0; i < num_local_genomeHits; i++) {
-                    local_genomeHits[i].getRight(fragoff, fraglen, right);
-                    if(local_genomeHits[i].score() < best_score) continue;
+                    this->_local_genomeHits[dep][i].getRight(fragoff, fraglen, right);
+                    if(this->_local_genomeHits[dep][i].score() < best_score) continue;
                     // make use of a list of known or novel splice sites to further align the read
                     if(fraglen >= minMatchLen &&
-                       local_genomeHits[i].trim3() == 0 &&
+                       this->_local_genomeHits[dep][i].trim3() == 0 &&
                        !tpol.no_spliced_alignment()) {
                         spliceSites.clear();
                         assert_gt(fraglen, 0);
-                        ssdb.getRightSpliceSites(local_genomeHits[i].ref(), right + fraglen - minMatchLen, minMatchLen, spliceSites);
+                        ssdb.getRightSpliceSites(this->_local_genomeHits[dep][i].ref(), right + fraglen - minMatchLen, minMatchLen, spliceSites);
                         for(size_t si = 0; si < spliceSites.size(); si++) {
-                            const GenomeHit<index_t>& canHit = local_genomeHits[i];
+                            const GenomeHit<index_t>& canHit = this->_local_genomeHits[dep][i];
                             const SpliceSite& ss = spliceSites[si];
                             if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue;
                             if(right > ss.left()) continue;
@@ -472,11 +466,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                            this->_minsc[rdi],
                                            rnd,
                                            (index_t)this->_minK_local,
-                                           (index_t)tpol.minIntronLen(),
-                                           (index_t)tpol.maxIntronLen(),
-                                           tpol.minAnchorLen(),
-                                           tpol.minAnchorLen_noncan(),
-                                           gpol.maxAltsTried(),
+                                           tpol,
+                                           gpol,
                                            leftext,
                                            rightext);
                             if(tempHit.len() <= 0)
@@ -516,8 +507,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                     another_spliced = true;
                                     if(combinedHit.score() > best_score)
                                         best_score = tempHit.score();
-                                    local_genomeHits.expand();
-                                    local_genomeHits.back() = combinedHit;
+                                    this->_local_genomeHits[dep].expand();
+                                    this->_local_genomeHits[dep].back() = combinedHit;
                                     this->_anchors_added.push_back(this->_anchors_added[i] + 1);
                                     
                                     index_t temp_fragoff = 0, temp_fraglen = 0, temp_right = 0;
@@ -530,9 +521,9 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                     }
                 }
                 
-                assert_eq(local_genomeHits.size(), this->_anchors_added.size());
-                for(size_t i = 0; i < local_genomeHits.size(); i++) {
-                    const GenomeHit<index_t>& canHit = local_genomeHits[i];
+                assert_eq(this->_local_genomeHits[dep].size(), this->_anchors_added.size());
+                for(size_t i = 0; i < this->_local_genomeHits[dep].size(); i++) {
+                    const GenomeHit<index_t>& canHit = this->_local_genomeHits[dep][i];
                     if(!this->_secondary && canHit.score() < best_score) continue;
                     // if(min(min_left_anchor, min_right_anchor) <= this->_minK_local) {
                     
@@ -613,11 +604,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                    this->_minsc[rdi],
                                    rnd,
                                    (index_t)this->_minK_local,
-                                   (index_t)tpol.minIntronLen(),
-                                   (index_t)tpol.maxIntronLen(),
-                                   tpol.minAnchorLen(),
-                                   tpol.minAnchorLen_noncan(),
-                                   gpol.maxAltsTried(),
+                                   tpol,
+                                   gpol,
                                    leftext,
                                    rightext);
                     if(tempHit.len() <= 0)
@@ -698,11 +686,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                            this->_minsc[rdi],
                            rnd,
                            (index_t)this->_minK_local,
-                           (index_t)tpol.minIntronLen(),
-                           (index_t)tpol.maxIntronLen(),
-                           tpol.minAnchorLen(),
-                           tpol.minAnchorLen_noncan(),
-                           gpol.maxAltsTried(),
+                           tpol,
+                           gpol,
                            leftext,
                            rightext,
                            1);
@@ -720,7 +705,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
         // Use at most two local indexes
         const index_t max_count = 2;
         int64_t prev_score = hit.score();
-        local_genomeHits.clear();
+        this->_local_genomeHits[dep].clear();
         while(!success && count++ < max_count && use_localindex) {
             if(him.localindexatts >= this->max_localindexatts) break;
             if(first) {
@@ -814,7 +799,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                  (index_t)coord.off(),
                                  (index_t)coord.joinedOff(),
                                  this->_sharedVars);
-                    if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol.maxAltsTried())) continue;
+                    if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
                     // check if the partial alignment is compatible with the new alignment using the local index
                     if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) {
                         if(count == 1) continue;
@@ -836,11 +821,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                        this->_minsc[rdi],
                                        rnd,
                                        (index_t)this->_minK_local,
-                                       (index_t)tpol.minIntronLen(),
-                                       (index_t)tpol.maxIntronLen(),
-                                       tpol.minAnchorLen(),
-                                       tpol.minAnchorLen_noncan(),
-                                       gpol.maxAltsTried(),
+                                       tpol,
+                                       gpol,
                                        leftext,
                                        rightext);
                     }
@@ -898,7 +880,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                                                    dep + 1);
                             maxsc = max<int64_t>(maxsc, tmp_maxsc);
                         } else {
-                            local_genomeHits.push_back(tempHit);
+                            this->_local_genomeHits[dep].push_back(tempHit);
                         }
                     }
                 }
@@ -906,8 +888,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
             if(maxsc >= prev_score - sc.mmpMax) success = true;
             if(!success &&
                (him.localindexatts >= this->max_localindexatts || count == max_count || hGFM->prevLocalGFM(lGFM) == NULL)) {
-                for(index_t ti = 0; ti < local_genomeHits.size(); ti++) {
-                    GenomeHit<index_t>& tempHit = local_genomeHits[ti];
+                for(index_t ti = 0; ti < this->_local_genomeHits[dep].size(); ti++) {
+                    GenomeHit<index_t>& tempHit = this->_local_genomeHits[dep][ti];
                     int64_t minsc = this->_minsc[rdi];
                     if(!this->_secondary) {
                         if(rdi == 0) minsc = max(minsc, sink.bestUnp1());
@@ -1003,7 +985,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                      (index_t)coord.off(),
                                      (index_t)coord.joinedOff(),
                                      this->_sharedVars);
-                        if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol.maxAltsTried())) continue;
+                        if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
                         if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue;
                         if(uniqueStop) {
                             assert_eq(coords.size(), 1);
@@ -1021,11 +1003,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                            this->_minsc[rdi],
                                            rnd,
                                            (index_t)this->_minK_local,
-                                           (index_t)tpol.minIntronLen(),
-                                           (index_t)tpol.maxIntronLen(),
-                                           tpol.minAnchorLen(),
-                                           tpol.minAnchorLen_noncan(),
-                                           gpol.maxAltsTried(),
+                                           tpol,
+                                           gpol,
                                            leftext,
                                            rightext);
                         }
@@ -1150,11 +1129,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                            this->_minsc[rdi],
                            rnd,
                            (index_t)this->_minK_local,
-                           (index_t)tpol.minIntronLen(),
-                           (index_t)tpol.maxIntronLen(),
-                           tpol.minAnchorLen(),
-                           tpol.minAnchorLen_noncan(),
-                           gpol.maxAltsTried(),
+                           tpol,
+                           gpol,
                            leftext,
                            rightext,
                            num_mismatch_allowed);
@@ -1284,11 +1260,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                    this->_minsc[rdi],
                                    rnd,
                                    (index_t)this->_minK_local,
-                                   (index_t)tpol.minIntronLen(),
-                                   (index_t)tpol.maxIntronLen(),
-                                   tpol.minAnchorLen(),
-                                   tpol.minAnchorLen_noncan(),
-                                   gpol.maxAltsTried(),
+                                   tpol,
+                                   gpol,
                                    leftext,
                                    rightext);
                     if(tempHit.len() <= 0)
@@ -1368,11 +1341,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                            this->_minsc[rdi],
                            rnd,
                            (index_t)this->_minK_local,
-                           (index_t)tpol.minIntronLen(),
-                           (index_t)tpol.maxIntronLen(),
-                           tpol.minAnchorLen(),
-                           tpol.minAnchorLen_noncan(),
-                           gpol.maxAltsTried(),
+                           tpol,
+                           gpol,
                            leftext,
                            rightext,
                            1);
@@ -1389,7 +1359,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
         // Use at most two local indexes
         const index_t max_count = 2;
         int64_t prev_score = hit.score();
-        local_genomeHits.clear();
+        this->_local_genomeHits[dep].clear();
         while(!success && count++ < max_count && use_localindex) {
             if(him.localindexatts >= this->max_localindexatts) break;
             if(first) {
@@ -1491,7 +1461,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                  (index_t)coord.off(),
                                  (index_t)coord.joinedOff(),
                                  this->_sharedVars);
-                    if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol.maxAltsTried())) continue;
+                    if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
                     // check if the partial alignment is compatible with the new alignment using the local index
                     if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) {
                         if(count == 1) continue;
@@ -1511,11 +1481,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                    this->_minsc[rdi],
                                    rnd,
                                    (index_t)this->_minK_local,
-                                   (index_t)tpol.minIntronLen(),
-                                   (index_t)tpol.maxIntronLen(),
-                                   tpol.minAnchorLen(),
-                                   tpol.minAnchorLen_noncan(),
-                                   gpol.maxAltsTried(),
+                                   tpol,
+                                   gpol,
                                    leftext,
                                    rightext);
                     GenomeHit<index_t> combinedHit = hit;
@@ -1572,7 +1539,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                                                    dep + 1);
                             maxsc = max<int64_t>(maxsc, tmp_maxsc);
                         } else {
-                            local_genomeHits.push_back(combinedHit);
+                            this->_local_genomeHits[dep].push_back(combinedHit);
                         }
                     }
                 }
@@ -1581,8 +1548,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
             if(maxsc >= prev_score - sc.mmpMax) success = true;
             if(!success &&
                (him.localindexatts >= this->max_localindexatts || count == max_count || hGFM->nextLocalGFM(lGFM) == NULL) ) {
-                for(index_t ti = 0; ti < local_genomeHits.size(); ti++) {
-                    GenomeHit<index_t>& tempHit = local_genomeHits[ti];
+                for(index_t ti = 0; ti < this->_local_genomeHits[dep].size(); ti++) {
+                    GenomeHit<index_t>& tempHit = this->_local_genomeHits[dep][ti];
                     int64_t minsc = this->_minsc[rdi];
                     if(!this->_secondary) {
                         if(rdi == 0) minsc = max(minsc, sink.bestUnp1());
@@ -1678,7 +1645,7 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                      (index_t)coord.off(),
                                      (index_t)coord.joinedOff(),
                                      this->_sharedVars);
-                        if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol.maxAltsTried())) continue;
+                        if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
                         if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue;
                         index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX;
                         tempHit.extend(
@@ -1694,11 +1661,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                                        this->_minsc[rdi],
                                        rnd,
                                        (index_t)this->_minK_local,
-                                       (index_t)tpol.minIntronLen(),
-                                       (index_t)tpol.maxIntronLen(),
-                                       tpol.minAnchorLen(),
-                                       tpol.minAnchorLen_noncan(),
-                                       gpol.maxAltsTried(),
+                                       tpol,
+                                       gpol,
                                        leftext,
                                        rightext);
                         GenomeHit<index_t> combinedHit = hit;
@@ -1825,11 +1789,8 @@ int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
                            this->_minsc[rdi],
                            rnd,
                            (index_t)this->_minK_local,
-                           (index_t)tpol.minIntronLen(),
-                           (index_t)tpol.maxIntronLen(),
-                           tpol.minAnchorLen(),
-                           tpol.minAnchorLen_noncan(),
-                           gpol.maxAltsTried(),
+                           tpol,
+                           gpol,
                            leftext,
                            rightext,
                            num_mismatch_allowed);
diff --git a/tp.h b/tp.h
index 0219ba2..374950f 100644
--- a/tp.h
+++ b/tp.h
@@ -45,7 +45,8 @@ public:
                         bool no_spliced_alignment = false,
                         bool transcriptome_mapping_only = false,
                         bool transcriptome_assembly = false,
-                        bool xs_only = false)
+                        bool xs_only = false,
+                        bool avoid_pseudogene = false)
     {
         init(minIntronLen,
              maxIntronLen,
@@ -54,7 +55,8 @@ public:
              no_spliced_alignment,
              transcriptome_mapping_only,
              transcriptome_assembly,
-             xs_only);
+             xs_only,
+             avoid_pseudogene);
     }
     
     /**
@@ -73,7 +75,8 @@ public:
               bool no_spliced_alignment = false,
               bool transcriptome_mapping_only = false,
               bool transcriptome_assembly = false,
-              bool xs_only = false)
+              bool xs_only = false,
+              bool avoid_pseudogene = false)
     {
         minIntronLen_ = minIntronLen;
         maxIntronLen_ = maxIntronLen;
@@ -83,6 +86,7 @@ public:
         transcriptome_mapping_only_ = transcriptome_mapping_only;
         transcriptome_assembly_ = transcriptome_assembly;
         xs_only_ = xs_only;
+        avoid_pseudogene_ = avoid_pseudogene;
     }
     
     size_t minIntronLen() const { return minIntronLen_; }
@@ -93,6 +97,7 @@ public:
     bool transcriptome_mapping_only() const { return transcriptome_mapping_only_; }
     bool transcriptome_assembly() const { return transcriptome_assembly_; }
     bool xs_only() const { return xs_only_; }
+    bool avoid_pseudogene() const { return avoid_pseudogene_; }
     
 private:
     size_t   minIntronLen_;
@@ -107,6 +112,7 @@ private:
     bool transcriptome_mapping_only_;
     bool transcriptome_assembly_;
     bool xs_only_;
+    bool avoid_pseudogene_;
 };
 
 #endif /*ndef TP_H_*/
diff --git a/util.h b/util.h
index b13a0b6..f9c792c 100644
--- a/util.h
+++ b/util.h
@@ -32,7 +32,7 @@ char* itoa10(const T& value, char* result) {
 	char* out = result;
 	T quotient = value;
 	if(std::numeric_limits<T>::is_signed) {
-		if(quotient <= 0) quotient = -quotient;
+		if(quotient <= 0) quotient = 0-quotient;
 	}
 	// Now write each digit from most to least significant
 	do {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/hisat2.git



More information about the debian-med-commit mailing list