[med-svn] [rsem] 01/01: Imported Upstream version 1.2.29+dfsg

Michael Crusoe misterc-guest at moszumanska.debian.org
Sun May 15 16:34:16 UTC 2016


This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to annotated tag upstream/1.2.29+dfsg
in repository rsem.

commit e67ba117dc6657a5d38ecd6d3728947f93682dcd
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date:   Mon Mar 7 03:56:40 2016 -0800

    Imported Upstream version 1.2.29+dfsg
---
 BamConverter.h                              |  57 ++++---
 BamWriter.h                                 |  42 +++--
 EBSeq/Makefile                              |   7 +-
 EBSeq/calcClusteringInfo.cpp                |   2 -
 EBSeq/install                               |   7 +-
 EBSeq/rsem-for-ebseq-find-DE                |   2 +-
 FOR_CYGWIN_USERS                            |   1 -
 Makefile                                    | 238 +++++++++++++++-------------
 README.md                                   |  42 +++--
 SamParser.h                                 |  28 ++--
 WHAT_IS_NEW                                 |   9 ++
 bc_aux.h                                    |   2 +-
 convert-sam-for-rsem                        |   4 +
 extract-transcript-to-gene-map-from-trinity |   8 +-
 extractRef.cpp                              |  42 +++--
 getUnique.cpp                               |  24 +--
 rsem-calculate-expression                   |  12 +-
 rsem-control-fdr                            |   4 +
 rsem-generate-ngvector                      |   6 +-
 rsem-plot-transcript-wiggles                |   4 +
 rsem-prepare-reference                      |   4 +
 rsem-run-ebseq                              |   4 +
 rsem_perl_utils.pm                          |   2 +-
 samValidator.cpp                            | 103 ------------
 sam_utils.h                                 | 234 ---------------------------
 sampling.h                                  |  67 --------
 scanForPairedEndReads.cpp                   |  46 +++---
 synthesisRef.cpp                            |  47 +++---
 wiggle.cpp                                  |  13 +-
 29 files changed, 387 insertions(+), 674 deletions(-)

diff --git a/BamConverter.h b/BamConverter.h
index 84c0a30..ed9a6b8 100644
--- a/BamConverter.h
+++ b/BamConverter.h
@@ -8,8 +8,7 @@
 #include<map>
 
 #include <stdint.h>
-#include "bam.h"
-#include "sam.h"
+#include "htslib/sam.h"
 #include "sam_utils.h"
 
 #include "utils.h"
@@ -25,7 +24,8 @@ public:
 
 	void process();
 private:
-	samfile_t *in, *out;
+	samFile *in, *out;
+	bam_hdr_t *in_header, *out_header;
 	Transcripts& transcripts;
 
 	std::map<std::string, int> refmap;
@@ -46,25 +46,29 @@ BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_l
 {
 	general_assert(transcripts.getType() == 0, "Genome information is not provided! RSEM cannot convert the transcript bam file!");
 
-	in = samopen(inpF, "r", NULL);
+	in = sam_open(inpF, "r");
 	assert(in != 0);
+	in_header = sam_hdr_read(in);
+	assert(in_header != 0);
 
-	transcripts.buildMappings(in->header->n_targets, in->header->target_name);
-
-	bam_hdr_t *out_header = sam_header_read2(chr_list);
+	transcripts.buildMappings(in_header->n_targets, in_header->target_name);
 
+	std::string SQs = fai_headers(chr_list);
+	out_header = sam_hdr_parse(SQs.length(), SQs.c_str());
+	assert(out_header != 0);
+	
 	refmap.clear();
 	for (int i = 0; i < out_header->n_targets; ++i) {
 		refmap[out_header->target_name[i]] = i;
 	}
 
-	if (in->header->l_text > 0) {
+	if (in_header->l_text > 0) {
 		char comment[] = "@CO\tThis BAM file is processed by rsem-tbam2gam to convert from transcript coordinates into genomic coordinates.\n";
 		int comment_len = strlen(comment);
 
 		//Filter @SQ fields if the BAM file is user provided
-		char *text = in->header->text;
-		int l_text = in->header->l_text;
+		char *text = in_header->text;
+		int l_text = in_header->l_text;
 		char *new_text = new char[l_text + comment_len];
 		int pos = 0, s = 0;
 		while (pos < l_text) {
@@ -82,17 +86,18 @@ BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_l
 		delete[] new_text;
 	}
 
-	out = samopen(outF, "wb", out_header);
+	out = sam_open(outF, "wb");
 	assert(out != 0);
+	sam_hdr_write(out, out_header);
 
-	bam_hdr_destroy(out_header);
-
-        if (nThreads > 1) general_assert(samthreads(out, nThreads, 256) == 0, "Fail to create threads for writing the BAM file!");
+	if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
 }
 
 BamConverter::~BamConverter() {
-	samclose(in);
-	samclose(out);
+	bam_hdr_destroy(in_header);
+	sam_close(in);
+	bam_hdr_destroy(out_header);
+	sam_close(out);
 }
 
 void BamConverter::process() {
@@ -105,11 +110,11 @@ void BamConverter::process() {
 	cqname = "";
 	b = bam_init1(); b2 = bam_init1();
 
-	while (samread(in, b) >= 0) {
+	while (sam_read1(in, in_header, b) >= 0) {
 		++cnt;
 		isPaired = bam_is_paired(b);
 		if (isPaired) {
-		  assert(samread(in, b2) >= 0 && bam_is_paired(b2));
+		  assert(sam_read1(in, in_header, b2) >= 0 && bam_is_paired(b2));
 		  if (!bam_is_read1(b)) { bam1_t *tmp = b; b = b2; b2 = tmp; }
 		  assert(bam_is_read1(b) && bam_is_read2(b2));
 		  general_assert((bam_is_mapped(b) && bam_is_mapped(b2)) || (bam_is_unmapped(b) && bam_is_unmapped(b2)), \
@@ -150,8 +155,8 @@ void BamConverter::process() {
 		  cqname = qname;
 		  collapseMap.init(isPaired);
 		  
-		  samwrite(out, b);
-		  if (isPaired) samwrite(out, b2);
+		  sam_write1(out, out_header, b);
+		  if (isPaired) sam_write1(out, out_header, b2);
 		}
 	}
 
@@ -192,8 +197,8 @@ void BamConverter::convert(bam1_t* b, const Transcript& transcript) {
 	tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data);
 	assert(core_pos >= 0);
 
-	int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
-	b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
+	int rest_len = b->l_data - b->core.l_qname - b->core.n_cigar * 4;
+	b->l_data = b->core.l_qname + core_n_cigar * 4 + rest_len;
 	expand_data_size(b);
 	uint8_t* pt = b->data + b->core.l_qname;
 	memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
@@ -201,7 +206,7 @@ void BamConverter::convert(bam1_t* b, const Transcript& transcript) {
 
 	b->core.pos = core_pos;
 	b->core.n_cigar = core_n_cigar;
-	b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
+	b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
 
 	modifyTags(b, transcript); // check if need to add XS tag, if need, add it
 }
@@ -221,11 +226,11 @@ inline void BamConverter::writeCollapsedLines() {
 			}
 			// otherwise, just use the MAPQ score of the orignal alignment
 
-			samwrite(out, tmp_b);
+			sam_write1(out, out_header, tmp_b);
 			if (isPaired) {
 				if (p != NULL) memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
 				tmp_b2->core.qual = tmp_b->core.qual;
-				samwrite(out, tmp_b2);
+				sam_write1(out, out_header, tmp_b2);
 			}
 			bam_destroy1(tmp_b);
 			if (isPaired) bam_destroy1(tmp_b2);
@@ -312,7 +317,7 @@ inline void BamConverter::modifyTags(bam1_t* b, const Transcript& transcript) {
 	s = bam_aux_get(b, "XS");
 	if (s != NULL) bam_aux_del(b, s);
 	bool hasN = false;
-	uint32_t* p = bam1_cigar(b);
+	uint32_t* p = bam_get_cigar(b);
 	for (int i = 0; i < (int)b->core.n_cigar; i++)
 		if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; }
 	if (hasN) bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand);
diff --git a/BamWriter.h b/BamWriter.h
index 4fd5aa6..dbcb0f2 100644
--- a/BamWriter.h
+++ b/BamWriter.h
@@ -10,8 +10,7 @@
 #include<iostream>
 
 #include <stdint.h>
-#include "bam.h"
-#include "sam.h"
+#include "htslib/sam.h"
 #include "sam_utils.h"
 
 #include "utils.h"
@@ -32,7 +31,8 @@ public:
 	void work(HitWrapper<SingleHit> wrapper);
 	void work(HitWrapper<PairedEndHit> wrapper);
 private:
-	samfile_t *in, *out;
+	samFile *in, *out;
+	bam_hdr_t *in_header, *out_header;
 	Transcripts& transcripts;
 	
 	void set_alignment_weight(bam1_t *b, double prb) {
@@ -49,31 +49,39 @@ private:
 
 //aux can be NULL
 BamWriter::BamWriter(const char* inpF, const char* aux, const char* outF, Transcripts& transcripts, int nThreads) : transcripts(transcripts) {
-  in = samopen(inpF, "r", aux);
+  in = sam_open(inpF, "r");
   assert(in != 0);
 
+  if (aux == NULL) in_header = sam_hdr_read(in);
+  else {
+    std::string SQs = fai_headers(aux);
+    in_header = sam_hdr_parse(SQs.length(), SQs.c_str());
+  }
+  assert(in_header != 0);
+
   //build mappings from external sid to internal sid
-  transcripts.buildMappings(in->header->n_targets, in->header->target_name);
+  transcripts.buildMappings(in_header->n_targets, in_header->target_name);
   
   //generate output's header
-  bam_hdr_t *out_header = bam_header_dwt(in->header);
+  out_header = bam_header_dwt(in_header);
   
   std::ostringstream strout;
   strout<<"@HD\tVN:1.4\tSO:unknown\n at PG\tID:RSEM\n";
   std::string content = strout.str();
   append_header_text(out_header, content.c_str(), content.length());
   
-  out = samopen(outF, "wb", out_header); // If CRAM format is desired, use "wc"
+  out = sam_open(outF, "wb"); // If CRAM format is desired, use "wc"
   assert(out != 0);
+  sam_hdr_write(out, out_header);
     
-  bam_hdr_destroy(out_header);
-
-  if (nThreads > 1) general_assert(samthreads(out, nThreads, 256) == 0, "Fail to create threads for writing the BAM file!");
+  if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
 }
 
 BamWriter::~BamWriter() {
-	samclose(in);
-	samclose(out);
+	bam_hdr_destroy(in_header);
+	sam_close(in);
+	bam_hdr_destroy(out_header);
+	sam_close(out);
 }
 
 void BamWriter::work(HitWrapper<SingleHit> wrapper) {
@@ -84,7 +92,7 @@ void BamWriter::work(HitWrapper<SingleHit> wrapper) {
 
 	b = bam_init1();
 
-	while (samread(in, b) >= 0) {
+	while (sam_read1(in, in_header, b) >= 0) {
 		++cnt;
 		if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
 
@@ -95,7 +103,7 @@ void BamWriter::work(HitWrapper<SingleHit> wrapper) {
 		  assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
 		  set_alignment_weight(b, hit->getConPrb());
 		}
-		samwrite(out, b);
+		sam_write1(out, out_header, b);
 	}
 
 	assert(wrapper.getNextHit() == NULL);
@@ -113,7 +121,7 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
 	b = bam_init1();
 	b2 = bam_init1();
 
-	while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
+	while (sam_read1(in, in_header, b) >= 0 && sam_read1(in, in_header, b2) >= 0) {
 		cnt += 2;
 		if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
 
@@ -130,8 +138,8 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
 			set_alignment_weight(b2, hit->getConPrb());
 		}
 
-		samwrite(out, b);
-		samwrite(out, b2);
+		sam_write1(out, out_header, b);
+		sam_write1(out, out_header, b2);
 	}
 
 	assert(wrapper.getNextHit() == NULL);
diff --git a/EBSeq/Makefile b/EBSeq/Makefile
index 20a3559..ee358a9 100644
--- a/EBSeq/Makefile
+++ b/EBSeq/Makefile
@@ -1,5 +1,6 @@
-CC = g++
+CXX = g++
 PROGRAMS = EBSeq rsem-for-ebseq-calculate-clustering-info
+DEPENDENCIES = blockmodeling gplots gtools gdata caTools bitops KernSmooth
 
 .PHONY : all EBSeq clean
 
@@ -9,7 +10,7 @@ EBSeq :
 	./install
 
 rsem-for-ebseq-calculate-clustering-info : calcClusteringInfo.cpp
-	$(CC) -O3 -Wall calcClusteringInfo.cpp -o $@
+	$(CXX) -O3 -Wall calcClusteringInfo.cpp -o $@
 
 clean : 
-	rm -rf blockmodeling gplots $(PROGRAMS) *~ BiocInstaller
+	rm -rf $(PROGRAMS) $(DEPENDENCIES) BiocInstaller *~
diff --git a/EBSeq/calcClusteringInfo.cpp b/EBSeq/calcClusteringInfo.cpp
index 27357b2..9caefbc 100644
--- a/EBSeq/calcClusteringInfo.cpp
+++ b/EBSeq/calcClusteringInfo.cpp
@@ -10,8 +10,6 @@
 #include<algorithm>
 using namespace std;
 
-const int STRLEN = 1005;
-
 int M;
 int k; // k-mer size
 vector<string> names;
diff --git a/EBSeq/install b/EBSeq/install
index 4a8b807..ee2eab6 100755
--- a/EBSeq/install
+++ b/EBSeq/install
@@ -7,17 +7,18 @@ result <- suppressWarnings(tryCatch({
        }, error = function(err) {
             tryCatch({
 		source("http://www.bioconductor.org/biocLite.R")
-                biocLite("BiocUpgrade")
+                try(biocLite("BiocUpgrade"))
     		biocLite("EBSeq", lib = ".")
     		library("EBSeq")
 		cat("EBSeq v", as.character(packageVersion("EBSeq")), " is successfully installed from Bioconductor.\n", sep = "")
 		}, error = function(err) {
 		     tryCatch({
 		         cat("Failed to install EBSeq from Bioconductor! Try to install EBSeq v1.2.0 locally instead.\n")
-      			 install.packages(c("blockmodeling_0.1.8.tar.gz", "gplots_2.17.0.tar.gz", "EBSeq_1.2.0.tar.gz"), lib = ".", repos = NULL, type = "source")
+      			 install.packages(c("blockmodeling_0.1.8.tar.gz", "gtools_3.5.0.tar.gz", "gdata_2.17.0.tar.gz", "bitops_1.0-6.tar.gz",
+			                    "caTools_1.17.1.tar.gz", "KernSmooth_2.23-15.tar.gz", "gplots_2.17.0.tar.gz", "EBSeq_1.2.0.tar.gz"),
+					  lib = ".", repos = NULL, type = "source")
       			 library("EBSeq") 
 			 cat("EBSeq v1.2.0 is successfully installed locally.\n")
 			 }, error = function(err) { cat("Failed to install EBSeq v1.2.0 locally!\n") })
 		    })
 	}))
-
diff --git a/EBSeq/rsem-for-ebseq-find-DE b/EBSeq/rsem-for-ebseq-find-DE
index 756c745..62aad0d 100755
--- a/EBSeq/rsem-for-ebseq-find-DE
+++ b/EBSeq/rsem-for-ebseq-find-DE
@@ -15,7 +15,7 @@ norm_out_file <- paste0(output_file, ".normalized_data_matrix")
 nc <- length(argv) - 4;
 num_reps <- as.numeric(argv[5:(5+nc-1)])
 
-.libPaths(c(path, .libPaths()))
+.libPaths(c(.libPaths(), path))
 library(EBSeq)
 
 DataMat <- data.matrix(read.table(data_matrix_file))
diff --git a/FOR_CYGWIN_USERS b/FOR_CYGWIN_USERS
deleted file mode 100644
index b743e81..0000000
--- a/FOR_CYGWIN_USERS
+++ /dev/null
@@ -1 +0,0 @@
-Please uncomment the 4th and 8th lines in sam/Makefile before compiling RSEM.
diff --git a/Makefile b/Makefile
index 8c73412..973828f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,158 +1,172 @@
-CC = g++
-CFLAGS = -Wall -c -I.
-COFLAGS = -Wall -O3 -ffast-math -c -I.
-LFLAGS = -Wall -O3 -I.
-
 SAMTOOLS = samtools-1.3
 HTSLIB = htslib-1.3
-SAMFLAGS = -I$(SAMTOOLS) -I$(SAMTOOLS)/$(HTSLIB)
-SAMLIBS = $(SAMTOOLS)/libbam.a $(SAMTOOLS)/$(HTSLIB)/libhts.a
-PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth rsem-sam-validator rsem-scan-for-paired-end-reads
 
+ifneq ($(cygwin), true)
+  SAMTOOLS_MAKEFILE = Makefile
+else
+  SAMTOOLS_MAKEFILE = Makefile.cygwin
+endif
 
-.PHONY : all ebseq clean
+# overridable, defaulting to local copy
+BOOST = .
 
-all : $(PROGRAMS)
+# Compilation variables
+CXX = g++
+CXXFLAGS = -std=gnu++98 -Wall -I. -I$(BOOST) -I$(SAMTOOLS)/$(HTSLIB)
+CPPFLAGS =
 
-$(SAMTOOLS)/libbam.a : $(SAMTOOLS)/$(HTSLIB)/libhts.a
+LDFLAGS =
+LDLIBS =
 
-$(SAMTOOLS)/$(HTSLIB)/libhts.a : 
-	cd $(SAMTOOLS) ; ${MAKE} all
+# Installation variables
+INSTALL = install
+INSTALL_PROGRAM = $(INSTALL) -p
+INSTALL_DATA = $(INSTALL) -p -m 644
+INSTALL_DIR = $(INSTALL) -d
+STRIP ?=strip
 
-Transcript.h : utils.h
+prefix ?= /usr/local
+exec_prefix = $(prefix)
+bindir = $(exec_prefix)/bin
 
-Transcripts.h : utils.h my_assert.h Transcript.h
+# Auxiliary variables for compilation
+SAMHEADERS = $(SAMTOOLS)/$(HTSLIB)/htslib/sam.h
+SAMLIBS = $(SAMTOOLS)/$(HTSLIB)/libhts.a
+CONFIGURE = ./configure
 
-rsem-extract-reference-transcripts : utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
-	$(CC) $(LFLAGS) extractRef.cpp -o rsem-extract-reference-transcripts
+OBJS1 = parseIt.o
+OBJS2 = extractRef.o synthesisRef.o preRef.o buildReadIndex.o wiggle.o tbam2bam.o bam2wig.o bam2readdepth.o getUnique.o samValidator.o scanForPairedEndReads.o
+OBJS3 = EM.o Gibbs.o calcCI.o simulation.o
 
-rsem-synthesis-reference-transcripts : utils.h my_assert.h Transcript.h Transcripts.h synthesisRef.cpp
-	$(CC) $(LFLAGS) synthesisRef.cpp -o rsem-synthesis-reference-transcripts
+PROGS1 = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-build-read-index rsem-simulate-reads
+PROGS2 = rsem-parse-alignments rsem-run-em rsem-tbam2gbam rsem-bam2wig rsem-bam2readdepth rsem-get-unique rsem-sam-validator rsem-scan-for-paired-end-reads
+PROGS3 = rsem-run-gibbs rsem-calculate-credibility-intervals
 
-BowtieRefSeqPolicy.h : RefSeqPolicy.h
+PROGRAMS = $(PROGS1) $(PROGS2) $(PROGS3)
 
-RefSeq.h : utils.h
-
-Refs.h : utils.h RefSeq.h RefSeqPolicy.h PolyARules.h
+# Auxiliary variables for installation
+SCRIPTS = rsem-prepare-reference rsem-calculate-expression rsem-refseq-extract-primary-assembly rsem-gff3-to-gtf rsem-plot-model \
+	  rsem-plot-transcript-wiggles rsem-gen-transcript-plots rsem-generate-data-matrix \
+	  extract-transcript-to-gene-map-from-trinity convert-sam-for-rsem    
 
 
-rsem-preref : preRef.o
-	$(CC) preRef.o -o rsem-preref
 
-preRef.o : utils.h RefSeq.h Refs.h PolyARules.h RefSeqPolicy.h AlignerRefSeqPolicy.h preRef.cpp
-	$(CC) $(COFLAGS) preRef.cpp
 
+.PHONY : all ebseq clean
 
-SingleRead.h : Read.h
-
-SingleReadQ.h : Read.h
+all : $(PROGRAMS) $(SAMTOOLS)/samtools
 
-PairedEndRead.h : Read.h SingleRead.h
+$(SAMTOOLS)/samtools :
+	cd $(SAMTOOLS) && $(CONFIGURE) --without-curses && $(MAKE) -f $(SAMTOOLS_MAKEFILE) samtools
 
-PairedEndReadQ.h : Read.h SingleReadQ.h
+$(SAMLIBS) : $(SAMTOOLS)/samtools
 
 
-PairedEndHit.h : SingleHit.h
+# Compile objects
+$(OBJS1) :
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -O2 -c -o $@ $<
 
-HitContainer.h : GroupInfo.h
+$(OBJS2) :
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -O3 -c -o $@ $<
 
-sam_utils.h : $(SAMTOOLS)/bam.h Transcript.h Transcripts.h
+$(OBJS3) :
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -O3 -ffast-math -c -o $@ $<
 
-SamParser.h : $(SAMTOOLS)/sam.h $(SAMTOOLS)/bam.h sam_utils.h utils.h my_assert.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcripts.h
 
+# Generate executables
+$(PROGS1) :
+	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
 
-rsem-parse-alignments : parseIt.o $(SAMLIBS)
-	$(CC) -o rsem-parse-alignments parseIt.o $(SAMLIBS) -lz -lpthread 
+$(PROGS2) :
+	$(CXX) $(LDFLAGS) -pthread -o $@ $^ $(LDLIBS) -lz
 
-parseIt.o : $(SAMTOOLS)/sam.h $(SAMTOOLS)/bam.h sam_utils.h utils.h my_assert.h GroupInfo.h Transcripts.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h parseIt.cpp
-	$(CC) -Wall -O2 -c -I. $(SAMFLAGS) parseIt.cpp
+$(PROGS3) :
+	$(CXX) $(LDFLAGS) -pthread -o $@ $^ $(LDLIBS)
 
 
-rsem-build-read-index : utils.h buildReadIndex.cpp
-	$(CC) $(LFLAGS) buildReadIndex.cpp -o rsem-build-read-index
+# Dependencies for executables
+rsem-extract-reference-transcripts : extractRef.o
+rsem-synthesis-reference-transcripts : synthesisRef.o
+rsem-preref : preRef.o
+rsem-build-read-index : buildReadIndex.o
+rsem-simulate-reads : simulation.o
 
+rsem-parse-alignments : parseIt.o $(SAMLIBS)
+rsem-run-em : EM.o $(SAMLIBS)
+rsem-tbam2gbam : tbam2gbam.o $(SAMLIBS)
+rsem-bam2wig : bam2wig.o wiggle.o $(SAMLIBS)
+rsem-bam2readdepth : bam2readdepth.o wiggle.o $(SAMLIBS)
+rsem-get-unique : getUnique.o $(SAMLIBS)
+rsem-sam-validator : samValidator.o $(SAMLIBS)
+rsem-scan-for-paired-end-reads : scanForPairedEndReads.o $(SAMLIBS)
 
-simul.h : boost/random.hpp
+rsem-run-gibbs : Gibbs.o
+rsem-calculate-credibility-intervals : calcCI.o
 
+# Dependencies for objects
+parseIt.o : parseIt.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h GroupInfo.h Transcripts.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h
+
+extractRef.o : extractRef.cpp utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h
+synthesisRef.o : synthesisRef.cpp utils.h my_assert.h Transcript.h Transcripts.h
+preRef.o : preRef.cpp utils.h RefSeq.h Refs.h PolyARules.h RefSeqPolicy.h AlignerRefSeqPolicy.h
+buildReadIndex.o : buildReadIndex.cpp utils.h
+wiggle.o: wiggle.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h wiggle.h
+tbam2bam.o : tbam2gbam.cpp $(SAMHEADERS) utils.h Transcripts.h Transcript.h BamConverter.h sam_utils.h my_assert.h bc_aux.h
+bam2wig.o : bam2wig.cpp utils.h my_assert.h wiggle.h
+bam2readdepth.o : bam2readdepth.cpp utils.h my_assert.h wiggle.h
+getUnique.o : getUnique.cpp $(SAMHEADERS) sam_utils.h utils.h 
+samValidator.o : samValidator.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h
+scanForPairedEndReads.o : scanForPairedEndReads.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h 
+
+EM.o : EM.cpp $(SAMHEADERS) utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h simul.h sam_utils.h sa [...]
+Gibbs.o : Gibbs.cpp utils.h my_assert.h $(BOOST)/boost/random.hpp sampling.h simul.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h ModelParams.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h 
+calcCI.o : calcCI.cpp utils.h my_assert.h $(BOOST)/boost/random.hpp sampling.h simul.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h ModelParams.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Buffer.h 
+simulation.o : simulation.cpp utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h $(BOOST)/boost/random.hpp WriteResults.h
+
+# Dependencies for header files
+Transcript.h : utils.h
+Transcripts.h : utils.h my_assert.h Transcript.h
+BowtieRefSeqPolicy.h : RefSeqPolicy.h
+RefSeq.h : utils.h
+Refs.h : utils.h RefSeq.h RefSeqPolicy.h PolyARules.h
+SingleRead.h : Read.h
+SingleReadQ.h : Read.h
+PairedEndRead.h : Read.h SingleRead.h
+PairedEndReadQ.h : Read.h SingleReadQ.h
+PairedEndHit.h : SingleHit.h
+HitContainer.h : GroupInfo.h
+sam_utils.h : $(SAMHEADERS) Transcript.h Transcripts.h
+SamParser.h : $(SAMHEADERS) sam_utils.h utils.h my_assert.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcripts.h
+simul.h : $(BOOST)/boost/random.hpp
 ReadReader.h : SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h ReadIndex.h
-
 SingleModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h
-
 SingleQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h SingleHit.h ReadReader.h simul.h
-
 PairedEndModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h PairedEndRead.h PairedEndHit.h ReadReader.h simul.h 
-
 PairedEndQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h PairedEndReadQ.h PairedEndHit.h ReadReader.h simul.h
-
 HitWrapper.h : HitContainer.h
-
-
-
-BamWriter.h : $(SAMTOOLS)/sam.h $(SAMTOOLS)/bam.h sam_utils.h utils.h my_assert.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h
-
-sampling.h : boost/random.hpp
-
+BamWriter.h : $(SAMHEADERS) sam_utils.h utils.h my_assert.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h
+sampling.h : $(BOOST)/boost/random.hpp
 WriteResults.h : utils.h my_assert.h GroupInfo.h Transcript.h Transcripts.h RefSeq.h Refs.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h
-
-rsem-run-em : EM.o $(SAMLIBS)
-	$(CC) -o rsem-run-em EM.o $(SAMLIBS) -lz -lpthread
-
-EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h $(SAMTOOLS)/bam.h $(SAMTOOLS)/sam.h simul.h [...]
-	$(CC) $(COFLAGS) $(SAMFLAGS) EM.cpp
-
-bc_aux.h : $(SAMTOOLS)/bam.h
-
-BamConverter.h : $(SAMTOOLS)/bam.h $(SAMTOOLS)/sam.h sam_utils.h utils.h my_assert.h bc_aux.h Transcript.h Transcripts.h
-
-rsem-tbam2gbam : utils.h Transcripts.h Transcript.h BamConverter.h $(SAMTOOLS)/sam.h $(SAMTOOLS)/bam.h sam_utils.h my_assert.h bc_aux.h tbam2gbam.cpp $(SAMLIBS)
-	$(CC) $(LFLAGS) $(SAMFLAGS) tbam2gbam.cpp $(SAMLIBS) -lz -lpthread -o $@
-
-wiggle.o: $(SAMTOOLS)/bam.h $(SAMTOOLS)/sam.h sam_utils.h utils.h my_assert.h wiggle.h wiggle.cpp
-	$(CC) $(COFLAGS) $(SAMFLAGS) wiggle.cpp
-
-rsem-bam2wig : utils.h my_assert.h wiggle.h wiggle.o $(SAMLIBS) bam2wig.cpp
-	$(CC) $(LFLAGS) bam2wig.cpp wiggle.o $(SAMLIBS) -lz -lpthread -o $@
-
-rsem-bam2readdepth : utils.h my_assert.h wiggle.h wiggle.o $(SAMLIBS) bam2readdepth.cpp
-	$(CC) $(LFLAGS) bam2readdepth.cpp wiggle.o $(SAMLIBS) -lz -lpthread -o $@
-
-
-rsem-simulate-reads : simulation.o
-	$(CC) -o rsem-simulate-reads simulation.o
-
-simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp WriteResults.h simulation.cpp
-	$(CC) $(COFLAGS) simulation.cpp
-
-rsem-run-gibbs : Gibbs.o
-	$(CC) -o rsem-run-gibbs Gibbs.o -lpthread
-
-#some header files are omitted
-Gibbs.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Gibbs.cpp 
-	$(CC) $(COFLAGS) Gibbs.cpp
-
+bc_aux.h : $(SAMHEADERS)
+BamConverter.h : $(SAMHEADERS) sam_utils.h utils.h my_assert.h bc_aux.h Transcript.h Transcripts.h
 Buffer.h : my_assert.h
 
-rsem-calculate-credibility-intervals : calcCI.o
-	$(CC) -o rsem-calculate-credibility-intervals calcCI.o -lpthread
-
-#some header files are omitted
-calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Buffer.h calcCI.cpp
-	$(CC) $(COFLAGS) calcCI.cpp
-
-rsem-get-unique : $(SAMTOOLS)/bam.h $(SAMTOOLS)/sam.h sam_utils.h utils.h getUnique.cpp $(SAMLIBS)
-	$(CC) $(LFLAGS) $(SAMFLAGS) getUnique.cpp $(SAMLIBS) -lz -lpthread -o $@
-
-rsem-sam-validator : $(SAMTOOLS)/bam.h $(SAMTOOLS)/sam.h sam_utils.h utils.h my_assert.h samValidator.cpp $(SAMLIBS)
-	$(CC) $(LFLAGS) $(SAMFLAGS) samValidator.cpp $(SAMLIBS) -lz -lpthread -o $@
-
-rsem-scan-for-paired-end-reads : $(SAMTOOLS)/bam.h $(SAMTOOLS)/sam.h sam_utils.h utils.h my_assert.h scanForPairedEndReads.cpp $(SAMLIBS)
-	$(CC) $(LFLAGS) $(SAMFLAGS) scanForPairedEndReads.cpp $(SAMLIBS) -lz -lpthread -o $@
 
+# Compile EBSeq
 ebseq :
-	cd EBSeq ; ${MAKE} all
-
+	cd EBSeq && $(MAKE) all
+
+# Install RSEM
+install : $(PROGRAMS) $(SCRIPTS) $(SAMTOOLS)/samtools rsem_perl_utils.pm
+	$(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(bindir)/$(SAMTOOLS)
+	$(foreach prog,$(PROGRAMS),$(INSTALL_PROGRAM) $(prog) $(DESTDIR)$(bindir)/$(prog) ; $(STRIP) $(DESTDIR)$(bindir)/$(prog) ;)
+	$(INSTALL_PROGRAM) $(SAMTOOLS)/samtools $(DESTDIR)$(bindir)/$(SAMTOOLS)/samtools
+	$(STRIP) $(DESTDIR)$(bindir)/$(SAMTOOLS)/samtools
+	$(foreach script,$(SCRIPTS),$(INSTALL_PROGRAM) $(script) $(DESTDIR)$(bindir)/$(script) ;)
+	$(INSTALL_DATA) rsem_perl_utils.pm $(DESTDIR)$(bindir)/rsem_perl_utils.pm
+
+# Clean
 clean :
 	rm -f *.o *~ $(PROGRAMS)
-	cd $(SAMTOOLS) ; ${MAKE} clean 
-	rm -f $(SAMLIBS)
-	cd EBSeq ; ${MAKE} clean
+	cd $(SAMTOOLS) && $(MAKE) clean-all
+	cd EBSeq && $(MAKE) clean
diff --git a/README.md b/README.md
index 190f2af..611ac84 100644
--- a/README.md
+++ b/README.md
@@ -47,20 +47,33 @@ To compile RSEM, simply run
    
     make
 
-For cygwin users, please uncomment the 3rd and 7th line in
-`sam/Makefile` before you run `make`.
+For Cygwin users, run
+
+    make cygwin=true
 
 To compile EBSeq, which is included in the RSEM package, run
 
     make ebseq
 
-To install, simply put the rsem directory in your environment's PATH
-variable.
+To install RSEM, simply put the RSEM directory in your environment's PATH
+variable. Alternatively, run
+
+    make install
+
+By default, RSEM executables are installed to `/usr/local/bin`. You
+can change the installation location by setting `DESTDIR` and/or
+`prefix` variables. The RSEM executables will be installed to
+`${DESTDIR}${prefix}/bin`. The default values of `DESTDIR` and
+`prefix` are `DESTDIR=` and `prefix=/usr/local`. For example,
+
+    make install DESTDIR=/home/my_name prefix=/software
+
+will install RSEM executables to `/home/my_name/software/bin`.
 
-If you prefer to put all RSEM executables to a bin directory, please
-also remember to put `rsem_perl_utils.pm` to the same bin
-directory. `rsem_perl_utils.pm` is required for most RSEM's perl
-scripts.
+**Note** that `make install` does not install `EBSeq` related scripts,
+such as `rsem-generate-ngvector`, `rsem-run-ebseq`, and
+`rsem-control-fdr`. But `rsem-generate-data-matrix`, which generates
+count matrix for differential expression analysis, is installed.
 
 ### Prerequisites
 
@@ -494,7 +507,7 @@ detection. However, for de novo assembled transcriptome, it is hard to
 obtain an accurate gene-isoform relationship. Instead, RSEM provides a
 script `rsem-generate-ngvector`, which clusters transcripts based on
 measures directly relating to read mappaing ambiguity. First, it
-calcualtes the 'unmappability' of each transcript. The 'unmappability'
+calculates the 'unmappability' of each transcript. The 'unmappability'
 of a transcript is the ratio between the number of k mers with at
 least one perfect match to other transcripts and the total number of k
 mers of this transcript, where k is a parameter. Then, Ng vector is
@@ -572,13 +585,18 @@ RSEM uses the [Boost C++](http://www.boost.org/) and
 [EBSeq](http://www.biostat.wisc.edu/~ningleng/EBSeq_Package/) for
 differential expression analysis.
 
-We thank earonesty and Dr. Samuel Arvidsson for contributing patches.
+We thank earonesty, Dr. Samuel Arvidsson, John Marshall, and Michael
+R. Crusoe for contributing patches.
 
-We thank Han Lin, j.miller, Joël Fillon, Dr. Samuel G. Younkin and Malcolm Cook for suggesting possible fixes. 
+We thank Han Lin, j.miller, Joël Fillon, Dr. Samuel G. Younkin,
+Malcolm Cook, Christina Wells, Uroš Šipetić,
+outpaddling, rekado, and Josh Richer for suggesting possible fixes.
 
 **Note** that `bam_sort.c` of SAMtools is slightly modified so that
   `samtools sort -n` will not move the two mates of paired-end
-  alignments apart.
+  alignments apart. In addition, we turn on the `--without-curses`
+  option when configuring SAMtools and thus SAMtools' curses-based
+  `tview` subcommand is not built.
 
 ## <a name="license"></a> License
 
diff --git a/SamParser.h b/SamParser.h
index bfa1146..fe3d093 100644
--- a/SamParser.h
+++ b/SamParser.h
@@ -9,8 +9,7 @@
 #include<string>
 
 #include <stdint.h>
-#include "bam.h"
-#include "sam.h"
+#include "htslib/sam.h"
 #include "sam_utils.h"
 
 #include "utils.h"
@@ -49,8 +48,8 @@ public:
 	}
 
 private:
-	samfile_t *sam_in;
-	bam_header_t *header;
+	samFile *sam_in;
+	bam_hdr_t *header;
 	bam1_t *b, *b2;
 
 	Transcripts& transcripts;
@@ -87,10 +86,14 @@ char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
 SamParser::SamParser(const char* inpF, const char* aux, Transcripts& transcripts, const char* imdName)
 	: transcripts(transcripts)
 {
-	sam_in = samopen(inpF, "r", aux);
-
+	sam_in = sam_open(inpF, "r");
 	general_assert(sam_in != 0, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
-	header = sam_in->header;
+
+	if (aux == NULL) header = sam_hdr_read(sam_in);
+	else {
+	  std::string SQs = fai_headers(aux);
+	  header = sam_hdr_parse(SQs.length(), SQs.c_str());
+	}
 	general_assert(header != 0, "Fail to parse sam header!");
 
 	transcripts.buildMappings(header->n_targets, header->target_name, imdName);
@@ -100,7 +103,8 @@ SamParser::SamParser(const char* inpF, const char* aux, Transcripts& transcripts
 }
 
 SamParser::~SamParser() {
-	samclose(sam_in);
+	bam_hdr_destroy(header);
+	sam_close(sam_in);
 	bam_destroy1(b);
 	bam_destroy1(b2);
 }
@@ -110,7 +114,7 @@ SamParser::~SamParser() {
 int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
 	int val; // return value
 
-	if (samread(sam_in, b) < 0) return -1;
+	if (sam_read1(sam_in, header, b) < 0) return -1;
 
 	std::string name = bam_get_canonical_name(b);
 	
@@ -139,7 +143,7 @@ int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
 int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
 	int val;
 
-	if (samread(sam_in, b) < 0) return -1;
+	if (sam_read1(sam_in, header, b) < 0) return -1;
 
 	std::string name = bam_get_canonical_name(b);
 	
@@ -169,7 +173,7 @@ int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
 int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
 	int val;
 
-	if ((samread(sam_in, b) < 0) || (samread(sam_in, b2) < 0)) return -1;
+	if ((sam_read1(sam_in, header, b) < 0) || (sam_read1(sam_in, header, b2) < 0)) return -1;
 
 	if (!bam_is_read1(b)) { bam1_t * tmp = b; b = b2; b2 = tmp; }
 	std::string name = bam_get_canonical_name(b);
@@ -208,7 +212,7 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
 int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
 	int val;
 	
-	if ((samread(sam_in, b) < 0) || (samread(sam_in, b2) < 0)) return -1;
+	if ((sam_read1(sam_in, header, b) < 0) || (sam_read1(sam_in, header, b2) < 0)) return -1;
 
 	if (!bam_is_read1(b)) { bam1_t *tmp = b; b = b2; b2 = tmp; } // swap if the first read is not read 1
 	std::string name = bam_get_canonical_name(b);
diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW
index 8309614..1f76820 100644
--- a/WHAT_IS_NEW
+++ b/WHAT_IS_NEW
@@ -1,3 +1,12 @@
+RSEM v1.2.29
+
+- Reformatted Makefile to be more professional, and `make install` is ready to use
+- Enabled `./configure --without-curses` for configuring SAMtools to avoid potential compilation problems due to curses library
+- Fixed bugs for installing EBSeq
+- Improved the readability of RSEM documentation
+
+--------------------------------------------------------------------------------------------
+
 RSEM v1.2.28
 
 - Fixed a bug in RSEM v1.2.27 that can lead to assertion errors for parsing GTF files
diff --git a/bc_aux.h b/bc_aux.h
index 1abd0a7..54f4d67 100644
--- a/bc_aux.h
+++ b/bc_aux.h
@@ -4,7 +4,7 @@
 #include<map>
 
 #include <stdint.h>
-#include "bam.h"
+#include "htslib/sam.h"
 
 struct SingleEndT {
 	bam1_t *b;
diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem
index e723d6b..2a3256e 100755
--- a/convert-sam-for-rsem
+++ b/convert-sam-for-rsem
@@ -58,6 +58,10 @@ __END__
 
 convert-sam-for-rsem
 
+=head1 PURPOSE
+
+Make a RSEM compatible BAM file.
+
 =head1 SYNOPSIS
 
  convert-sam-for-rsem [options] <input.sam/input.bam/input.cram> output_file_name
diff --git a/extract-transcript-to-gene-map-from-trinity b/extract-transcript-to-gene-map-from-trinity
index bf53b9b..1ec63d5 100755
--- a/extract-transcript-to-gene-map-from-trinity
+++ b/extract-transcript-to-gene-map-from-trinity
@@ -21,8 +21,10 @@ while (substr($tag, 0, 1) eq ">") {
     if ($cnt == 0) { print "Warning: Fasta entry $tag has an empty sequence, it is omitted.\n"; }
     else {
 	my ($tid, @tmp) = split(/ /, $tag);
-	my ($comp, $c, @tmp) = split(/_/, $tag);
-	my $gid = join("_", $comp, $c);
+	my $pos = rindex($tid, "_");
+	my $gid = "";
+	if ($pos >= 0) { $gid = substr($tid, 0, $pos); }
+	else { $gid = $tid; }
 	print OUTPUT "$gid\t$tid\n";
     }
     $tag = $line; chomp($tag);
@@ -30,5 +32,3 @@ while (substr($tag, 0, 1) eq ">") {
 
 close(INPUT);
 close(OUTPUT);
-
- 
diff --git a/extractRef.cpp b/extractRef.cpp
index a09380d..1262675 100644
--- a/extractRef.cpp
+++ b/extractRef.cpp
@@ -207,13 +207,6 @@ void parse_gtf_file(char* gtfF) {
 	if (verbose) { printf("Parsing gtf File is done!\n"); }
 }
 
-inline char check(char c, string& seqname, int pos) {
-	general_assert(isalpha(c), "Sequence " + seqname + " contains an unknown letter (ASCII code " + itos(c) + ") at 0-based position " + itos(pos) + "!");
-	if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
-	if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
-	return c;
-}
-
 void shrink() {
   int curp = 0;
 
@@ -283,6 +276,19 @@ void writeResults(char* refName) {
 	if (verbose) { printf("Extracted Sequences File is generated!\n"); }
 }
 
+struct CursorPos {
+  char *filename;
+  int line_no, pos;
+} cursor;
+
+inline char check(char c) {
+  general_assert(isalpha(c), "FASTA file " + cstrtos(cursor.filename) + " contains an unknown character, " + \
+		 ctos(c) + " (ASCII code " + itos(c) + "), at line " + itos(cursor.line_no) + ", position " + itos(cursor.pos + 1) + "!");
+  if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
+  if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
+  return c;
+}
+
 int main(int argc, char* argv[]) {
   if (argc < 7 || ((hasMappingFile = atoi(argv[5])) && argc < 8)) {
 		printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF sources hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n");
@@ -301,7 +307,9 @@ int main(int argc, char* argv[]) {
 	
 	ifstream fin;
 	string line, gseq, seqname;
-
+	int len;
+	size_t seqlen;
+	
 	chrvec.clear();
 
 	seqs.clear();
@@ -310,24 +318,28 @@ int main(int argc, char* argv[]) {
 	for (int i = start; i < argc; i++) {
 		fin.open(argv[i]);
 		general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
+		cursor.filename = argv[i]; cursor.line_no = cursor.pos = 0;
+		
 		getline(fin, line);
 		while ((fin) && (line[0] == '>')) {
 			istringstream strin(line.substr(1));
 			strin>>seqname;
-
-			gseq = "";
+			++cursor.line_no;
+			
+			gseq = ""; seqlen = 0;
 			while((getline(fin, line)) && (line[0] != '>')) {
+			  ++cursor.line_no;
+			  len = line.length();
+			  for (cursor.pos = 0; cursor.pos < len; ++cursor.pos) line[cursor.pos] = check(line[cursor.pos]);
+			  seqlen += len;
 			  gseq += line;
 			}
-
-			size_t len = gseq.length();
-			assert(len > 0);
-			for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j], seqname, j);
+			assert(seqlen > 0);
 			
 			iter = sn2tr.find(seqname);
 			if (iter == sn2tr.end()) continue;
 			
-			chrvec.push_back(ChrInfo(seqname, len));
+			chrvec.push_back(ChrInfo(seqname, seqlen));
 			
 			vector<int>& vec = iter->second;
 			int s = vec.size();
diff --git a/getUnique.cpp b/getUnique.cpp
index d137eb6..3d61589 100644
--- a/getUnique.cpp
+++ b/getUnique.cpp
@@ -6,8 +6,7 @@
 #include<vector>
 
 #include <stdint.h>
-#include "bam.h"
-#include "sam.h"
+#include "htslib/sam.h"
 #include "sam_utils.h"
 
 #include "utils.h"
@@ -17,7 +16,8 @@ using namespace std;
 
 int nThreads;
 string cqname;
-samfile_t *in, *out;
+samFile *in, *out;
+bam_hdr_t *header;
 bam1_t *b;
 vector<bam1_t*> arr;
 bool unaligned;
@@ -26,7 +26,7 @@ void output() {
 	if (unaligned || arr.size() == 0) return;
 	bool isPaired = bam_is_paired(arr[0]);
 	if ((isPaired && arr.size() != 2) || (!isPaired && arr.size() != 1)) return;
-	for (size_t i = 0; i < arr.size(); ++i) samwrite(out, arr[i]);
+	for (size_t i = 0; i < arr.size(); ++i) sam_write1(out, header, arr[i]);
 }
 
 int main(int argc, char* argv[]) {
@@ -36,11 +36,14 @@ int main(int argc, char* argv[]) {
 	}
 
         nThreads = atoi(argv[1]);
-	in = samopen(argv[2], "r", NULL);
+	in = sam_open(argv[2], "r");
 	assert(in != 0);
-	out = samopen(argv[3], "wb", in->header);
+	header = sam_hdr_read(in);
+	assert(header != 0);
+	out = sam_open(argv[3], "wb");
 	assert(out != 0);
-        if (nThreads > 1) general_assert(samthreads(out, nThreads, 256) == 0, "Fail to create threads for writing the BAM file!");
+	sam_hdr_write(out, header);
+	if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
 
 	HIT_INT_TYPE cnt = 0;
 
@@ -49,7 +52,7 @@ int main(int argc, char* argv[]) {
 	b = bam_init1();
 	unaligned = false;
 
-	while (samread(in, b) >= 0) {
+	while (sam_read1(in, header, b) >= 0) {
 		if (cqname != bam_get_qname(b)) {
 			output();
 			cqname = bam_get_qname(b);
@@ -70,8 +73,9 @@ int main(int argc, char* argv[]) {
 	output();
 
 	bam_destroy1(b);
-	samclose(in);
-	samclose(out);
+	bam_hdr_destroy(header);
+	sam_close(in);
+	sam_close(out);
 
 	printf("done!\n");
 
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index b17ac40..8f46646 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -571,6 +571,10 @@ __END__
 
 rsem-calculate-expression
 
+=head1 PURPOSE
+
+Estimate gene and isoform expression from RNA-Seq data.
+
 =head1 SYNOPSIS
 
  rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name 
@@ -645,7 +649,7 @@ If gene_name/transcript_name is available, append it to the end of gene_id/trans
 
 =item B<--seed> <uint32>
 
-Set the seed for the random number generators used in calculating posterior mean estimates and credibility intervals. The seed must be a non-negative 32 bit interger. (Default: off)
+Set the seed for the random number generators used in calculating posterior mean estimates and credibility intervals. The seed must be a non-negative 32 bit integer. (Default: off)
 
 =item B<--single-cell-prior>
 
@@ -679,7 +683,7 @@ Show version information.
 
 =item B<--sort-bam-by-read-name>
 
-Sort BAM file aligned under transcript coordidate by read name. Setting this option on will produce determinstic maximum likelihood estimations from independent runs. Note that sorting will take long time and lots of memory. (Default: off)
+Sort BAM file aligned under transcript coordidate by read name. Setting this option on will produce deterministic maximum likelihood estimations from independent runs. Note that sorting will take long time and lots of memory. (Default: off)
 
 =item B<--no-bam-output>
 
@@ -687,7 +691,7 @@ Do not output any BAM file. (Default: off)
 
 =item B<--sampling-for-bam>
 
-When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
+When RSEM generates a BAM file, instead of outputting all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
 
 =item B<--output-genome-bam>
 
@@ -697,7 +701,7 @@ Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic
 
 Sort RSEM generated transcript and genome BAM files by coordinates and build associated indices. (Default: off)  
 
-=item B<--sort-bam-memory> <string>
+=item B<--sort-bam-memory-per-thread> <string>
 
 Set the maximum memory per thread that can be used by 'samtools sort'. <string> represents the memory and accepts suffices 'K/M/G'. RSEM will pass <string> to the '-m' option of 'samtools sort'. Note that the default used here is different from the default used by samtools. (Default: 1G)
 
diff --git a/rsem-control-fdr b/rsem-control-fdr
index eb8f345..89cfa66 100755
--- a/rsem-control-fdr
+++ b/rsem-control-fdr
@@ -64,6 +64,10 @@ __END__
 
 rsem-control-fdr
 
+=head1 PURPOSE
+
+Filter EBSeq output for statistical significance.
+
 =head1 SYNOPSIS
 
 rsem-control-fdr [options] input_file fdr_rate output_file
diff --git a/rsem-generate-ngvector b/rsem-generate-ngvector
index 93f1a79..0999e08 100755
--- a/rsem-generate-ngvector
+++ b/rsem-generate-ngvector
@@ -35,6 +35,10 @@ __END__
 
 rsem-generate-ngvector
 
+=head1 PURPOSE
+
+Create Ng vector for EBSeq based only on transcript sequences.
+
 =head1 SYNOPSIS
 
 rsem-generate-ngvector [options] input_fasta_file output_name
@@ -69,7 +73,7 @@ Show help information.
 
 =head1 DESCRIPTION
 
-This program generates the Ng vector required by EBSeq for isoform level differential expression analysis based on reference sequences only. EBSeq can take variance due to read mapping ambiguity into consideration by grouping isoforms with parent gene's number of isoforms. However, for de novo assembled transcriptome, it is hard to obtain an accurate gene-isoform relationship. Instead, this program groups isoforms by using measures on read mappaing ambiguity directly. First, it calcualte [...]
+This program generates the Ng vector required by EBSeq for isoform level differential expression analysis based on reference sequences only. EBSeq can take variance due to read mapping ambiguity into consideration by grouping isoforms with parent gene's number of isoforms. However, for de novo assembled transcriptome, it is hard to obtain an accurate gene-isoform relationship. Instead, this program groups isoforms by using measures on read mappaing ambiguity directly. First, it calculate [...]
 
 If your reference is a de novo assembled transcript set, you should run 'rsem-generate-ngvector' first. Then load the resulting 'output_name.ngvec' into R. For example, you can use
 
diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles
index a005fde..ea9a37c 100755
--- a/rsem-plot-transcript-wiggles
+++ b/rsem-plot-transcript-wiggles
@@ -83,6 +83,10 @@ __END__
 
 rsem-plot-transcript-wiggles
 
+=head1 PURPOSE
+
+Generate PDF wiggle plots from transcript or gene ids
+
 =head1 SYNOPSIS
 
  rsem-plot-transcript-wiggles [options] sample_name input_list output_plot_file
diff --git a/rsem-prepare-reference b/rsem-prepare-reference
index 8f72abc..66e0e81 100755
--- a/rsem-prepare-reference
+++ b/rsem-prepare-reference
@@ -160,6 +160,10 @@ __END__
 
 rsem-prepare-reference
 
+=head1 PURPOSE
+
+Prepare transcript references for RSEM and optionally build BOWTIE/BOWTIE2/STAR indices.
+
 =head1 SYNOPSIS
 
  rsem-prepare-reference [options] reference_fasta_file(s) reference_name
diff --git a/rsem-run-ebseq b/rsem-run-ebseq
index 7f65066..64e6689 100755
--- a/rsem-run-ebseq
+++ b/rsem-run-ebseq
@@ -40,6 +40,10 @@ __END__
 
 rsem-run-ebseq
 
+=head1 PURPOSE
+
+Wrapper for EBSeq to perform differential expression analysis.
+
 =head1 SYNOPSIS
 
 rsem-run-ebseq [options] data_matrix_file conditions output_file
diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm
index c53bf42..c4c6046 100644
--- a/rsem_perl_utils.pm
+++ b/rsem_perl_utils.pm
@@ -9,7 +9,7 @@ our @ISA = qw(Exporter);
 our @EXPORT = qw(runCommand);
 our @EXPORT_OK = qw(runCommand collectResults showVersionInfo getSAMTOOLS);
 
-my $version = "RSEM v1.2.28"; # Update version info here
+my $version = "RSEM v1.2.29"; # Update version info here
 my $samtools = "samtools-1.3"; # If update to another version of SAMtools, need to change this
 
 # command, {err_msg}
diff --git a/samValidator.cpp b/samValidator.cpp
deleted file mode 100644
index 90dd6b4..0000000
--- a/samValidator.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-#include<cstdio>
-#include<cstring>
-#include<cstdlib>
-#include<cassert>
-#include<string>
-#include<set>
-
-#include <stdint.h>
-#include "bam.h"
-#include "sam.h"
-#include "sam_utils.h"
-
-#include "utils.h"
-#include "my_assert.h"
-
-using namespace std;
-
-samfile_t *in;
-bam1_t *b, *b2;
-
-set<string> used;
-
-bool isValid;
-
-int main(int argc, char* argv[]) {
-  if (argc != 2) {
-    printf("Usage: rsem-sam-validator <input.sam/input.bam/input.cram>\n");
-    exit(-1);
-  }
-  
-  in = samopen(argv[1], "r", NULL);
-  general_assert(in != 0, "Cannot open input file!");
-  
-  used.clear();
-  b = bam_init1(); b2 = bam_init1();
-  
-  isValid = true;
-  
-  HIT_INT_TYPE cnt = 0;
-  string cqname(""), qname;
-  uint64_t creadlen = 0, readlen;
-  bool cispaired = false, ispaired;
-  
-  printf("."); fflush(stdout);
-  do {
-    if (samread(in, b) < 0) break;
-    assert(b->core.l_qseq > 0);
-    
-    qname = bam_get_canonical_name(b);
-    
-    // if this is a paired-end read
-    ispaired = bam_is_paired(b);
-    if (ispaired) {
-      
-      isValid = (samread(in, b2) >= 0) && (qname == bam_get_canonical_name(b2)) && bam_is_paired(b2);
-      if (!isValid) { printf("\nOnly find one mate for paired-end read %s!\n", qname.c_str()); continue; }
-      assert(b2->core.l_qseq > 0);
-      isValid = !(bam_is_read1(b) && bam_is_read2(b)) && !(bam_is_read1(b2) && bam_is_read2(b2));
-      if (!isValid) { printf("\nRead %s has more than 2 segments (e.g. tripled or more ended reads)!\n", qname.c_str()); continue; }
-      
-      if (!bam_is_read1(b)) { bam1_t *tmp = b; b = b2; b2 = tmp; }
-      isValid = bam_is_read1(b) && bam_is_read2(b2);
-      if (!isValid) { printf("\nThe two mates of paired-end read %s are not adjacent!\n", qname.c_str()); continue; }
-      
-      // both mates are mapped
-      if (bam_is_mapped(b) && bam_is_mapped(b2)) {
-	isValid = (b->core.tid == b2->core.tid) && (b->core.pos == b2->core.mpos) && (b2->core.pos == b->core.mpos);
-	if (!isValid) { printf("\nOne of paired-end read %s's alignment does not have two mates adjacent to each other! If you're running covert-sam-for-rsem now, this might mean the read contains duplicate alignments.\n", qname.c_str()); continue; }
-      }
-      
-      readlen = (uint64_t(b->core.l_qseq) << 32) + b2->core.l_qseq;
-    }
-    else readlen = b->core.l_qseq;
-    
-    if (cqname != qname) {
-      isValid = used.find(qname) == used.end();
-      if (!isValid) { printf("\nThe alignments of read %s are not grouped together!", qname.c_str()); continue; }
-      used.insert(cqname);
-      cqname = qname;
-      creadlen = readlen;
-      cispaired = ispaired;
-    }
-    else {
-      assert(cqname != "");
-      isValid = (creadlen == readlen);
-      if (!isValid) { printf("\nRead %s have different read/mate lengths!\n", qname.c_str()); continue; }
-      isValid = (cispaired == ispaired);
-      if (!isValid) { printf("\nRead %s is detected as both single-end read and paired-end read!\n", qname.c_str()); continue; }
-    }
-    
-    ++cnt;
-    if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
-    
-  } while(isValid);
-  
-  bam_destroy1(b); bam_destroy1(b2);
-  samclose(in);
-  
-  if (isValid) printf("\nThe input file is valid!\n");
-  else printf("The input file is not valid!\n");
-  
-  return 0;
-}
diff --git a/sam_utils.h b/sam_utils.h
deleted file mode 100644
index b53ed2d..0000000
--- a/sam_utils.h
+++ /dev/null
@@ -1,234 +0,0 @@
-#ifndef SAM_UTILS_H_
-#define SAM_UTILS_H_
-
-#include<cstdio>
-#include<cstring>
-#include<vector>
-#include<string>
-#include<stdint.h>
-
-#include "bam.h"
-
-#include "Transcript.h"
-#include "Transcripts.h"
-
-
-/******************************************************/
-
-// These functions are adopted/modified from samtools source codes because the original codes are not visible from sam.h/bam.h
-
-inline int bam_aux_type2size(int x) {
-  if (x == 'C' || x == 'c' || x == 'A') return 1;
-  else if (x == 'S' || x == 's') return 2;
-  else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4;
-  else return 0;
-}
-
-// dwt: duplicate without text
-bam_hdr_t *bam_header_dwt(const bam_hdr_t *ori_h) {
-  bam_hdr_t *h;
-  
-  h = bam_hdr_init();
-  h->n_targets = ori_h->n_targets;
-  h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
-  h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
-  for (int i = 0; i < h->n_targets; ++i) {
-    h->target_len[i] = ori_h->target_len[i];
-    h->target_name[i] = strdup(ori_h->target_name[i]);
-  }
-  
-  return h;
-}
-
-void append_header_text(bam_hdr_t *header, const char* text, int len)
-{
-  if (text == 0) return;
-  
-  int x = header->l_text + 1;
-  int y = header->l_text + len + 1; // 1 byte null
-  kroundup32(x);
-  kroundup32(y);
-  if (x < y) header->text = (char*)realloc(header->text, y);
-  strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here.
-  header->l_text += len;
-  header->text[header->l_text] = 0;
-}
-
-inline void expand_data_size(bam1_t *b) {
-  if (b->m_data < b->data_len) {
-    b->m_data = b->l_data;
-    kroundup32(b->m_data);
-    b->data = (uint8_t*)realloc(b->data, b->m_data);
-  }
-}
-
-/******************************************************/
-
-// These functions are specially designed for RSEM
-
-const char* whitespaces = " \t\n\r\f\v";
-
-inline bool bam_is_paired(const bam1_t* b) { return (b->core.flag & BAM_FPAIRED); }
-inline bool bam_is_proper(const bam1_t* b) { return (b->core.flag & BAM_FPROPER_PAIR); }
-inline bool bam_is_mapped(const bam1_t* b) { return !(b->core.flag & BAM_FUNMAP); }
-inline bool bam_is_unmapped(const bam1_t* b) { return (b->core.flag & BAM_FUNMAP); }
-inline bool bam_is_read1(const bam1_t* b) { return (b->core.flag & BAM_FREAD1); }
-inline bool bam_is_read2(const bam1_t* b) { return (b->core.flag & BAM_FREAD2); }
-
-inline std::string bam_get_canonical_name(const bam1_t* b) {
-  // Retain only the first whitespace-delimited word as the read name
-  // This prevents issues of mismatching names when aligners do not
-  // strip off extra words in read name strings
-  const char* raw_query_name = bam_get_qname(b);
-  const char* whitespace_pos = std::strpbrk(raw_query_name, whitespaces);
-  return (whitespace_pos == NULL ? std::string(raw_query_name) : std::string(raw_query_name, whitespace_pos - raw_query_name));
-}
-
-// Current RSEM only accept matches
-inline bool bam_check_cigar(bam1_t *b) {
-  uint32_t *cigar = bam_get_cigar(b);
-  char op = bam_cigar_op(*cigar);
-  int32_t oplen = bam_cigar_oplen(*cigar);
-  
-  return (b->core.n_cigar == 1) && (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) && (b->core.l_qseq == oplen);
-}
-
-uint8_t bam_prb_to_mapq(double val) {
-  double err = 1.0 - val;
-  if (err <= 1e-10) return 100;
-  return (uint8_t)(-10 * log10(err) + .5); // round it
-}
-
-inline std::string bam_get_read_seq(const bam1_t* b) {
-  uint8_t *p = bam_get_seq(b);
-  std::string readseq = "";
-  char base = 0;
-
-  if (bam_is_rev(b)) {
-    for (int i = b->core.l_qseq - 1; i >= 0; i--) {
-      switch(bam_seqi(p, i)) {
-	//case 0 : base = '='; break;
-      case 1 : base = 'T'; break;
-      case 2 : base = 'G'; break;
-      case 4 : base = 'C'; break;
-      case 8 : base = 'A'; break;
-      case 15 : base = 'N'; break;
-      default : assert(false);
-      }
-      readseq.append(1, base);
-    }
-  }
-  else {
-    for (int i = 0; i < b->core.l_qseq; ++i) {
-      switch(bam_seqi(p, i)) {
-	//case 0 : base = '='; break;
-      case 1 : base = 'A'; break;
-      case 2 : base = 'C'; break;
-      case 4 : base = 'G'; break;
-      case 8 : base = 'T'; break;
-      case 15 : base = 'N'; break;
-      default : assert(false);
-      }
-      readseq.append(1, base);
-    }
-  }
-  
-  return readseq;
-}
-
-inline std::string bam_get_qscore(const bam1_t* b) {
-  uint8_t *p = bam_get_qual(b);
-  std::string qscore = "";
-  
-  if (bam_is_rev(b)) {
-    p = p + b->core.l_qseq - 1;
-    for (int i = 0; i < b->core.l_qseq; ++i) {
-      qscore.append(1, (char)(*p + 33));
-      --p;
-    }
-  }
-  else {
-    for (int i = 0; i < b->core.l_qseq; ++i) {
-      qscore.append(1, (char)(*p + 33));
-      ++p;
-    }
-  }
-  
-  return qscore;
-}
-
-//convert transcript coordinate to chromosome coordinate and generate CIGAR string
-void tr2chr(const Transcript& transcript, int sp, int ep, int& pos, int& n_cigar, std::vector<uint32_t>& data) {
-	int length = transcript.getLength();
-	char strand = transcript.getStrand();
-	const std::vector<Interval>& structure = transcript.getStructure();
-
-	int s, i;
-	int oldlen, curlen;
-
-	uint32_t operation;
-
-	n_cigar = 0;
-	s = structure.size();
-
-	if (strand == '-') {
-		int tmp = sp;
-		sp = length - ep + 1;
-		ep = length - tmp + 1;
-	}
-
-	if (ep < 1 || sp > length) { // a read which align to polyA tails totally!
-	  pos = (sp > length ? structure[s - 1].end : structure[0].start - 1); // 0 based
-
-	  n_cigar = 1;
-	  operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
-	  data.push_back(operation);
-
-	  return;
-	}
-
-	if (sp < 1) {
-		n_cigar++;
-		operation = (1 - sp) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
-		data.push_back(operation);
-		sp = 1;
-	}
-
-	oldlen = curlen = 0;
-
-	for (i = 0; i < s; i++) {
-		oldlen = curlen;
-		curlen += structure[i].end - structure[i].start + 1;
-		if (curlen >= sp) break;
-	}
-	assert(i < s);
-	pos = structure[i].start + (sp - oldlen - 1) - 1; // 0 based
-
-	while (curlen < ep && i < s) {
-		n_cigar++;
-		operation = (curlen - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH;
-		data.push_back(operation);
-		++i;
-		if (i >= s) continue;
-		n_cigar++;
-		operation = (structure[i].start - structure[i - 1].end - 1) << BAM_CIGAR_SHIFT | BAM_CREF_SKIP;
-		data.push_back(operation);
-
-		oldlen = curlen;
-		sp = oldlen + 1;
-		curlen += structure[i].end - structure[i].start + 1;
-	}
-
-	if (i >= s) {
-		n_cigar++;
-		operation = (ep - length) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
-		data.push_back(operation);
-	}
-	else {
-		n_cigar++;
-		operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH;
-		data.push_back(operation);
-	}
-}
-
-#endif /* SAM_RSEM_AUX_H_ */
diff --git a/sampling.h b/sampling.h
deleted file mode 100644
index 7e445cf..0000000
--- a/sampling.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef SAMPLING
-#define SAMPLING
-
-#include<ctime>
-#include<cstdio>
-#include<cassert>
-#include<vector>
-#include<set>
-
-#include "boost/random.hpp"
-
-typedef unsigned int seedType;
-typedef boost::random::mt19937 engine_type;
-typedef boost::random::uniform_01<> uniform_01_dist;
-typedef boost::random::gamma_distribution<> gamma_dist;
-typedef boost::random::variate_generator<engine_type&, uniform_01_dist> uniform_01_generator;
-typedef boost::random::variate_generator<engine_type&, gamma_dist> gamma_generator;
-
-class engineFactory {
-public:
-  static void init() { seedEngine = new engine_type(time(NULL)); }
-  static void init(seedType seed) { seedEngine = new engine_type(seed); }
-
-  static void finish() { if (seedEngine != NULL) delete seedEngine; }
-
-	static engine_type *new_engine() {
-		seedType seed;
-		static std::set<seedType> seedSet;			// empty set of seeds
-		std::set<seedType>::iterator iter;
-
-		do {
-			seed = (*seedEngine)();
-			iter = seedSet.find(seed);
-		} while (iter != seedSet.end());
-		seedSet.insert(seed);
-
-		return new engine_type(seed);
-	}
-
- private:
-	static engine_type *seedEngine;
-};
-
-engine_type* engineFactory::seedEngine = NULL;
-
-// arr should be cumulative!
-// interval : [,)
-// random number should be in [0, arr[len - 1])
-// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1
-int sample(uniform_01_generator& rg, std::vector<double>& arr, int len) {
-  int l, r, mid;
-  double prb = rg() * arr[len - 1];
-
-  l = 0; r = len - 1;
-  while (l <= r) {
-    mid = (l + r) / 2;
-    if (arr[mid] <= prb) l = mid + 1;
-    else r = mid - 1;
-  }
-
-  if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); }
-  assert(l < len);
-
-  return l;
-}
-
-#endif
diff --git a/scanForPairedEndReads.cpp b/scanForPairedEndReads.cpp
index 402e1cd..c494f86 100644
--- a/scanForPairedEndReads.cpp
+++ b/scanForPairedEndReads.cpp
@@ -7,8 +7,7 @@
 #include<algorithm>
 
 #include <stdint.h>
-#include "bam.h"
-#include "sam.h"
+#include "htslib/sam.h"
 #include "sam_utils.h"
 
 #include "utils.h"
@@ -17,7 +16,8 @@
 using namespace std;
 
 int nThreads;
-samfile_t *in, *out;
+samFile *in, *out;
+bam_hdr_t *header;
 bam1_t *b, *b2;
 vector<bam1_t*> arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown;
 
@@ -57,16 +57,19 @@ int main(int argc, char* argv[]) {
 	}
 
         nThreads = atoi(argv[1]);
-	in = samopen(argv[2], "r", NULL);
+	in = sam_open(argv[2], "r");
 	general_assert(in != 0, "Cannot open " + cstrtos(argv[2]) + " !");
-	out = samopen(argv[3], "wb", in->header);
+	header = sam_hdr_read(in);
+	general_assert(header != 0, "Cannot load SAM header!");
+	out = sam_open(argv[3], "wb");
 	general_assert(out != 0, "Cannot open " + cstrtos(argv[3]) + " !");
-        if (nThreads > 1) general_assert(samthreads(out, nThreads, 256) == 0, "Fail to create threads for writing the BAM file!");        
+	sam_hdr_write(out, header);
+	if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
 
 	b = bam_init1(); b2 = bam_init1();
 
 	string qname;
-	bool go_on = (samread(in, b) >= 0);
+	bool go_on = (sam_read1(in, header, b) >= 0);
 	bool isPaired;
 	HIT_INT_TYPE cnt = 0;
 
@@ -78,7 +81,7 @@ int main(int argc, char* argv[]) {
 
 	  if (isPaired) {
 	    add_to_appropriate_arr(b);
-	    while ((go_on = (samread(in, b) >= 0)) && (qname == bam_get_canonical_name(b))) {
+	    while ((go_on = (sam_read1(in, header, b) >= 0)) && (qname == bam_get_canonical_name(b))) {
 	      general_assert_1(bam_is_paired(b), "Read " + qname + " is detected as both single-end and paired-end read!");
 	      add_to_appropriate_arr(b);
 	    }
@@ -88,33 +91,33 @@ int main(int argc, char* argv[]) {
 
 	    if (!arr_both.empty()) {
 	      sort(arr_both.begin(), arr_both.end(), less_than);
-	      for (size_t i = 0; i < arr_both.size(); i++) { samwrite(out, arr_both[i]); bam_destroy1(arr_both[i]); }
+	      for (size_t i = 0; i < arr_both.size(); i++) { sam_write1(out, header, arr_both[i]); bam_destroy1(arr_both[i]); }
 	      arr_both.clear();
 	    }
 	    
 	    while (!arr_partial_1.empty() || !arr_partial_2.empty()) {
 	      if (!arr_partial_1.empty() && !arr_partial_2.empty()) {
-		samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
-		samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
+		sam_write1(out, header, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
+		sam_write1(out, header, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
 	      }
 	      else if (!arr_partial_1.empty()) {
-		samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
-		samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
+		sam_write1(out, header, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
+		sam_write1(out, header, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
 	      }
 	      else {
-		samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
-		samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
+		sam_write1(out, header, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
+		sam_write1(out, header, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
 	      }
 	    }
 	    
 	    while (!arr_partial_unknown.empty()) {
-	      samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
+	      sam_write1(out, header, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
 	    }
 	  }
 	  else {
-	    samwrite(out, b);
-	    while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
-	      samwrite(out, b);
+	    sam_write1(out, header, b);
+	    while ((go_on = (sam_read1(in, header, b) >= 0)) && (qname == bam_get_qname(b))) {
+	      sam_write1(out, header, b);
 	    }
 	  }
 	  
@@ -125,9 +128,10 @@ int main(int argc, char* argv[]) {
 	printf("\nFinished!\n");
 	
 	bam_destroy1(b); bam_destroy1(b2);
+	bam_hdr_destroy(header);
 	
-	samclose(in);
-	samclose(out);
+	sam_close(in);
+	sam_close(out);
 	
 	return 0;
 }
diff --git a/synthesisRef.cpp b/synthesisRef.cpp
index c512049..266cd8b 100644
--- a/synthesisRef.cpp
+++ b/synthesisRef.cpp
@@ -69,14 +69,6 @@ void loadMappingInfo(int file_type, char* mappingF) {
   fin.close();
 }
 
-
-inline char check(char c, string& seqname, int pos) {
-	general_assert(isalpha(c), "Sequence " + seqname + " contains an unknown letter (ASCII code " + itos(c) + ") at 0-based position " + itos(pos) + "!");
-	if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
-	if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
-	return c;
-}
-
 void writeResults(int option, char* refName) {
 	ofstream fout;
 	string cur_gene_id, cur_transcript_id, name;
@@ -137,6 +129,19 @@ void writeResults(int option, char* refName) {
 	}
 }
 
+struct CursorPos {
+  char *filename;
+  int line_no, pos;
+} cursor;
+
+inline char check(char c) {
+  general_assert(isalpha(c), "FASTA file " + cstrtos(cursor.filename) + " contains an unknown character, " + \
+		 ctos(c) + " (ASCII code " + itos(c) + "), at line " + itos(cursor.line_no) + ", position " + itos(cursor.pos + 1) + "!");
+  if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
+  if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
+  return c;
+}
+
 int main(int argc, char* argv[]) {
   if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
 		printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
@@ -155,27 +160,33 @@ int main(int argc, char* argv[]) {
 	ifstream fin;
 	string line, gseq;
 	string seqname, gene_id, transcript_id;
-
+	int seqlen, len;
+	
 	vector<Interval> vec;
 
 	M = 0;
 	name2seq.clear();
 	for (int i = start; i < argc; i++) {
 		fin.open(argv[i]);
-		general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); 
+		general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
+
+		cursor.filename = argv[i]; cursor.line_no = cursor.pos = 0;
+		
 		getline(fin, line);
 		while ((fin) && (line[0] == '>')) {
 			istringstream strin(line.substr(1));
 			strin>>seqname;
-
-			gseq = "";
+			++cursor.line_no;
+			
+			gseq = ""; seqlen = 0;
 			while((getline(fin, line)) && (line[0] != '>')) {
-			    gseq += line;
+			  ++cursor.line_no;
+			  len = line.length();
+			  for (cursor.pos = 0; cursor.pos < len; ++cursor.pos) line[cursor.pos] = check(line[cursor.pos]);
+			  seqlen += len;
+			  gseq += line;
 			}
-
-			int len = gseq.length();
-			assert(len > 0);
-			for (int j = 0; j < len; j++) gseq[j] = check(gseq[j], seqname, j);
+			assert(seqlen > 0);
 			name2seq[seqname] = gseq;
 
 			transcript_id = seqname;
@@ -193,7 +204,7 @@ int main(int argc, char* argv[]) {
 			}
 			
 			vec.clear();
-			vec.push_back(Interval(1, len));
+			vec.push_back(Interval(1, seqlen));
 			transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
 			++M;
 
diff --git a/wiggle.cpp b/wiggle.cpp
index 2f841c4..032a869 100644
--- a/wiggle.cpp
+++ b/wiggle.cpp
@@ -4,8 +4,7 @@
 #include <iostream>
 
 #include <stdint.h>
-#include "bam.h"
-#include "sam.h"
+#include "htslib/sam.h"
 #include "sam_utils.h"
 
 #include "utils.h"
@@ -40,10 +39,11 @@ void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) {
 void build_wiggles(const std::string& bam_filename,
                    WiggleProcessor& processor) {
   
-    samfile_t *bam_in = samopen(bam_filename.c_str(), "r", NULL);
+    samFile *bam_in = sam_open(bam_filename.c_str(), "r");
     general_assert(bam_in != NULL, "Cannot open " + bam_filename + "!");
 
-    bam_hdr_t *header = bam_in->header;
+    bam_hdr_t *header = sam_hdr_read(bam_in);
+    general_assert(header != 0, "Cannot load SAM header!");
     bool *used = new bool[header->n_targets];
     memset(used, 0, sizeof(bool) * header->n_targets);
 
@@ -51,7 +51,7 @@ void build_wiggles(const std::string& bam_filename,
     HIT_INT_TYPE cnt = 0;
     bam1_t *b = bam_init1();
     Wiggle wiggle;
-    while (samread(bam_in, b) >= 0) {
+    while (sam_read1(bam_in, header, b) >= 0) {
       if (bam_is_unmapped(b)) continue;
       
       if (b->core.tid != cur_tid) {
@@ -76,7 +76,8 @@ void build_wiggles(const std::string& bam_filename,
       }
 
     bam_destroy1(b);
-    samclose(bam_in);
+    bam_hdr_destroy(header);
+    sam_close(bam_in);
 
     delete[] used;
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rsem.git



More information about the debian-med-commit mailing list