[med-svn] [Git][med-team/subread][upstream] New upstream version 2.0.1+dfsg

Steffen Möller gitlab at salsa.debian.org
Fri May 22 18:59:40 BST 2020



Steffen Möller pushed to branch upstream at Debian Med / subread


Commits:
581d5ef2 by Steffen Moeller at 2020-05-22T19:30:00+02:00
New upstream version 2.0.1+dfsg
- - - - -


22 changed files:

- doc/SubreadUsersGuide.tex
- src/HelperFunctions.c
- src/core-interface-aligner.c
- src/core-junction.c
- src/core.c
- src/core.h
- src/detection-calls.c
- src/flattenAnnotations.c
- src/gene-algorithms.c
- src/hashtable.c
- src/hashtable.h
- src/index-builder.c
- src/input-files.c
- src/input-files.h
- src/makefile.version
- src/readSummary.c
- src/sambam-file.c
- src/sorted-hashtable.c
- src/subread.h
- src/tx-unique.c
- − test/subread-align/data/test-err-mut-r1.fq.gz
- − test/subread-align/data/test-err-mut-r2.fq.gz


Changes:

=====================================
doc/SubreadUsersGuide.tex
=====================================
@@ -36,19 +36,19 @@
 \begin{center}
 {\Huge\bf Rsubread/Subread Users Guide}\\
 \vspace{1 cm}
-{\centering\large Rsubread v2.0.0/Subread v2.0.0\\}
+{\centering\large Subread v2.0.1/Rsubread v2.2.0\\}
 \vspace{1 cm}
-\centering 2 September 2019\\
+\centering 13 May 2020\\
 \vspace{5 cm}
 \Large Wei Shi and Yang Liao\\
 \vspace{1 cm}
 \small
-{\large Bioinformatics Division\\
-The Walter and Eliza Hall Institute of Medical Research\\
-The University of Melbourne\\
-Melbourne, Australia\\}
+{\large Olivia Newton-John Cancer Research Institute\\
+Walter and Eliza Hall Institute of Medical Research\\
+University of Melbourne\\
+Australia\\}
 \vspace{7 cm}
-\centering Copyright \small{\copyright}  2011 - 2019\\
+\centering Copyright \small{\copyright}  2011 - 2020\\
 \end{center}
 
 \end{titlepage}
@@ -1027,7 +1027,7 @@ read mapping that produced the provided SAM/BAM files. This optional argument ca
 \hline
 -s $<int or string>$ \newline (\code{isStrandSpecific}) & Indicate if strand-specific read counting should be performed. A single integer value (applied to all input files) or a string of comma-separated values (applied to each corresponding input file) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). For paired-end reads, strand of the first read is taken as the strand of the whole fragment. FLAG field is used to tell if a read is first or second read in a pair. Value of \code{isStrandSpecific} parameter in {\Rsubread} {\featureCounts} is a vector which has a length of either {\code{1}}, or the same with the total number of input files provided. \\
 \hline
--t $<string>$ \newline (\code{GTF.featureType}) & Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default.\\
+-t $<string>$ \newline (\code{GTF.featureType}) & Specify the feature type(s). If more than one feature type is provided, they should be separated by `,' (no space). Only rows which have a matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default.\\
 \hline
 -T $<int>$ \newline (\code{nthreads}) & Number of the threads. The value should be between 1 and 32. 1 by default.\\
 \hline
@@ -1190,6 +1190,17 @@ featureCounts(files="mapping_results_PE.bam",isPairedEnd=TRUE,requireBothEndsMap
 featureCounts(files="mapping_results_PE.bam",isPairedEnd=TRUE,countChimericFragments=FALSE)
 \end{Rcode}
 
+\chapter{Mapping and quantification of single-cell data}
+
+\section{cellCounts}
+
+The \emph{cellCounts} program was developed to process single-cell RNA-seq (scRNA-seq) data to generate UMI counts for genes, exons or other genomic features.
+It calls \emph{align} function to perform mapping of scRNA-seq reads and calls \emph{featureCounts} functions to generate UMI counts.
+\emph{cellCounts} demultiplexes cell barcodes and also deduplicates reads.
+It accepts BCL or FASTQ files as input.
+Quantification of a scRNA-seq dataset can be simply done by a single \emph{cellCounts} function call.
+It is much faster than other scRNA-seq quantification tools.
+
 \chapter{SNP calling}
 
 \section{Algorithm}


=====================================
src/HelperFunctions.c
=====================================
@@ -1235,10 +1235,12 @@ HashTable * load_alias_table(char * fname) {
 		if((!sam_chr)||(!anno_chr)) continue;
 
 		sam_chr[strlen(sam_chr)-1]=0;
+		if(sam_chr[strlen(sam_chr)-1]=='\r') sam_chr[strlen(sam_chr)-1]=0;
 		char * anno_chr_buf = malloc(strlen(anno_chr)+1);
 		strcpy(anno_chr_buf, anno_chr);
 		char * sam_chr_buf = malloc(strlen(sam_chr)+1);
 		strcpy(sam_chr_buf, sam_chr);
+//		SUBREADprintf("ALIAS_LOAD : '%s' => '%s'\n", sam_chr_buf, anno_chr_buf);
 		HashTablePut(ret, sam_chr_buf, anno_chr_buf);
 	}
 


=====================================
src/core-interface-aligner.c
=====================================
@@ -68,6 +68,7 @@ static struct option long_options[] =
 	{"multiMapping", no_argument, 0, 0},
 	{"keepReadOrder", no_argument, 0, 0},
 	{"sortReadsByCoordinates", no_argument, 0, 0},
+	{"noTLENpreference",  no_argument, 0, 0},
 	{0, 0, 0, 0}
 };
 
@@ -583,6 +584,10 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
 					global_context -> config.use_memory_buffer = 1;
 					global_context -> config.reads_per_chunk = 600llu*1024*1024;
 				}
+				else if(strcmp("noTLENpreference", long_options[option_index].name) == 0)
+				{
+					global_context -> config.no_TLEN_preference = 1;
+				}
 				else if(strcmp("gtfFeature", long_options[option_index].name) == 0)
 				{
 					strncpy(global_context->config.exon_annotation_feature_name_column, optarg, MAX_READ_NAME_LEN - 1);


=====================================
src/core-junction.c
=====================================
@@ -1106,7 +1106,7 @@ void copy_vote_to_alignment_res(global_context_t * global_context, thread_contex
 					// if the covered region has a wrong arrangement to their relative positions.
 					int test_minor_res = test_junction_minor(global_context, thread_context, current_vote, vote_i, vote_j, i, j, dist);
 //#warning " ============ DEBUG 1 ==================== "
-					if(0 &&  FIXLENstrcmp("R002403247", read_name) == 0) {
+					if(0 && FIXLENstrcmp("R002403247", read_name) == 0) {
 						char posout2[100];
 						char posout1[100];
 						absoffset_to_posstr(global_context, current_vote -> pos[vote_i][vote_j], posout1);
@@ -2402,7 +2402,6 @@ int process_voting_junction_PE_topK(global_context_t * global_context, thread_co
 				simple_mapping_t * current_loc = is_second_read?comb_buffer[i].r2_loc:comb_buffer[i].r1_loc;
 				assert(current_loc);
 				unsigned int current_pos = current_loc->mapping_position;
-				//SUBREADprintf("CLLL BUF %d R_%d : %u\n", i, 1+is_second_read,  current_loc->mapping_position);
 
 				int is_exist = 0;
 				for(j = 0; j < *current_r_cursor; j++)
@@ -2412,6 +2411,7 @@ int process_voting_junction_PE_topK(global_context_t * global_context, thread_co
 						break;
 					}
 				}
+				//SUBREADprintf("CLLL BUF %d R_%d : %u ; EXIST %d. Written into the %d-th best location\n", i, 1+is_second_read,  current_loc->mapping_position, is_exist, *current_r_cursor);
 
 				if(!is_exist){
 					if(current_loc -> is_vote_t_item)
@@ -2984,9 +2984,10 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
 					else
 						tail_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text + read_cursor, current_perfect_section_abs, tmp_int, 1, adj_coverage_end);
 
-					//SUBREADprintf("SSTAIL:%d\n", tail_soft_clipped);
+					if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0) 
+						SUBREADprintf("SSTAIL:%d\n", tail_soft_clipped);
 
-					if(tail_soft_clipped == tmp_int){
+					if(1 && tail_soft_clipped == tmp_int){
 						tail_soft_clipped = 0;
 						if(full_section_clipped)(*full_section_clipped) = 1;
 					} else has_clipping_this_section_tail = 1;
@@ -3125,6 +3126,8 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
 			}
 		}
 
+		if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0) 
+			SUBREADprintf("NEW_CIGAR_2 : %s\n", new_cigar_tmp);
 		strcpy(cigar_string, new_cigar_tmp);
 	}
 
@@ -3347,11 +3350,11 @@ unsigned int finalise_explain_CIGAR(global_context_t * global_context, thread_co
 
 
 			//#warning " ========== COMMENT THIS LINE !! ========="
-			if(0 && FIXLENstrcmp("R00000110641", explain_context -> read_name) ==0){
+			if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", explain_context -> read_name) ==0){
 				char outpos1[100];
 				absoffset_to_posstr(global_context, final_position, outpos1);
 				SUBREADprintf("FINALQUAL %s : FINAL_POS=%s ( %u )\tCIGAR=%s\tMM=%d / MAPLEN=%d > %d?\tVOTE=%d > %0.2f x %d ?  MASK=%d\tQUAL=%d\tBRNO=%d\nKNOWN_JUNCS=%d PENALTY=%d\n\n", explain_context -> read_name, outpos1 , final_position , tmp_cigar, mismatch_bases, non_clipped_length, applied_mismatch,  result -> selected_votes, global_context -> config.minimum_exonic_subread_fraction,result-> used_subreads_in_vote, result->result_flags, final_qual, explain_context -> best_read_id, known_junction_supp, explain_context -> best_indel_penalty);
-				exit(0);
+				//exit(0);
 			}
 
 


=====================================
src/core.c
=====================================
@@ -2676,7 +2676,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
 				if(current_result->selected_votes < global_context->config.minimum_subread_for_second_read || max_votes < global_context->config.minimum_subread_for_first_read)
 				{
 				//	if(current_read_number == 111 || current_read_number == 112)
-				//	SUBREADprintf("RESET0 [%d] R_%d SEL=%d MAX=%d\n", current_read_number, is_second_read + 1, current_result->selected_votes, max_votes);
+					//SUBREADprintf("RESET0 [%lld] R_%d SEL=%d MAX=%d\n", current_read_number, is_second_read + 1, current_result->selected_votes, max_votes);
 					current_result -> selected_votes = 0;
 					continue;
 				}
@@ -2688,7 +2688,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
 				}
 
 				//#warning ">>>>>>> COMMENT THIS <<<<<<<"
-				//printf("OCT27-STEP21-%s:%d-ALN%02d-THRE %d\n", current_read_name, is_second_read + 1, best_read_id, thread_context -> thread_id);
+				//printf("OCT27-STEP21-%s:%d-ALN%02d-THRE %d in MM=%d\n", current_read_name, is_second_read + 1, best_read_id, thread_context -> thread_id, global_context -> config.multi_best_reads);
 
 				int is_negative_strand = (current_result  -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
 
@@ -2844,7 +2844,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
 					test_PE_and_same_chro_align(global_context , realignment_result_R1 , realignment_result_R2, &is_exonic_regions, &is_PE, &is_same_chro , read_len_1, read_len_2, read_name_1, &tlen);
 
 					unsigned long long TLEN_exp_score = 0;
-					if(is_PE && global_context -> config.reported_multi_best_reads<2){
+					if(is_PE && global_context -> config.no_TLEN_preference == 0 && global_context -> config.reported_multi_best_reads<2){
 						TLEN_exp_score = (tlen > expected_TLEN)?(tlen - expected_TLEN):(expected_TLEN - tlen);
 						if(TLEN_exp_score>999) TLEN_exp_score=0;
 						else TLEN_exp_score = 999-TLEN_exp_score;
@@ -3017,6 +3017,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
 	}
 
 	free(final_realignments);
+	free(final_realignment_number);
 	free(final_MATCH_buffer1);
 	free(final_MISMATCH_buffer1);
 	free(final_PENALTY_buffer1);
@@ -3248,7 +3249,7 @@ int do_voting(global_context_t * global_context, thread_context_t * thread_conte
 					SUBREADprintf(" ======= PAIR %s = %llu ; NON_INFORMATIVE = %d, %d =======\n", read_name_1, current_read_number, vote_1 -> noninformative_subreads, vote_2 -> noninformative_subreads);
 					print_votes(vote_1, global_context -> config.index_prefix);
 					print_votes(vote_2, global_context -> config.index_prefix);
-					if(is_reversed==1)exit(0);
+					//if(is_reversed==1)exit(0);
 				}
 
 				if(global_context -> input_reads.is_paired_end_reads)
@@ -3615,6 +3616,7 @@ int read_chunk_circles(global_context_t *global_context)
 				print_in_box(80,0,0, "Load the %d-th index block...",1+ global_context->current_index_block_number);
 
 				if(gehash_load(global_context -> current_index, tmp_fname)) return -1;
+				print_in_box(80,0,0, "The index block has been loaded.");
 				sprintf(tmp_fname, "%s.%02d.%c.array", global_context->config.index_prefix, global_context->current_index_block_number, global_context->config.space_type == GENE_SPACE_COLOR?'c':'b');
 			}
 			period_load_index = miltime() - time_load_index;
@@ -3628,6 +3630,7 @@ int read_chunk_circles(global_context_t *global_context)
 				global_context -> is_final_voting_run = 1;
 			else	global_context -> is_final_voting_run = 0;
 
+			print_in_box(80,0,0, "Start read mapping in chunk.");
 			double time_start_voting = miltime();
 			ret = run_maybe_threads(global_context, STEP_VOTING);
 			double period_voting = miltime() - time_start_voting;
@@ -3980,12 +3983,46 @@ int do_anno_bitmap_add_feature(char * gene_name, char * tracnscript_id, char * c
 	return 0;
 }
 
-void warning_anno_vs_index(HashTable * anno_chros_tab, gene_offset_t * index_chros_offset){
-	HashTable *  index_chros_tab = index_chros_offset -> read_name_to_index;
-	warning_hash_hash(anno_chros_tab, index_chros_tab, "Chromosomes/contigs in annotation but not in index :");
-	warning_hash_hash(index_chros_tab, anno_chros_tab, "Chromosomes/contigs in index but not in annotation :");
+void convert_table_key_to_alias(void * ky ,void * va, HashTable * tab){
+	HashTable * alias_index_to_annot = tab -> appendix2;
+	HashTable * result_annot_chros = tab -> appendix1;
+
+	char * index_name = ky;
+	char * annot_name = HashTableGet(alias_index_to_annot, index_name);
+	if(annot_name==NULL) annot_name = index_name;
+	HashTablePut( result_annot_chros, annot_name, va ); 
 }
 
+void warning_anno_vs_index(HashTable * anno_chros_tab, gene_offset_t * index_chros_offset, HashTable * alias_index_to_annot){
+
+	HashTable * index_chros_tab = index_chros_offset -> read_name_to_index;
+	HashTable * chros_in_annot_using_index_names = anno_chros_tab ;
+	HashTable * chros_in_index_using_annot_names = index_chros_tab ;
+	HashTable * alias_annot_to_index = NULL;
+
+	if(alias_index_to_annot){
+		chros_in_annot_using_index_names = StringTableCreate(1000);
+		chros_in_index_using_annot_names = StringTableCreate(1000);
+	
+		alias_annot_to_index = StringTableReverse(alias_index_to_annot);
+		anno_chros_tab -> appendix1 = chros_in_annot_using_index_names;
+		anno_chros_tab -> appendix2 = alias_annot_to_index;
+		HashTableIteration(anno_chros_tab, convert_table_key_to_alias);
+	
+		index_chros_tab -> appendix1 = chros_in_index_using_annot_names;
+		index_chros_tab -> appendix2 = alias_index_to_annot;
+		HashTableIteration(index_chros_tab, convert_table_key_to_alias);
+	}
+
+	warning_hash_hash(anno_chros_tab, chros_in_index_using_annot_names, "Chromosomes/contigs in annotation but not in index :");
+	warning_hash_hash(index_chros_tab, chros_in_annot_using_index_names, "Chromosomes/contigs in index but not in annotation :");
+
+	if(alias_index_to_annot){
+		HashTableDestroy(alias_annot_to_index);
+		HashTableDestroy(chros_in_annot_using_index_names);
+		HashTableDestroy(chros_in_index_using_annot_names);
+	}
+}
 
 int load_annotated_exon_regions(global_context_t * global_context){
 	int bitmap_size = (4096 / EXONIC_REGION_RESOLUTION / 8)*1024*1024;
@@ -3999,8 +4036,6 @@ int load_annotated_exon_regions(global_context_t * global_context){
 	int loaded_features = load_features_annotation(global_context->config.exon_annotation_file, global_context->config.exon_annotation_file_type, global_context->config.exon_annotation_gene_id_column, NULL, global_context->config.exon_annotation_feature_name_column, global_context, do_anno_bitmap_add_feature);
 	if(loaded_features < 0)return -1;
 	else print_in_box(80,0,0,"%d annotation records were loaded.\n", loaded_features);
-	warning_anno_vs_index(global_context -> annotation_chro_table, &global_context -> chromosome_table);
-	HashTableDestroy(global_context -> annotation_chro_table);
 	return 0;
 }
 
@@ -4021,9 +4056,10 @@ int load_global_context(global_context_t * context)
 		context -> config.multi_best_reads = max(context -> config.multi_best_reads , context -> config.reported_multi_best_reads);
 		if(context->config.multi_best_reads>1) context -> config.reads_per_chunk /= context->config.multi_best_reads;
 	}
+	print_in_box(80,0,0,"Check the input reads.");
 	subread_init_lock(&context->input_reads.input_lock);
 	if(core_geinput_open(context, &context->input_reads.first_read_file, 1,1)) {
-		//sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable to open '%s' as input. Please check if it exists, you have the permission to read it, and it is in the correct format.\n", context->config.first_read_file);
+	//	sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable to open '%s' as input. Please check if it exists, you have the permission to read it, and it is in the correct format.\n", context->config.first_read_file);
 		return -1;
 	}
 
@@ -4061,6 +4097,7 @@ int load_global_context(global_context_t * context)
 		context -> config.max_vote_number_cutoff = 2;
 	}
 
+	print_in_box(80,0,0,"Initialise the memory objects.");
 	context -> config.max_vote_simples = max(context -> config.max_vote_simples ,  context -> config.reported_multi_best_reads);
 	context -> config.max_vote_combinations = max(context -> config.max_vote_combinations ,  context -> config.reported_multi_best_reads);
 
@@ -4075,6 +4112,7 @@ int load_global_context(global_context_t * context)
 	stat(context->config.first_read_file , &ginp1_stat);
 	context->input_reads.first_read_file_size = ginp1_stat.st_size;
 
+	print_in_box(80,0,0,"Estimate the mean read length.");
 	context -> input_reads.avg_read_length = guess_reads_density_format(context->config.first_read_file , context->config.is_SAM_file_input?1:0, &min_phred_score, &max_phred_score , &guess_tested_reads);
 	if(context -> input_reads.avg_read_length<0 )context -> input_reads.avg_read_length = 250;
 //	SUBREADprintf("QR=[%d,%d]; ALEN=%f\n",  min_phred_score, max_phred_score, context -> input_reads.avg_read_length);
@@ -4110,6 +4148,7 @@ int load_global_context(global_context_t * context)
 		// Only the sam file is opened here; other files like bed, indel and etc are opened in init_modules()
 		sprintf(tmp_fname,"%s", context->config.output_prefix);
 
+		print_in_box(80,0,0,"Create the output %s file.", context -> config.is_BAM_output?"BAM":"SAM");
 		if(context -> config.is_BAM_output)
 		{
 			context -> output_bam_writer = malloc(sizeof(SamBam_Writer));
@@ -4140,6 +4179,7 @@ int load_global_context(global_context_t * context)
 	}
 	
 	// ====== check index files, count blocks and load chro table ======
+	print_in_box(80,0,0,"Check the index.");
 	sprintf(tmp_fname, "%s.reads", context->config.index_prefix);
 	if(!does_file_exist(tmp_fname))
 	{
@@ -4163,7 +4203,6 @@ int load_global_context(global_context_t * context)
 		return -1;
 	}
 
-
 	context->index_block_number = 0; 
 	while(1)
 	{
@@ -4181,6 +4220,7 @@ int load_global_context(global_context_t * context)
 	if(context->config.report_sam_file)
 		write_sam_headers(context);
 
+	print_in_box(80,0,0,"Init the voting space.");
 	// ====== init other variables ======
 	if(context -> config.all_threads>1)
 		subread_init_lock(&context -> thread_initial_lock);
@@ -4197,10 +4237,14 @@ int load_global_context(global_context_t * context)
 
 	context -> sam_chro_to_anno_chr_alias = NULL;
 	if(context->config.exon_annotation_file[0]){ 
+		print_in_box(80,0,0,"Load the annotation file.");
 		if( load_annotated_exon_regions( context ) ) return -1;
 		if(context->config.exon_annotation_alias_file[0])
 			 context -> sam_chro_to_anno_chr_alias = load_alias_table(context->config.exon_annotation_alias_file);
+		warning_anno_vs_index(context -> annotation_chro_table, &context -> chromosome_table, context -> sam_chro_to_anno_chr_alias);
+		HashTableDestroy(context -> annotation_chro_table);
 	} else context -> exonic_region_bitmap = NULL;
+	print_in_box(80,0,0,"Global environment is initialised.");
 	return 0;
 }
 


=====================================
src/core.h
=====================================
@@ -204,6 +204,7 @@ typedef struct{
 	int use_quality_score_break_ties;
 	int big_margin_record_size;
 	int PE_predominant_weight;
+	int no_TLEN_preference;
 
 	// subjunc
 	int entry_program_name;


=====================================
src/detection-calls.c
=====================================
@@ -12,8 +12,8 @@
 #include "gene-algorithms.h"
 
 typedef struct{
-	char gene_name[FEATURE_NAME_LENGTH];
-	char chro_name[MAX_CHROMOSOME_NAME_LEN];
+	char gene_name[FEATURE_NAME_LENGTH+2];
+	char chro_name[MAX_CHROMOSOME_NAME_LEN+2];
 	unsigned int start, end;
 	char is_negative_strand;
 } DTCexon_t;
@@ -59,13 +59,13 @@ int DTCdo_add_feature(char * gene_name, char * transcript_name, char * chro_name
 	return 0;
 }
 
-int DTCcompare_merge_genes(void * L_elem, void * R_elem){
+int DTCcompare_merge_genes(void * L_elem, void * R_elem, ArrayList *me){
 	unsigned int * le = L_elem, * re = R_elem;
 	if(le[1] > re[1]) return 1;
 	if(le[1] < re[1]) return -1;
 	return 0;
 }
-int DTCcompare_exons(void * L_elem, void * R_elem){
+int DTCcompare_exons(void * L_elem, void * R_elem, ArrayList *me){
 	DTCexon_t * le = L_elem, * re = R_elem;
 	int chname_cmp = strcmp(le -> chro_name, re -> chro_name);
 	if(chname_cmp) return chname_cmp;


=====================================
src/flattenAnnotations.c
=====================================
@@ -145,7 +145,7 @@ int flatAnno_do_anno_1R(char * gene_name, char * transcript_name, char * chro_na
 	return 0;
 }
 
-int flatAnno_do_anno_merge_one_array_compare(void * vL, void * vR){
+int flatAnno_do_anno_merge_one_array_compare(void * vL, void * vR, ArrayList *me){
 	int * iL = vL, *iR = vR;
 	if((*iL)>(*iR))return 1;
 	if((*iL)<(*iR))return -1;
@@ -246,12 +246,16 @@ void flatAnno_do_anno_merge_one_array(void * key, void * hashed_obj, HashTable *
 	this_list -> numOfElements = n1_items+1;
 }
 
+int flatme_strcmp(void * L, void * R, ArrayList * me){
+	return strcmp(L, R);
+}
+
 int flatAnno_do_anno_merge_and_write(flatAnno_context_t * context){
 	context -> gene_chro_strand_to_exons_table -> appendix1 = context;
 
 	HashTableIteration(context -> gene_chro_strand_to_exons_table, context -> merging_mode == MERGING_MODE_CHOPPING? flatAnno_do_anno_chop_one_array :flatAnno_do_anno_merge_one_array);
 	ArrayList * all_chro_st_list = HashTableKeyArray(context -> gene_chro_strand_to_exons_table);
-	ArrayListSort(all_chro_st_list, (int(*)(void * , void*))strcmp);
+	ArrayListSort(all_chro_st_list, flatme_strcmp);
 
 	fprintf( context -> output_FP , "GeneID\tChr\tStart\tEnd\tStrand\n");
 	int i,j;


=====================================
src/gene-algorithms.c
=====================================
@@ -2551,6 +2551,7 @@ int get_base_error_prob64i(char v)
 	return PROB_QUAL_INT_TABLE[v-'@'];
 }
 
+#ifdef SKIP_THIS_PART
 void bad_reverse_cigar(char * cigar)
 {
 	int cigar_cursor = 0;
@@ -2600,6 +2601,7 @@ int debug_main()
 	SUBREADprintf("%s\n",cg);
 	return 0;
 }
+#endif
 
 void remove_indel_neighbours(HashTable * indel_table)
 {


=====================================
src/hashtable.c
=====================================
@@ -6,7 +6,7 @@
  * Released to the public domain.
  *
  *--------------------------------------------------------------------------
- * $Id: hashtable.c,v 9999.39 2019/08/28 22:31:14 cvs Exp $
+ * $Id: hashtable.c,v 9999.42 2019/12/05 03:05:37 cvs Exp $
 \*--------------------------------------------------------------------------*/
 
 #include <stdio.h>
@@ -24,7 +24,7 @@ static int isProbablePrime(srInt_64 number);
 static srInt_64 calculateIdealNumOfBuckets(HashTable *hashTable);
 
 srInt_64 long_random_val(){
-	long ret = 0;
+	srInt_64 ret = 0;
 	if(RAND_MAX<255){
 		SUBREADprintf("Is this a embedded computer????\n");
 		return -1;
@@ -37,6 +37,13 @@ srInt_64 long_random_val(){
 	return ret;
 }
 
+HashTable * ArrayListToLookupTable_Int(ArrayList * arr){
+	srInt_64 x1;
+	HashTable * ret = HashTableCreate( max(1,arr -> numOfElements / 6) );
+	for(x1 = 0; x1 < arr->numOfElements; x1++) HashTablePut(ret, ArrayListGet(arr, x1)+1, NULL+1);
+	return ret;
+}
+
 void * ArrayListShift(ArrayList * list){
 	if(list->numOfElements<1) return NULL;
 	void *ret = list->elementList [0];
@@ -116,7 +123,7 @@ void ArrayListSetDeallocationFunction(ArrayList * list,  void (*elem_deallocator
 	list -> elemDeallocator = elem_deallocator;
 }
 
-int ArrayListSort_comp_pntr( void * L_elem, void * R_elem ){
+int ArrayListSort_comp_pntr( void * L_elem, void * R_elem, ArrayList * me ){
 	if( L_elem > R_elem )return 1;
 	if( L_elem < R_elem )return -1;
 	return 0;
@@ -125,11 +132,11 @@ int ArrayListSort_comp_pntr( void * L_elem, void * R_elem ){
 int ArrayListSort_compare(void * sortdata0, int L, int R){
 	void ** sortdata = sortdata0;
 	ArrayList * list = sortdata[0];
-	int (*comp_elems)(void * L_elem, void * R_elem) = sortdata[1];
+	int (*comp_elems)(void * L_elem, void * R_elem, ArrayList * me) = sortdata[1];
 
 	void * L_elem = list -> elementList[L];
 	void * R_elem = list -> elementList[R];
-	return comp_elems(L_elem, R_elem);
+	return comp_elems(L_elem, R_elem, list);
 }
 
 void ArrayListSort_exchange(void * sortdata0, int L, int R){
@@ -144,7 +151,7 @@ void ArrayListSort_exchange(void * sortdata0, int L, int R){
 void ArrayListSort_merge(void * sortdata0, int start, int items, int items2){
 	void ** sortdata = sortdata0;
 	ArrayList * list = sortdata[0];
-	int (*comp_elems)(void * L_elem, void * R_elem) = sortdata[1];
+	int (*comp_elems)(void * L_elem, void * R_elem, ArrayList * me) = sortdata[1];
 
 	void ** merged = malloc(sizeof(void *)*(items + items2));
 	int write_cursor, read1=start, read2=start+items;
@@ -153,7 +160,7 @@ void ArrayListSort_merge(void * sortdata0, int start, int items, int items2){
 		void * Elm1 = (read1 == start + items)? NULL:list -> elementList[read1];
 		void * Elm2 = (read2 == start + items + items2)?NULL:list -> elementList[read2];
 
-		int select_1 = (read1 == start + items)?0:( read2 == start + items + items2 || comp_elems(Elm1, Elm2) < 0 );
+		int select_1 = (read1 == start + items)?0:( read2 == start + items + items2 || comp_elems(Elm1, Elm2, list) < 0 );
 		if(select_1) merged[write_cursor] = list -> elementList[read1++];
 		else merged[write_cursor] = list -> elementList[read2++];
 		if(read2 > start + items + items2){
@@ -167,7 +174,7 @@ void ArrayListSort_merge(void * sortdata0, int start, int items, int items2){
 	free(merged);
 }
 
-void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem)){
+void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem, ArrayList * me)){
 	void * sortdata[2];	
 	sortdata[0] = list;
 	sortdata[1] = compare_L_minus_R?compare_L_minus_R:ArrayListSort_comp_pntr;
@@ -239,6 +246,21 @@ HashTable * StringTableCreate(srInt_64 numOfBuckets){
 	return ret;
 }
 
+void StringTableReverse_Run(void * ky, void * va, HashTable * tab){
+	HashTable * res = tab -> appendix1;
+	HashTablePut(res, va, ky);
+}
+
+HashTable * StringTableReverse(HashTable * ori){
+	HashTable * res = StringTableCreate(ori -> numOfBuckets);
+	void * oriap1 = ori -> appendix1;
+	ori -> appendix1 = res;
+
+	HashTableIteration(ori, StringTableReverse_Run);
+	ori -> appendix1 = oriap1;
+	return res;
+}
+
 ArrayList * HashTableKeys(HashTable * tab){
 	int i;
 	ArrayList * ret = ArrayListCreate(tab -> numOfElements);
@@ -990,3 +1012,34 @@ void HashTableIteration(HashTable * tab, void process_item(void * key, void * ha
 		}
 	}
 }
+
+void HashTableSortedIndexes_copy_idx( void *k, void *v, HashTable * k2int_tab ){
+	ArrayList * dsta = k2int_tab -> appendix1;
+	ArrayListPush(dsta, k);
+}
+
+int HashTableSortedIndexes_cmp_idx( void * Lv, void * Rv, ArrayList * me ){
+	void ** appdx = me -> appendix1;
+	HashTable * k2int_tab = appdx[0];
+	void * Lhash = HashTableGet(k2int_tab, Lv);
+	void * Rhash = HashTableGet(k2int_tab, Rv);
+	void * large_first = appdx[1];
+	if(large_first){
+		if(Lhash > Rhash) return -1;
+		if(Lhash < Rhash) return 1;
+	}else{
+		if(Lhash > Rhash) return 1;
+		if(Lhash < Rhash) return -1;
+	}
+	return 0;
+}
+
+ArrayList * HashTableSortedIndexes(HashTable * k2int_tab, int larger_value_first){
+	ArrayList *ret = HashTableKeys(k2int_tab);
+	void * appx[2];
+	ret -> appendix1 = appx;
+	appx[0]=k2int_tab;
+	appx[1]=NULL+larger_value_first;
+	ArrayListSort(ret, HashTableSortedIndexes_cmp_idx);
+	return ret;
+}


=====================================
src/hashtable.h
=====================================
@@ -6,7 +6,7 @@
  * Released to the public domain.
  *
  *--------------------------------------------------------------------------
- * $Id: hashtable.h,v 9999.27 2019/08/25 10:12:06 cvs Exp $
+ * $Id: hashtable.h,v 9999.30 2019/12/05 03:05:37 cvs Exp $
 \*--------------------------------------------------------------------------*/
 
 #ifndef _HASHTABLE_H
@@ -71,7 +71,7 @@ int ArrayListPush_NoRepeatedPtr(ArrayList * list, void * new_elem);
 void * ArrayListShift(ArrayList * list);
 void * ArrayListPop(ArrayList * list);
 void ArrayListSetDeallocationFunction(ArrayList * list,  void (*elem_deallocator)(void *elem));
-void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem));
+void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem, ArrayList * me));
 
 // A simple comparison function if you want to sort unsigned long long ints.
 int ArrayListLLUComparison(void * L_elem, void * R_elem);
@@ -115,6 +115,7 @@ ArrayList * HashTableKeyArray(HashTable * tab);
 \*--------------------------------------------------------------------------*/
 
 HashTable * StringTableCreate(srInt_64 numOfBuckets);
+HashTable * StringTableReverse(HashTable * ori);
 HashTable *HashTableCreate(srInt_64 numOfBuckets);
 
 /*--------------------------------------------------------------------------*\
@@ -494,6 +495,11 @@ srUInt_64 HashTableStringHashFunction(const void *key);
 void free_values_destroy(HashTable * tab);
 int HashTablePutReplaceEx(HashTable *hashTable, const void *key, void *value, int replace_key, int dealloc_key, int dealloc_value);
 
+ArrayList * HashTableSortedIndexes(HashTable * k2int_tab, int larger_value_first);
+
+// NOTE: values are all added by 1. 
+HashTable * ArrayListToLookupTable_Int(ArrayList * arr);
+
 #endif /* _HASHTABLE_H */
 
 


=====================================
src/index-builder.c
=====================================
@@ -441,31 +441,25 @@ int build_gene_index(const char index_prefix [], char ** chro_files, int chro_fi
 
 int add_repeated_subread(gehash_t * tab , unsigned int subr, unsigned char ** huge_index)
 {
-	int huge_byte = (subr>>2) &0x3fffffff;
-	int huge_offset = (subr % 4) * 2;
-	int ind_id = (huge_byte >> 20) & 1023;
+	int huge_byte = (subr>>1);
+	int huge_offset = (subr % 2) * 4;
+	int ind_id = (huge_byte >> 24) & 127;
 
-	if(NULL == huge_index[ind_id])  huge_index[ind_id] = calloc(1024*1024, 1);
-	if(NULL == huge_index[ind_id]){ 
-		SUBREADprintf("ERROR: No memory can be allocated.\nThe program has to terminate\n");
-		return -1;
-	}
-	unsigned int byte_value = huge_index[ ind_id ][huge_byte&0xfffff] ;
+	unsigned int byte_value = huge_index[ ind_id ][huge_byte&0xffffff] ;
 
-	int huge_value = (byte_value>> huge_offset) & 0x3;
-	if(huge_value <3){
+	int huge_value = (byte_value>> huge_offset) & 0xf;
+	if(huge_value <15){
 		huge_value ++;
-		huge_index[ ind_id ][huge_byte&0xfffff] = (byte_value & (~(0x3 << huge_offset))) | (huge_value << huge_offset);
+		byte_value = (byte_value & (~(0xf << huge_offset))) | (huge_value << huge_offset);
+		huge_index[ ind_id ][huge_byte&0xffffff] = (unsigned char)(byte_value&0xff);
 		return 0;
 	}
 	unsigned int times = 0;
 	int matched = gehash_get(tab, subr, &times, 1);
 	if(matched)
-	{
 		gehash_update(tab, subr, times+1);
-	}
 	else
-		if(gehash_insert(tab, subr,4, NULL)) return 1;
+		if(gehash_insert(tab, subr, 16, NULL)) return 1;
 	
 	return 0;
 }
@@ -480,26 +474,17 @@ int scan_gene_index(const char index_prefix [], char ** chro_files, int chro_fil
 	*actual_total_bases_inc_marging = padding_around_contigs;
 
 	gehash_t occurrence_table;
-	unsigned char * huge_index[1024];
-
-	for(i=0;i<1024;i++)
-	{
-		huge_index[i] = NULL;
+	unsigned char * huge_index[128];
 
-		if(0){//memory is allocated when needed. 
-			huge_index[i] = (unsigned char *)malloc(1024*1024); 
-			
-			if(!huge_index[i])
-			{
-				for(j=0;j<i;j++) free(huge_index[j]);
-				SUBREADprintf("ERROR: You need at least one point five gigabytes of memory for building the index.\n");
-				return -1;
-			}
-			memset(huge_index[i], 0 , 1024*1024);
+	for(i=0;i<128;i++) {
+		huge_index[i] = calloc(1024*1024*16,1);
+		if(NULL == huge_index[i]){ 
+			SUBREADprintf("ERROR: No memory can be allocated.\nThe program has to terminate\n");
+			return -1;
 		}
 	}
 
-	if(gehash_create_ex(&occurrence_table, 150000000, 0, SUBINDEX_VER0, 1, 0)) return 1;
+	if(gehash_create_ex(&occurrence_table, 500000, 0, SUBINDEX_VER0, 1, 0)) return 1;
 
 	gene_input_t ginp;
 
@@ -677,7 +662,7 @@ int scan_gene_index(const char index_prefix [], char ** chro_files, int chro_fil
 	}
 
 
-	for(i=0;i<1024;i++)
+	for(i=0;i<128;i++)
 		if(huge_index[i])  free(huge_index[i]);
 
 	gehash_destory(&occurrence_table);


=====================================
src/input-files.c
=====================================
@@ -2544,7 +2544,8 @@ int SAM_pairer_create(SAM_pairer_context_t * pairer, int all_threads, int bin_bu
 		pairer -> threads[x1].thread_id = x1;
 		pairer -> threads[x1].reads_in_SBAM = 0;
 		pairer -> threads[x1].input_buff_SBAM = malloc(pairer -> input_buff_SBAM_size);
-		pairer -> threads[x1].input_buff_BIN = malloc(pairer -> input_buff_BIN_size);
+		pairer -> threads[x1].input_buff_BIN_capacity = pairer -> input_buff_BIN_size;
+		pairer -> threads[x1].input_buff_BIN = malloc(pairer -> threads[x1].input_buff_BIN_capacity );
 
 		pairer -> threads[x1].input_buff_BIN_used = 0;
 		pairer -> threads[x1].orphant_table = HashTableCreate(pairer -> input_buff_SBAM_size / 100);
@@ -2615,7 +2616,7 @@ int SAM_pairer_read_BAM_block(FILE * fp, int max_read_len, char * inbuff) {
 		return -1;
 	}
 	if(gz_header_12[0]!=31 || gz_header_12[1]!=139){
-		SUBREADprintf("Unrecognized Gzip headers: %u, %u\nPlease make sure if the input file is in the BAM format.\n", gz_header_12[0], gz_header_12[1]);
+		//SUBREADprintf("Unrecognized Gzip headers: %u, %u\nPlease make sure if the input file is in the BAM format.\n", gz_header_12[0], gz_header_12[1]);
 		return -1;
 	}
 	unsigned short xlen = 0, bsize = 0;
@@ -2768,16 +2769,31 @@ int SAM_pairer_fetch_BAM_block(SAM_pairer_context_t * pairer , SAM_pairer_thread
 
 	inflateReset(&thread_context -> strm);
 
-	thread_context -> strm.avail_in = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr);
+	int lin, lout;
+
+	lin=thread_context -> strm.avail_in = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr);
 	thread_context -> strm.next_in = (unsigned char *)thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;
-	thread_context -> strm.avail_out = pairer -> input_buff_BIN_size - thread_context -> input_buff_BIN_used; 
+	
+	if( thread_context -> input_buff_BIN_capacity < thread_context -> input_buff_BIN_used + 128*1024){
+		thread_context -> input_buff_BIN_capacity =  max(thread_context -> input_buff_BIN_used, thread_context -> input_buff_BIN_capacity )*1.5;
+		if(thread_context -> input_buff_BIN_capacity > 1024*1024*1024){
+			SUBREADprintf("ERROR: buffer size larger than 1GB\n");
+			return 1;
+		}else{
+			//SUBREADprintf("Resize Buffer of Th_%d to %d (used %d); In_ava=%d - %d\n", thread_context -> thread_id,  thread_context -> input_buff_BIN_capacity, thread_context -> input_buff_BIN_used, thread_context -> input_buff_SBAM_used , thread_context -> input_buff_SBAM_ptr);
+		}
+		thread_context -> input_buff_BIN = realloc( thread_context -> input_buff_BIN , thread_context -> input_buff_BIN_capacity);
+		//SUBREADprintf("  PTR=%p\n",thread_context -> input_buff_BIN);
+		assert( thread_context -> input_buff_BIN );
+	}
+	lout=thread_context -> strm.avail_out = thread_context -> input_buff_BIN_capacity - thread_context -> input_buff_BIN_used; 
 	thread_context -> strm.next_out = (unsigned char *)thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_used;
 
 	int ret = inflate(&thread_context ->strm, Z_FINISH);
 	if(ret == Z_OK || ret == Z_STREAM_END)
 	{
-		int have = pairer -> input_buff_BIN_size - thread_context ->strm.avail_out - thread_context -> input_buff_BIN_used;
-		int used_BAM = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr) - thread_context -> strm.avail_in;
+		int have = lout - thread_context ->strm.avail_out;
+		int used_BAM = lin - thread_context -> strm.avail_in;
 		
 		//SUBREADprintf("ABBO TH %d : INFLATED BAM_CONSUMED: %d BIN_USED: %d => %d    NEED_FIND_START=%d\n", thread_context -> thread_id, used_BAM, thread_context -> input_buff_BIN_used , thread_context -> input_buff_BIN_used+have, thread_context -> need_find_start);
 		thread_context -> input_buff_BIN_used += have;
@@ -2785,16 +2801,24 @@ int SAM_pairer_fetch_BAM_block(SAM_pairer_context_t * pairer , SAM_pairer_thread
 
 		if(thread_context -> need_find_start){
 			int test_read_bin = SAM_pairer_find_start(pairer, thread_context);
-			//#warning "====================== FOR TESTINg FALL BACK ONLY, MUST REMOVE FROM RELEASE ============================"
 			if(test_read_bin<1 && thread_context -> input_buff_BIN_used >= 32  ){
 				pairer -> is_bad_format = 1;
-				//SUBREADprintf("ABBO : BAD_FMT 01\n");
+			//	#warning "BADFORMAT-DEBUG"
+			//	SUBREADprintf("ABBO : BAD_FMT 01\n");
 			}
 		}
+		//SUBREADprintf("FETCHED BLOCK DECOMP=%d FROM COMP=%d\n", have, used_BAM);
 	} else {
-		SUBREADprintf("GZIP ERROR:%d\n", ret);
+		if(ret == -5){
+			SUBREADprintf("Cannot parse the input BAM file. If the BAM file contains long reads, please run featureCounts on the long-read mode.\n");
+		}else{
+			SUBREADprintf("GZIP ERROR:%d\n", ret);
+		}
+		pairer -> is_bad_format = 1;
+		pairer -> is_internal_error = 1;
 		return 1;
 	}
+
 	return 0;
 }
 
@@ -2834,7 +2858,7 @@ void SAM_pairer_reduce_BAM_bin(SAM_pairer_context_t * pairer, SAM_pairer_thread_
 	
 }
 
-#define MAX_BIN_RECORD_LENGTH ( 24*1024)
+#define MAX_BIN_RECORD_LENGTH ( 20*1024*1024)
 int reduce_SAM_to_BAM(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, int include_sequence);
 int is_read_bin(char * bin, int bin_len, int max_refID);
 
@@ -2886,7 +2910,8 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
 					ref_bin_len += 4;
 				}
 
-				is_OK = is_OK || pairer -> output_header(pairer, thread_context -> thread_id, 0, pairer -> BAM_n_ref , header_txt , ref_bin_len );
+				is_OK = is_OK || pairer -> output_header(pairer, thread_context -> thread_id, 0, pairer -> BAM_n_ref , header_txt ,  ref_bin_len );
+				//SUBREADprintf("TFMT:HEADER REFS=%d TXTS=%d SIGN=%u\n", pairer -> BAM_n_ref, pairer->BAM_l_text, bam_signature);
 
 				if(header_txt) free(header_txt);
 				if(is_OK){
@@ -2895,11 +2920,7 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
 				}
 
 				pairer -> BAM_header_parsed = 1;
-				//if(pairer -> display_progress)
-				//	SUBREADprintf("\nThe header was parsed: %d reference sequences were found.\nScanning the input file.\n", pairer -> BAM_n_ref);
 				SAM_pairer_fetch_BAM_block(pairer, thread_context);
-
-				//SUBREADprintf("HEAD_FINISHED, BAD=%d\n", pairer -> is_bad_format);
 			}
 
 			if(pairer -> is_bad_format) return 0;
@@ -2935,15 +2956,19 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
 				}
 			}
 
+			//SUBREADprintf("TFMT:RLEN=%d\n", record_len);
+
 			if(!pairer -> is_bad_format){
 				unsigned int  seq_len = 0;
 				thread_context -> input_buff_BIN_ptr += 4;
 				memcpy(&seq_len, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr + 16, 4);
 				
 				//SUBREADprintf("REDUCE_2: record %u, %u\n", record_len, seq_len);
-				if(record_len < 32 || record_len > min(MAX_BIN_RECORD_LENGTH,60000) || seq_len >= pairer -> long_read_minimum_length){
+	//			#warning "=========== CHECK IF '0 && ' IS CORRECT ==========="
+				if(record_len < 32 || (0 && record_len > min(MAX_BIN_RECORD_LENGTH,60000))|| seq_len >= pairer -> long_read_minimum_length){
 					if(seq_len >= pairer -> long_read_minimum_length) pairer -> is_single_end_mode = 1;
-				//	SUBREADprintf("BADFMT: THID=%d; rlen %d; seqlen %d; room %d < %d\n",  thread_context -> thread_id,  record_len, seq_len);
+			//		#warning "BADFORMAT-DEBUG"
+			//		SUBREADprintf("BADFMT: THID=%d; rlen %d; seqlen %d; BIN_PTR=%d\n",  thread_context -> thread_id,  record_len, seq_len, thread_context -> input_buff_BIN_ptr);
 					pairer -> is_bad_format = 1;
 					return 0;
 				}
@@ -2951,16 +2976,6 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
 				(* bin_where) = thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr - 4;
 				(* bin_len) = record_len + 4;
 
-				if(0){
-					unsigned int Treclen=0, Tseqlen = 0;
-					memcpy(&Treclen, *bin_where, 4);
-					memcpy(&Tseqlen, (*bin_where)+20, 4);
-					assert(record_len == Treclen);
-					int is_bin_start = is_read_bin((char *)*bin_where, thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_used +4 , pairer -> BAM_n_ref);
-					SUBREADprintf("TREC TH_%d: reclen=%u, seqlen=%u, IS_BIN=%d\n", thread_context -> thread_id, Treclen, Tseqlen, is_bin_start);
-					assert(is_bin_start == 1);
-				}
-
 				thread_context -> input_buff_BIN_ptr += record_len;
 			}
 			return 1;
@@ -4473,7 +4488,7 @@ int SAM_pairer_find_start(SAM_pairer_context_t * pairer , SAM_pairer_thread_t *
 		}
 	}
 	thread_context -> input_buff_BIN_ptr = start_pos;
-	//SUBREADprintf("ABBO TH %d : FOUND START AT %d in %d\n", thread_context -> thread_id , start_pos, thread_context -> input_buff_BIN_used);
+//	SUBREADprintf("ABBO TH %d : FOUND START AT %d in %d\n", thread_context -> thread_id , start_pos, thread_context -> input_buff_BIN_used);
 	return start_pos < min(MAX_BIN_RECORD_LENGTH, thread_context -> input_buff_BIN_used);
 }
 
@@ -4700,70 +4715,81 @@ int fix_load_next_block(FILE * in, char * binbuf, z_stream * strm){
 }
 
 int  fix_write_block(FILE * out, char * bin, int binlen, z_stream * strm){
-	char * bam_buf = malloc(70000);
-	int x1, bam_len = 0;
-
-	if(binlen > 0){
-		strm -> avail_in = binlen;
-		strm -> next_in = (unsigned char*)bin;
-		strm -> avail_out = 70000;
-		strm -> next_out = (unsigned char*)bam_buf;
-		deflate(strm , Z_FINISH);
-		bam_len = 70000 - strm -> avail_out;
-		deflateReset(strm);
-	}else{
-		z_stream nstrm;
-		nstrm.zalloc = Z_NULL;
-		nstrm.zfree = Z_NULL;
-		nstrm.opaque = Z_NULL;
-		nstrm.avail_in = 0;
-		nstrm.next_in = Z_NULL;
-	
-		deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL, Z_DEFLATED,
-			PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	int is_end_mode = binlen == 0, written=0;
+	//SUBREADprintf("FIX_WRTR : %d\n", binlen);
 
-		nstrm.avail_in = 0;
-		nstrm.next_in = (unsigned char*)bin;
-		nstrm.avail_out = 70000;
-		nstrm.next_out = (unsigned char*)bam_buf;
-		deflate(&nstrm, Z_FINISH);
-		bam_len = 70000 - nstrm.avail_out;
-		deflateEnd(&nstrm);
-	}
+	while(1){
+		if(binlen - written<1 && !is_end_mode) return 0;
 
-	//SUBREADprintf("FIX_COMPR: %d -> %d  RET=%d\n", binlen , bam_len, retbam);
-
-	unsigned int crc0 = crc32(0, NULL, 0);
-	unsigned int crc = crc32(crc0, (unsigned char *) bin , binlen);
-
-	fputc(31, out);
-	fputc(139, out);
-	fputc(8, out);
-	fputc(4, out);
-	fputc(0, out);
-	fputc(0, out);
-	fputc(0, out);
-	fputc(0, out);
-
-	fputc(0, out);//XFL
-	fputc(0xff, out);//OS
-
-	x1 = 6;
-	fwrite( &x1, 2, 1 , out );
-	fputc( 66, out );
-	fputc( 67, out );
-	x1 = 2;
-	fwrite( &x1, 2, 1 , out );
-	x1 = bam_len + 19 + 6;
-	fwrite( &x1, 2, 1 , out );
-	int write_len = fwrite( bam_buf , 1,bam_len, out );
+		char * bam_buf = malloc(70000);
+		int x1, bam_len = 0, old_in= 0, this_sec_len = 0, old_start = written;
 	
-	fwrite( &crc, 4, 1, out );
-	fwrite( &binlen, 4, 1, out );
-
-	free(bam_buf);
-
-	if(write_len < bam_len)return 1;
+		if(binlen - written > 0){
+			old_in = strm -> avail_in = binlen - written;
+			strm -> next_in = (unsigned char*)bin + written;
+			strm -> avail_out = 70000;
+			strm -> next_out = (unsigned char*)bam_buf;
+			deflate(strm , Z_FINISH);
+			bam_len = 70000 - strm -> avail_out;
+			this_sec_len = old_in - strm -> avail_in;
+			written += this_sec_len;
+
+			deflateReset(strm);
+		}else{
+			z_stream nstrm;
+			nstrm.zalloc = Z_NULL;
+			nstrm.zfree = Z_NULL;
+			nstrm.opaque = Z_NULL;
+			nstrm.avail_in = 0;
+			nstrm.next_in = Z_NULL;
+		
+			deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL, Z_DEFLATED,
+				PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	
+			nstrm.avail_in = 0;
+			nstrm.next_in = (unsigned char*)bin;
+			nstrm.avail_out = 70000;
+			nstrm.next_out = (unsigned char*)bam_buf;
+			deflate(&nstrm, Z_FINISH);
+			bam_len = 70000 - nstrm.avail_out;
+			deflateEnd(&nstrm);
+		}
+	
+		//SUBREADprintf("FIX_COMPR: %d -> %d  RET=%d\n", binlen , bam_len, retbam);
+	
+		unsigned int crc0 = crc32(0, NULL, 0);
+		unsigned int crc = crc32(crc0, (unsigned char *) bin + old_start, this_sec_len);
+	
+		fputc(31, out);
+		fputc(139, out);
+		fputc(8, out);
+		fputc(4, out);
+		fputc(0, out);
+		fputc(0, out);
+		fputc(0, out);
+		fputc(0, out);
+	
+		fputc(0, out);//XFL
+		fputc(0xff, out);//OS
+	
+		x1 = 6;
+		fwrite( &x1, 2, 1 , out );
+		fputc( 66, out );
+		fputc( 67, out );
+		x1 = 2;
+		fwrite( &x1, 2, 1 , out );
+		x1 = bam_len + 19 + 6;
+		fwrite( &x1, 2, 1 , out );
+		int write_len = fwrite( bam_buf , 1,bam_len, out );
+		
+		fwrite( &crc, 4, 1, out );
+		fwrite( &binlen, 4, 1, out );
+	
+		free(bam_buf);
+	
+		if(write_len < bam_len)return 1;
+		if(binlen<1) return 0;
+	}
 	return 0;
 }
 
@@ -4775,7 +4801,7 @@ int  fix_write_block(FILE * out, char * bin, int binlen, z_stream * strm){
 
 #define FIX_FLASH_OUT { if(out_bin_ptr > 0)disk_is_full |= fix_write_block(new_fp, out_bin, out_bin_ptr, &out_strm); out_bin_ptr = 0; }
 
-#define FIX_APPEND_OUT(p, c) { if(out_bin_ptr > 60000){FIX_FLASH_OUT} ;  memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
+#define FIX_APPEND_OUT(p, c) { if(out_bin_ptr > 60002){FIX_FLASH_OUT} ;  memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
 #define FIX_APPEND_READ(p, c){ memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ;  }
 
 int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
@@ -4786,8 +4812,8 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
 	sprintf(tmpfname, "%s.fixbam", pairer -> tmp_file_prefix);
 
 	FILE * new_fp = f_subr_open(tmpfname, "wb");
-	char * in_bin = malloc(140000);
-	char * out_bin = malloc(70000);
+	char * in_bin = malloc(1024*70);
+	char * out_bin = malloc(20*1024*1024);
 
 	z_stream in_strm;
 	z_stream out_strm;
@@ -4849,7 +4875,7 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
 		content_size += (nch << (8 * x1));
 	}
 	FIX_APPEND_OUT(&content_size, 4);
-	//SUBREADprintf("FIX: CHROLEN=%d\n", content_size);
+	//SUBREADprintf("LONGFIX: CHROLEN=%d\n", content_size);
 	for(content_count = 0; content_count < content_size; content_count++){
 		int namelen = 0;
 		for(x1 = 0; x1 < 4; x1++){
@@ -4864,7 +4890,7 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
 			FIX_APPEND_READ(&nch, 1);
 		}
 
-		if(out_bin_ptr > 60000){
+		if(out_bin_ptr > 60003){
 			FIX_FLASH_OUT;
 		}
 	}
@@ -4891,9 +4917,6 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
 			block_size += (nch << (8 * x1));
 		}
 
-		if(block_size + out_bin_ptr > 60000 && !pairer -> tiny_mode)
-			FIX_FLASH_OUT;
-
 		FIX_APPEND_READ(&block_size, 4);
 
 		if(pairer -> tiny_mode){
@@ -4927,7 +4950,9 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
 					}
 					break;
 				}
-				if( x1 == 32 && block_size > 60000 ){
+
+	//			#warning "================ THIS BLOCK WAS DISABLED ON 03OCT2019; MAKE SURE IT WORKS ON LONG READS/LONG READ RECORDS =============="
+				if(0 && x1 == 32 && block_size > 60000 ){
 					print_in_box(80,0,0,"");
 					print_in_box(80,0,0,"   ERROR: Alignment record is too long.");
 					print_in_box(80,0,0,"          Please use the long read mode.");
@@ -5054,10 +5079,9 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
 			}
 		}
 
-		//#warning "========= COMMENT NEXT ============="
-		//SUBREADprintf("OUTBIN_PTR=%d\n", out_bin_ptr);
 		reads ++;
 		if(out_bin_ptr > 60000){
+	//		SUBREADprintf("WRIR3: TINY=%d\n", pairer -> tiny_mode);
 			FIX_FLASH_OUT;
 		}
 	}


=====================================
src/input-files.h
=====================================
@@ -102,6 +102,7 @@ typedef struct {
 	unsigned char * input_buff_BIN;
 	int input_buff_BIN_used;
 	int input_buff_BIN_ptr;
+	int input_buff_BIN_capacity;
 	int orphant_block_no;
 	int need_find_start;
 	srInt_64 orphant_space;


=====================================
src/makefile.version
=====================================
@@ -1,4 +1,4 @@
-SUBREAD_VERSION_BASE=2.0.0
+SUBREAD_VERSION_BASE=2.0.1
 SUBREAD_VERSION_DATE=$(SUBREAD_VERSION_BASE)-$(shell date +"%d%b%Y")
 SUBREAD_VERSION="$(SUBREAD_VERSION_DATE)"
 SUBREAD_VERSION="$(SUBREAD_VERSION_BASE)"


=====================================
src/readSummary.c
=====================================
@@ -64,7 +64,7 @@
 #define FC_FLIST_SPLITOR "\026"
 
 typedef struct{
-	char gene_name[FEATURE_NAME_LENGTH];
+	char * gene_name;
 	unsigned int pos_first_base;
 	unsigned int pos_last_base;
 } fc_junction_gene_t;
@@ -97,7 +97,7 @@ typedef struct {
 } fc_junction_info_t;
 
 typedef struct {
-	unsigned int feature_name_pos;
+	srInt_64 feature_name_pos;
 	unsigned int start;
 	unsigned int end;
 	unsigned int sorted_order;
@@ -280,8 +280,8 @@ typedef struct {
 	char * debug_command;
 	char * unistr_buffer_space;
 	srInt_64 max_BAM_header_size;
-	unsigned int unistr_buffer_size;
-	unsigned int unistr_buffer_used;
+	srInt_64 unistr_buffer_size;
+	srInt_64 unistr_buffer_used;
 	HashTable * scRNA_sample_sheet_table;
 	ArrayList * scRNA_sample_barcode_list;
 	ArrayList * scRNA_cell_barcodes_array;
@@ -323,7 +323,7 @@ typedef struct {
 	char ** exontable_chr;
 	srInt_64 * exontable_start;
 	srInt_64 * exontable_stop;
-	char feature_name_column[100];
+	char feature_name_column[2000];
 	char gene_id_column[100];
 
 	srInt_64 * exontable_block_end_index;
@@ -581,19 +581,19 @@ int calc_junctions_from_cigar(fc_thread_global_context_t * global_context, int f
 }
 
 
-unsigned int unistr_cpy(fc_thread_global_context_t * global_context, char * str, int strl)
+srInt_64 unistr_cpy(fc_thread_global_context_t * global_context, char * str, int strl)
 {
-	unsigned int ret;
+	srInt_64 ret;
 	if(global_context->unistr_buffer_used + strl >= global_context->unistr_buffer_size-1)
 	{
-		if( global_context->unistr_buffer_size < (1000u*1000u*1000u*2)) // 2GB
+		if( global_context->unistr_buffer_size < (1000llu*1000u*1000u*32)) // 32GB
 		{
 			global_context -> unistr_buffer_size = global_context->unistr_buffer_size /2 *3;
 			global_context -> unistr_buffer_space = realloc(global_context -> unistr_buffer_space, global_context->unistr_buffer_size);
 		}
 		else
 		{
-			SUBREADprintf("Error: exceed memory limit (4GB) for storing annotation data.\n");
+			SUBREADprintf("Error: exceed memory limit (32GB) for storing feature names.\n");
 			return 0xffffffffu;
 		}
 	}
@@ -823,12 +823,18 @@ int fc_strcmp(const void * s1, const void * s2)
 	return strcmp((char*)s1, (char*)s2);
 }
 
+void junc_gene_free(void *vv){
+	fc_junction_gene_t *v = vv;
+	free(v -> gene_name);
+	free(v);
+}
+
 void register_junc_feature(fc_thread_global_context_t *global_context, char * feature_name, char * chro, unsigned int start, unsigned int stop){
 	HashTable * gene_table = HashTableGet(global_context -> junction_features_table, chro);
 	//SUBREADprintf("REG %s : %p\n", chro, gene_table);
 	if(NULL == gene_table){
 		gene_table = HashTableCreate(48367);
-		HashTableSetDeallocationFunctions(gene_table, NULL, free);
+		HashTableSetDeallocationFunctions(gene_table, NULL, junc_gene_free);
 		HashTableSetKeyComparisonFunction(gene_table, fc_strcmp);
 		HashTableSetHashFunction(gene_table, fc_chro_hash);
 
@@ -839,7 +845,7 @@ void register_junc_feature(fc_thread_global_context_t *global_context, char * fe
 	fc_junction_gene_t * gene_info = HashTableGet(gene_table, feature_name);
 	if(NULL == gene_info){
 		gene_info = malloc(sizeof(fc_junction_gene_t));
-		strcpy(gene_info -> gene_name, feature_name);
+		gene_info -> gene_name = strdup(feature_name);
 		gene_info -> pos_first_base = start;
 		gene_info -> pos_last_base = stop;
 
@@ -856,6 +862,18 @@ void free_bucket_table_list(void * pv){
 	free(list);
 }
 
+int match_feature_name_column(char * infile, char * needed){
+	char * ptt = NULL;
+	char lneeded[strlen(needed)+1];
+	strcpy(lneeded, needed);
+	char * t1 = strtok_r(lneeded, ",", &ptt);
+	while(t1){
+		if(strcmp(t1, infile)==0) return 1;
+		t1 = strtok_r(NULL,",", &ptt);
+	}
+	return 0;
+}
+
 #define JUNCTION_BUCKET_STEP (128*1024)
 
 int locate_junc_features(fc_thread_global_context_t *global_context, char * chro, unsigned int pos, fc_junction_gene_t ** ret_info, int max_ret_info_size){
@@ -906,10 +924,11 @@ int locate_junc_features(fc_thread_global_context_t *global_context, char * chro
 // Memory will be allowcated in this function. The pointer is saved in *loaded_features.
 // The invoker must release the memory itself.
 
+#define MAX_ANNOT_LINE_LENGTH 200000
 int load_feature_info(fc_thread_global_context_t *global_context, const char * annotation_file, int file_type, fc_feature_info_t ** loaded_features)
 {
 	unsigned int features = 0, xk1 = 0, lineno=0;
-	char * file_line = malloc(MAX_LINE_LENGTH+1);
+	char * file_line = malloc(MAX_ANNOT_LINE_LENGTH+1);
 	autozip_fp anno_fp;
 	int apret = autozip_open(annotation_file, &anno_fp);
 	int is_GFF_warned = 0;
@@ -937,8 +956,8 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
 	// also create chro_name_table : chro_name => fc_chromosome_index_info 
 	while(0)
 	{
-		//char * fgets_ret = fgets(file_line, MAX_LINE_LENGTH, fp);
-		int rchars = autozip_gets(&anno_fp, file_line, MAX_LINE_LENGTH);
+		//char * fgets_ret = fgets(file_line, MAX_ANNOT_LINE_LENGTH, fp);
+		int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
 		char * token_temp = NULL, *chro_name;
 		fc_chromosome_index_info * chro_stab;
 		unsigned int feature_pos = 0;
@@ -951,7 +970,7 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
 			chro_name = strtok_r(file_line,"\t",&token_temp);
 			strtok_r(NULL,"\t", &token_temp); // lib_name (not needed)
 			char * feature_type = strtok_r(NULL,"\t", &token_temp);
-			if(strcmp(feature_type, global_context -> feature_name_column)==0)
+			if(match_feature_name_column(feature_type, global_context -> feature_name_column))
 			{
 				strtok_r(NULL,"\t", &token_temp); // feature_start
 				feature_pos = atoi(strtok_r(NULL,"\t", &token_temp));// feature_end
@@ -1004,8 +1023,12 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
 	while(1)
 	{
 		int is_gene_id_found = 0;
-		int rchars = autozip_gets(&anno_fp, file_line, MAX_LINE_LENGTH);
+		int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
 		if(rchars < 1) break;
+		if(rchars >= MAX_ANNOT_LINE_LENGTH - 1){
+			SUBREADprintf("\nERROR: the %u-th line in your GTF file is extremely long (longer than %d bytes).\nThe program cannot parse this line.\n", lineno+1, MAX_ANNOT_LINE_LENGTH-1);
+			return -2;
+		}
 
 		lineno++;
 		char * token_temp = NULL;
@@ -1018,9 +1041,12 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
 			}
 			char * feature_name = strtok_r(file_line,"\t",&token_temp);
 			int feature_name_len = strlen(feature_name);
-			if(feature_name_len > FEATURE_NAME_LENGTH) feature_name[FEATURE_NAME_LENGTH -1 ] = 0;
+			if(feature_name_len > FEATURE_NAME_LENGTH-2){
+				SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH -2);
+				feature_name[FEATURE_NAME_LENGTH -2 ] = 0;
+			}
 
-			unsigned int genename_pos = unistr_cpy(global_context, (char *)feature_name, feature_name_len);
+			srInt_64 genename_pos = unistr_cpy(global_context, (char *)feature_name, feature_name_len);
 			
 	//		SUBREADprintf("REALL: '%s'=%d  [%d] %p  POS=%d\t\tOLD_NAME_POS=%d\n" , feature_name, feature_name_len , xk1, ret_features+xk1, genename_pos, xk1>0?ret_features[xk1-1].feature_name_pos:-1);
 			ret_features[xk1].feature_name_pos = genename_pos;
@@ -1028,7 +1054,7 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
 			char * seq_name = strtok_r(NULL,"\t", &token_temp);
 			int chro_name_len = strlen(seq_name);
 			if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
-			unsigned int chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
+			srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
 			global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
 
 
@@ -1125,7 +1151,7 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
 			char * seq_name = strtok_r(file_line,"\t",&token_temp);
 			strtok_r(NULL,"\t", &token_temp);// source
 			char * feature_type = strtok_r(NULL,"\t", &token_temp);// feature_type
-			if(strcmp(feature_type, global_context -> feature_name_column)==0) {
+			if(match_feature_name_column(feature_type, global_context -> feature_name_column)) {
 
 				if(xk1 >= ret_features_size) {
 					ret_features_size *=2;
@@ -1221,12 +1247,15 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
 				}
 
 				int feature_name_len = strlen(feature_name_tmp);
-				if(feature_name_len > FEATURE_NAME_LENGTH) feature_name_tmp[FEATURE_NAME_LENGTH -1 ] = 0;
+				if(feature_name_len > FEATURE_NAME_LENGTH-2){
+					SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH-2);
+					feature_name_tmp[FEATURE_NAME_LENGTH -2 ] = 0;
+				}
 				ret_features[xk1].feature_name_pos = unistr_cpy(global_context, (char *)feature_name_tmp, feature_name_len);
 
 				int chro_name_len = strlen(seq_name);
 				if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
-				unsigned int chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
+				srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
 				global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
 
 				ret_features[xk1].chro_name_pos_delta = chro_name_pos - ret_features[xk1].feature_name_pos;
@@ -3991,7 +4020,7 @@ void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_t
 							add_scRNA_read_to_pool(global_context, thread_context, assignment_target_number, read_name);
 						}
 					}else if(global_context -> is_multi_overlap_allowed) {
-						#define GENE_NAME_LIST_BUFFER_SIZE 5000
+						#define GENE_NAME_LIST_BUFFER_SIZE (FEATURE_NAME_LENGTH * 50) 
 
 						char final_feture_names[GENE_NAME_LIST_BUFFER_SIZE];
 						int assigned_no = 0, xk1;
@@ -4118,7 +4147,7 @@ void scRNA_merge_thread_umitables(void *ky, void *val, HashTable * tab){
 }
 
 #define MIN_EXPRESSED_UMIS_PER_CELL 100 
-#define MIN_EXPRESSED_UMIS_PER_GENE 3 
+#define MIN_EXPRESSED_UMIS_PER_GENE (3-2) 
 
 void scRNA_merge_write_copy_gene_nos(void * ky, void * va , HashTable *tab){
 	HashTable * used_gene_table = tab -> appendix2;
@@ -4127,10 +4156,10 @@ void scRNA_merge_write_copy_gene_nos(void * ky, void * va , HashTable *tab){
 	srInt_64 UMIs = HashTableGet(used_gene_table, ky)-NULL;
 	HashTablePut(used_gene_table, ky, NULL + UMIs + one_sampl_gene_to_cell_umis -> numOfElements);
 }
-int scRNA_merge_write_zero_gene(fc_thread_global_context_t * global_context, char * linebuf, ArrayList * used_cell_barcode_list){
+int scRNA_merge_write_zero_gene(fc_thread_global_context_t * global_context, char * linebuf, ArrayList * high_confid_barcode_index_list){
 	int ret=0;
 	srInt_64 x1;
-	for(x1=0;x1<used_cell_barcode_list->numOfElements;x1++)ret += sprintf(linebuf + ret,"\t0");
+	for(x1=0;x1<high_confid_barcode_index_list->numOfElements;x1++)ret += sprintf(linebuf + ret,"\t0");
 	return ret;
 }
 
@@ -4259,7 +4288,7 @@ ArrayList * scRNA_reduce_cellno_umino_p1_list(fc_thread_global_context_t * globa
 	return ret;
 }
 
-int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context,  HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList ** used_cell_barcode_list , ArrayList * umi_merged_list, ArrayList * gen_no_list,  srInt_64 used_gene_i, char * linebuf){
+int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context,  HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList ** high_confid_barcode_index_list , ArrayList * umi_merged_list, ArrayList * gen_no_list,  srInt_64 used_gene_i, char * linebuf){
 	srInt_64 gene_no = ArrayListGet(gen_no_list, used_gene_i) -NULL -1, x1;
 	srInt_64 line_number = global_context -> is_gene_level? gene_no :  HashTableGet(global_context -> lineno_2_sortedno_tab, NULL+1+gene_no) -NULL -1; // convert line_no to sorted_order
 	int linebuf_ptr = 0;
@@ -4278,12 +4307,12 @@ int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context,  Hash
 	for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
 		ArrayList* cellno_umino_p1_list = HashTableGet(merged_tables_gene_to_cell_umis[x1], NULL + gene_no + 1);
 		if(NULL == cellno_umino_p1_list){
-			linebuf_ptr += scRNA_merge_write_zero_gene(global_context, linebuf+ linebuf_ptr, used_cell_barcode_list[x1]);
+			linebuf_ptr += scRNA_merge_write_zero_gene(global_context, linebuf+ linebuf_ptr, high_confid_barcode_index_list[x1]);
 		}else{
 			int tab_cell_ptr=0, used_cell_bc_ptr ;
 
-			for(used_cell_bc_ptr = 0; used_cell_bc_ptr < used_cell_barcode_list[x1] -> numOfElements; used_cell_bc_ptr++){
-				srInt_64 used_cell_no = ArrayListGet(used_cell_barcode_list[x1], used_cell_bc_ptr) - NULL -1;
+			for(used_cell_bc_ptr = 0; used_cell_bc_ptr < high_confid_barcode_index_list[x1] -> numOfElements; used_cell_bc_ptr++){
+				srInt_64 used_cell_no = ArrayListGet(high_confid_barcode_index_list[x1], used_cell_bc_ptr) - NULL -1;
 
 				int write_cnt=0;
 				if(tab_cell_ptr<cellno_umino_p1_list ->numOfElements)while(1){
@@ -4308,73 +4337,282 @@ int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context,  Hash
 	return total_count_in_row;
 }
 
-// this function writes a single count table.
-// Rows: genes
-// Cols: Cell_Barcode +"."+ SampleName
-void scRNA_merged_to_tables_write( fc_thread_global_context_t * global_context, HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList * merged_umi_list){
-	char ofname[MAX_FILE_NAME_LENGTH + 20];
-	sprintf(ofname,"%s.scRNA.table",global_context->input_file_name);
-	FILE * ofp = fopen( ofname , "w" );
-	int x1, cell_items_row = 0;
-	srInt_64 xkk;
-	ArrayList ** used_cell_barcode_list = malloc(sizeof(void*)* global_context -> scRNA_sample_sheet_table -> numOfElements );
+#define SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION 0.01
+void scRNA_merged_ambient_rescure(fc_thread_global_context_t * global_context, HashTable * merged_gene_to_cell_umis_tab, HashTable * used_cell_barcode_tab, ArrayList * this_sample_ambient_rescure_candi, ArrayList * this_sample_45k_90k_barcode_idx, ArrayList * highconf_list){
+	ArrayList * sorted_idx = HashTableSortedIndexes( used_cell_barcode_tab, 1);
+	HashTable * highconf_list_tab = ArrayListToLookupTable_Int(highconf_list);
+	srInt_64 x1;
+	for(x1=0; x1 < sorted_idx -> numOfElements; x1++){
+		void * this_bc_pnt = ArrayListGet(sorted_idx ,  x1);
+		if(!HashTableGet(highconf_list_tab, this_bc_pnt)) break; // assuming that all high-umi barcodes are high-confident, this makes x1 being the # of total high-confidence barcodes.	
+	}
+	if(x1 >0){
+		srInt_64 median_umis = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx ,  (x1-1)/2))-NULL;
+		srInt_64 median_umis_001_cut = (srInt_64)(median_umis *1. *SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION);
+		//SUBREADprintf("MEDIANTEST : X1 = %lld, MID = %lld, MID_CUT = %lld\n", x1, median_umis, median_umis_001_cut);
+		for(x1=0; x1 < sorted_idx -> numOfElements; x1++){
+			void * this_bc_pnt = ArrayListGet(sorted_idx ,  x1);
+			if(HashTableGet(highconf_list_tab, this_bc_pnt)) continue; // it is in high-conf list
+			srInt_64 this_bc_umis = HashTableGet(used_cell_barcode_tab, this_bc_pnt) - NULL;
+			if(this_bc_umis < median_umis_001_cut) break;
+			ArrayListPush(this_sample_ambient_rescure_candi, this_bc_pnt-1);
+		}
+	}
+	for(x1=45000; x1 < sorted_idx -> numOfElements; x1++){
+		if(x1 >= 90000) break;
+		ArrayListPush(this_sample_45k_90k_barcode_idx, ArrayListGet(sorted_idx ,  x1) );
+	}
+	ArrayListDestroy(sorted_idx);
+	//SUBREADprintf("AMBIENT_CANDIDATES = %lld   45K-90K = %lld\n", this_sample_ambient_rescure_candi -> numOfElements, this_sample_45k_90k_barcode_idx-> numOfElements);
+	HashTableDestroy(highconf_list_tab);
+}
 
-	fprintf(ofp, global_context -> is_gene_level?"Geneid":"FeatureNo");
-	for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
-//		SUBREADprintf("CELL BARCODE USED TAB = %ld  ; GENE_TO_CELL&UMI TAB = %ld ; for %d sample\n", used_cell_barcode_tabs[x1]-> numOfElements, merged_tables_gene_to_cell_umis[x1]-> numOfElements, x1);
-		ArrayList * raw_used_cells = HashTableKeys(used_cell_barcode_tabs[x1]);
-		ArrayListSort(raw_used_cells, NULL);
-		used_cell_barcode_list[x1] = ArrayListCreate(raw_used_cells -> numOfElements);
-		for(xkk = 0; xkk < raw_used_cells -> numOfElements; xkk++){
-			srInt_64 cell_no = ArrayListGet(raw_used_cells, xkk) - NULL - 1;
-			srInt_64 umis_of_cell = HashTableGet(used_cell_barcode_tabs[x1], NULL + 1 + cell_no)-NULL;
-			if(umis_of_cell >= MIN_EXPRESSED_UMIS_PER_CELL) ArrayListPush( used_cell_barcode_list[x1], NULL + 1 + cell_no );
+
+#define SCRNA_BOOTSTRAP_HIGH_INDEX 30
+#define SCRNA_BOOTSTRAP_SAMPLING_TIMES 100
+
+void scRNA_merged_bootstrap_a_sample(fc_thread_global_context_t * global_context, HashTable * merged_gene_to_cell_umis_tab, HashTable * used_cell_barcode_tab, ArrayList * merged_umi_list, ArrayList * highconf_list){
+	ArrayList * sorted_idx = HashTableSortedIndexes( used_cell_barcode_tab, 1);
+	srInt_64 UMIs_30th_high = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx ,  SCRNA_BOOTSTRAP_HIGH_INDEX -1 ))-NULL;
+
+	if(0){
+		SUBREADprintf("HIGHEST 30 TH UMIs = %lld\n", UMIs_30th_high);
+		UMIs_30th_high = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx ,  SCRNA_BOOTSTRAP_HIGH_INDEX -0 ))-NULL;
+		SUBREADprintf("HIGHEST 31 TH UMIs = %lld\n", UMIs_30th_high);
+		UMIs_30th_high = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx ,  SCRNA_BOOTSTRAP_HIGH_INDEX -2 ))-NULL;
+		SUBREADprintf("HIGHEST 29 TH UMIs = %lld\n", UMIs_30th_high);
+	}
+
+	srInt_64 x2,x1;
+	srInt_64 this_total = 0;
+	for(x1 = 0; x1 < SCRNA_BOOTSTRAP_SAMPLING_TIMES; x1++){
+		for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
+			void * barcode_idx = ArrayListRandom(sorted_idx);
+			srInt_64 this_umis = HashTableGet( used_cell_barcode_tab, barcode_idx )-NULL;
+			if(this_umis >= UMIs_30th_high/10) this_total ++;
 		}
-		ArrayListDestroy(raw_used_cells);
-		char * sample_name = ArrayListGet(global_context -> scRNA_sample_id_to_name, x1);
+	}
+	this_total /= SCRNA_BOOTSTRAP_SAMPLING_TIMES;
+	if(0) SUBREADprintf("FINAL SELECTION_IDX =  %lld\n",this_total);
+	for(x1 = 0; x1 < min(sorted_idx -> numOfElements, this_total) ; x1++) ArrayListPush(highconf_list, ArrayListGet( sorted_idx, x1 ) -1 );
+}
+
+void build_exon_name(fc_thread_global_context_t * global_context, fc_feature_info_t * loaded_features, int i, char * exon_name){
+    sprintf(exon_name, "%s:fc,spl:%s:fc,spl:%u:fc,spl:%u:fc,spl:%c", global_context -> unistr_buffer_space + loaded_features[i].feature_name_pos,
+       global_context-> unistr_buffer_space + loaded_features[i].feature_name_pos + loaded_features[i].chro_name_pos_delta,
+       loaded_features[i].start, loaded_features[i].end, loaded_features[i].is_negative_strand == 1?'N':(  loaded_features[i].is_negative_strand ==  0? 'P':'X'));
+}
 
-		for(xkk = 0; xkk < used_cell_barcode_list[x1]->numOfElements; xkk++){
-			srInt_64 cell_bcno = ArrayListGet(used_cell_barcode_list[x1], xkk)-NULL-1;
-			char * cell_name = ArrayListGet( global_context -> scRNA_cell_barcodes_array, cell_bcno );
-			fprintf(ofp, "\t%s.%s", cell_name, sample_name);
+int scRNA_merged_write_sparse_matrix(fc_thread_global_context_t * global_context, HashTable * merged_tab_gene_to_cell_umis, HashTable * cell_barcode_tab, ArrayList * used_cell_barcodes, int sample_index, char * tabtype, fc_feature_info_t* loaded_features){
+	int x1,x2;
+
+	char ofname[MAX_FILE_NAME_LENGTH + 100];
+	sprintf(ofname,"%s.scRNA.%03d.%s.BCtab",global_context->input_file_name, sample_index+1,tabtype);
+	FILE * ofp_bcs = fopen( ofname , "w" );
+	sprintf(ofname,"%s.scRNA.%03d.%s.GENEtab",global_context->input_file_name, sample_index+1,tabtype);
+	FILE * ofp_genes = fopen( ofname , "w" );
+	sprintf(ofname,"%s.scRNA.%03d.%s.spmtx",global_context->input_file_name, sample_index+1,tabtype);
+	FILE * ofp_mtx = fopen( ofname , "w" );
+	fprintf(ofp_mtx,"%%%%MatrixMarket matrix coordinate integer general\n");
+
+	HashTable * used_cell_barcodes_tab = ArrayListToLookupTable_Int( used_cell_barcodes );
+
+	ArrayList * output_gene_idxs = HashTableKeys(merged_tab_gene_to_cell_umis);
+	ArrayListSort(output_gene_idxs, NULL);
+	unsigned int nonzero_items = 0, nonozero_genes = 0;
+	HashTable * bc_no_to_output_no_tab = HashTableCreate( global_context -> scRNA_cell_barcodes_array->numOfElements/8 );
+	ArrayList * output_no_tab_to_bcno_arr = ArrayListCreate( global_context -> scRNA_cell_barcodes_array->numOfElements);
+	for(x1 = 0; x1 < output_gene_idxs -> numOfElements; x1++){
+		srInt_64 gene_index = ArrayListGet(output_gene_idxs, x1)-NULL-1;
+		ArrayList * bc_umi_p1_list = HashTableGet(merged_tab_gene_to_cell_umis, NULL+1+ gene_index );
+		assert(bc_umi_p1_list!=NULL);
+		assert(bc_umi_p1_list -> numOfElements >0);
+
+		int old_bc_no = -1, this_gene_has_bc = 0;
+		for(x2=0; x2 < bc_umi_p1_list -> numOfElements; x2++){
+			srInt_64 bc_umi = ArrayListGet(bc_umi_p1_list, x2) - NULL - 1;
+			int bc_no = (int)(bc_umi >>32);
+			void * this_bc_needed = HashTableGet(used_cell_barcodes_tab, NULL+1+bc_no); 
+			if(!this_bc_needed) continue;
+			int out_bc_no = HashTableGet( bc_no_to_output_no_tab , NULL+1+bc_no ) -NULL;
+			if( out_bc_no <1 ){
+				out_bc_no = bc_no_to_output_no_tab -> numOfElements +1;
+				HashTablePut(  bc_no_to_output_no_tab , NULL+1+bc_no, NULL + out_bc_no );
+				ArrayListPush(output_no_tab_to_bcno_arr, NULL+bc_no);
+				fprintf(ofp_bcs,"%s\n", (char*)ArrayListGet(global_context -> scRNA_cell_barcodes_array, bc_no));
+			}
+
+			this_gene_has_bc = 1;
+			if(old_bc_no != bc_no){
+				nonzero_items++;
+				old_bc_no = bc_no;
+			}
+		}
+		if(this_gene_has_bc){
+			nonozero_genes ++;
 		}
-		cell_items_row += xkk;
 	}
-	fprintf(ofp, "\n");
- 
-	HashTable * uniq_gene_nos_in_samples = HashTableCreate(10000);
-	for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
-		merged_tables_gene_to_cell_umis[x1]->appendix1 = global_context;
-		merged_tables_gene_to_cell_umis[x1]->appendix2 = uniq_gene_nos_in_samples;
-		HashTableIteration( merged_tables_gene_to_cell_umis[x1], scRNA_merge_write_copy_gene_nos);
+	fprintf(ofp_mtx, "%u %u %u\n", nonozero_genes, (unsigned int)bc_no_to_output_no_tab->numOfElements, nonzero_items );
+
+	nonozero_genes = 0;
+	for(x1 = 0; x1 < output_gene_idxs -> numOfElements; x1++){
+		srInt_64 gene_index = ArrayListGet(output_gene_idxs, x1)-NULL-1;
+		ArrayList * bc_umi_p1_list = HashTableGet(merged_tab_gene_to_cell_umis, NULL+1+ gene_index );
+		assert(bc_umi_p1_list!=NULL);
+
+		unsigned int mybc_UMIs = 0;
+		int old_bc_no = -1, old_out_bc_no = -1, this_gene_has_bc = 0;
+		for(x2=0; x2 < bc_umi_p1_list -> numOfElements; x2++){
+			srInt_64 bc_umi = ArrayListGet(bc_umi_p1_list, x2) - NULL - 1;
+			int bc_no = (int)(bc_umi >>32);
+			int out_bc_no = HashTableGet( bc_no_to_output_no_tab , NULL+1+bc_no ) -NULL;
+			void * this_bc_needed = HashTableGet(used_cell_barcodes_tab, NULL+1+bc_no); 
+			if(!this_bc_needed) continue;
+
+			this_gene_has_bc = 1;
+			if(old_bc_no == bc_no){
+				mybc_UMIs++;
+			}else{
+				if( old_out_bc_no >=0 )fprintf(ofp_mtx,"%d %d %d\n", nonozero_genes+1, old_out_bc_no, mybc_UMIs);
+				old_bc_no = bc_no;
+				old_out_bc_no = out_bc_no;
+				mybc_UMIs = 1;
+			}
+		}
+		if(mybc_UMIs>0) fprintf(ofp_mtx,"%d %d %d\n", nonozero_genes+1, old_out_bc_no, mybc_UMIs);
+
+		if(this_gene_has_bc){
+			if(global_context->is_gene_level){
+				char* gene_name = (char*)global_context -> gene_name_array [gene_index];
+				fprintf(ofp_genes,"%s\n", gene_name);
+			}else{
+				char exon_name[FEATURE_NAME_LENGTH+60];
+				build_exon_name(global_context, loaded_features, gene_index, exon_name);
+				fprintf(ofp_genes,"%s\n", exon_name);
+			}
+			//if(x1 < 7) SUBREADprintf("GOT A GENE SMP_%d: [%d]  %s\n", sample_index+1, x1, gene_name);
+		}
+		nonozero_genes += this_gene_has_bc;
 	}
+	HashTableDestroy(bc_no_to_output_no_tab);
+	HashTableDestroy(used_cell_barcodes_tab);
+	ArrayListDestroy(output_gene_idxs);
+	fclose(ofp_bcs);
+	fclose(ofp_genes);
+	fclose(ofp_mtx);
 
-	ArrayList * gen_no_list_raw = HashTableKeys(uniq_gene_nos_in_samples);
-	ArrayList * gen_no_list = ArrayListCreate(gen_no_list_raw -> numOfElements);
-	for(x1 = 0; x1<gen_no_list_raw->numOfElements;x1++){
-		srInt_64 gene_no = ArrayListGet(gen_no_list_raw, x1) - NULL - 1;
-		srInt_64 UMIs = HashTableGet( uniq_gene_nos_in_samples, NULL+ gene_no+1 )-NULL;
-		if(UMIs >= MIN_EXPRESSED_UMIS_PER_GENE) ArrayListPush( gen_no_list, NULL+1+gene_no ); 
+	return 0;
+}
+
+void scRNA_merged_45K_to_90K_sum_SUM(void * kyGeneID, void * Vcb_umi_arr, HashTable * me){
+	HashTable * gene_to_umis  = me -> appendix1;
+	HashTable * bcid_look_tab = me -> appendix2;
+	ArrayList * cb_umi_arr = Vcb_umi_arr;
+
+	srInt_64 x1, this_gene_added = HashTableGet(gene_to_umis, kyGeneID)-NULL;
+	int has_adding = 0;
+	for(x1 = 0; x1< cb_umi_arr -> numOfElements; x1++){
+		srInt_64 cb_umi = ArrayListGet(cb_umi_arr,x1) - NULL-1;
+		srInt_64 cbno = cb_umi >>32;
+		if(!HashTableGet( bcid_look_tab, NULL+1+cbno ))continue;
+		this_gene_added ++;
+		has_adding = 1;
+	}
+	if(has_adding)HashTablePut(gene_to_umis, kyGeneID, NULL+this_gene_added);
+}
+
+void scRNA_merged_45K_to_90K_sum_WRT(void * kyGeneID, void * valUMIs, HashTable * me){
+	fc_thread_global_context_t * global_context = me -> appendix1;
+	FILE * ofp = me -> appendix2;
+	fc_feature_info_t * loaded_features = me->appendix3;
+
+	if(global_context -> is_gene_level){
+		unsigned char * gene_name = global_context -> gene_name_array[ kyGeneID - NULL-1 ];
+		fprintf(ofp, "%s\t%u\n", gene_name, (unsigned int) (valUMIs-NULL));
+	}else{
+		char exon_name[FEATURE_NAME_LENGTH+60];
+		build_exon_name(global_context, loaded_features, kyGeneID-NULL-1, exon_name);
+		fprintf(ofp,"%s\t%u\n", exon_name, (unsigned int) (valUMIs-NULL));
 	}
-	ArrayListDestroy(gen_no_list_raw);
-	ArrayListSort(gen_no_list, NULL);
+}
 
-	char * linebuf = malloc(15*cell_items_row+MAX_GENE_NAME_LEN * 6);
-	for(xkk=0; xkk <  gen_no_list -> numOfElements; xkk ++){
-		int should_write = scRNA_merged_write_a_gene( global_context, merged_tables_gene_to_cell_umis, used_cell_barcode_tabs, used_cell_barcode_list, merged_umi_list, gen_no_list, xkk,  linebuf);
-		if(should_write)
-			fprintf(ofp,"%s\n", linebuf);
+void scRNA_merged_45K_to_90K_sum(fc_thread_global_context_t * global_context, HashTable * gene_to_cell_umis_tab, ArrayList * bcid_arr, int sample_no, fc_feature_info_t * loaded_features ){
+	HashTable * ret = HashTableCreate( gene_to_cell_umis_tab->numOfElements/6 );
+	HashTable * bcid_look_tab = ArrayListToLookupTable_Int(bcid_arr);
+	gene_to_cell_umis_tab -> appendix1 = ret;
+	gene_to_cell_umis_tab -> appendix2 = bcid_look_tab;
+	HashTableIteration( gene_to_cell_umis_tab, scRNA_merged_45K_to_90K_sum_SUM );
+
+	char ofname[MAX_FILE_NAME_LENGTH + 100];
+	sprintf(ofname,"%s.scRNA.%03d.AmbSum",global_context->input_file_name, sample_no+1);
+	FILE * write_fp = fopen(ofname,"w");
+	fprintf(write_fp,"GeneID\tUMIs\n");
+	ret -> appendix1 = global_context;
+	ret -> appendix2 = write_fp;
+	ret -> appendix3 = loaded_features;
+	ret -> counter1 = sample_no;
+	HashTableIteration( ret, scRNA_merged_45K_to_90K_sum_WRT );
+	HashTableDestroy(bcid_look_tab);
+	HashTableDestroy(ret);
+	fclose(write_fp);
+}
 
+void scRNA_merged_write_nozero_geneids_WRT(void *k, void *v, HashTable* me){
+	FILE * fp = me->appendix1;
+	fc_thread_global_context_t * global_context = me->appendix2;
+	fc_feature_info_t * loaded_features = me->appendix3;
+	if(global_context -> is_gene_level){
+		unsigned char* gene_symbol = global_context -> gene_name_array [k-NULL-1];
+		fprintf(fp, "%s\n", gene_symbol);
+	}else{
+		char exon_name[FEATURE_NAME_LENGTH+60];
+		build_exon_name(global_context, loaded_features, k-NULL-1, exon_name);
+		fprintf(fp,"%s\n", exon_name);
 	}
-	free(linebuf);
+}
 
-	for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++)
-		ArrayListDestroy(used_cell_barcode_list[x1]);
-	free(used_cell_barcode_list);	
+void scRNA_merged_write_nozero_geneids(  fc_thread_global_context_t * global_context, HashTable * gene_to_cell_umis_tab, int samplenno, fc_feature_info_t * loaded_features ){
+	char ofname[MAX_FILE_NAME_LENGTH + 100];
+	sprintf(ofname,"%s.scRNA.%03d.no0Genes",global_context->input_file_name, samplenno+1);
+	FILE * fp = fopen( ofname , "w" );
+	gene_to_cell_umis_tab -> appendix1 =fp;
+	gene_to_cell_umis_tab -> appendix2 =global_context;
+	gene_to_cell_umis_tab -> appendix3 =loaded_features;
+	HashTableIteration(gene_to_cell_umis_tab, scRNA_merged_write_nozero_geneids_WRT);
+	fclose(fp);
+}
 
-	HashTableDestroy(uniq_gene_nos_in_samples);
-	ArrayListDestroy(gen_no_list);
-	fclose(ofp);
+// this function writes a single count table.
+// Rows: genes
+// Cols: Cell_Barcode +"."+ SampleName
+void scRNA_merged_to_tables_write( fc_thread_global_context_t * global_context, HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList * merged_umi_list, fc_feature_info_t * loaded_features){
+	char ofname[MAX_FILE_NAME_LENGTH + 20];
+	sprintf(ofname,"%s.scRNA.SampleTable",global_context->input_file_name);
+	FILE * sample_tab_fp = fopen( ofname , "w" );
+	int x1;
+
+	fprintf(sample_tab_fp,"SampleName\tIndex\n");
+	for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
+		char * this_sample_name = ArrayListGet(global_context -> scRNA_sample_id_to_name, x1);
+		fprintf(sample_tab_fp,"%s\t%d\n", this_sample_name, 1+x1);
+		ArrayList * high_confid_barcode_index_list = ArrayListCreate(20000);
+		ArrayList * this_sample_ambient_rescure_candi = ArrayListCreate(10000);
+		ArrayList * this_sample_45k_90k_barcode_idx = ArrayListCreate(90000 - 45000 + 100);
+		scRNA_merged_bootstrap_a_sample(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1],merged_umi_list, high_confid_barcode_index_list);
+		scRNA_merged_ambient_rescure(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], this_sample_ambient_rescure_candi, this_sample_45k_90k_barcode_idx, high_confid_barcode_index_list);
+
+		//SUBREADprintf("HAVING_HIGHCONF_TOTAL %lld\n", high_confid_barcode_index_list -> numOfElements);
+		scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], high_confid_barcode_index_list, x1, "HighConf",  loaded_features);
+		scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], this_sample_ambient_rescure_candi, x1, "RescCand",  loaded_features);
+		scRNA_merged_45K_to_90K_sum( global_context, merged_tables_gene_to_cell_umis[x1], this_sample_45k_90k_barcode_idx, x1 , loaded_features);
+		scRNA_merged_write_nozero_geneids( global_context, merged_tables_gene_to_cell_umis[x1], x1, loaded_features );
+
+		ArrayListDestroy(this_sample_ambient_rescure_candi);
+		ArrayListDestroy(this_sample_45k_90k_barcode_idx);
+		ArrayListDestroy(high_confid_barcode_index_list);
+	}
+
+	fclose(sample_tab_fp);
 }
 
 void  scRNA_merge_merge_UMIs(void * gno_ky, void * cell_umi_2_reads_list_va , HashTable * tab){
@@ -4389,7 +4627,7 @@ void  scRNA_merge_merge_UMIs(void * gno_ky, void * cell_umi_2_reads_list_va , Ha
 
 
 // return the number of RG result sets
-int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_count_type_t * nreads , srInt_64 *nreads_mapped_to_exon, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * RGmerged_table)
+int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_count_type_t * nreads , srInt_64 *nreads_mapped_to_exon, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * RGmerged_table, fc_feature_info_t * loaded_features)
 {
 	int xk1, xk2, ret = 0;
 
@@ -4442,7 +4680,7 @@ int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_co
 			HashTableIteration(  merged_sample_cell_umi_tables[xk1], scRNA_merge_merge_UMIs);
 		}
 
-		scRNA_merged_to_tables_write(global_context , merged_sample_cell_umi_tables , used_cell_no_tables, merged_umi_list);
+		scRNA_merged_to_tables_write(global_context , merged_sample_cell_umi_tables , used_cell_no_tables, merged_umi_list, loaded_features);
 
 	//	SUBREADprintf("MERGED UMI TABLE = %ld items\n", merged_umi_table -> numOfElements);
 		ArrayListDestroy(merged_umi_list);
@@ -5629,9 +5867,11 @@ void print_usage()
 	SUBREADputs("                      'SAF'. 'GTF' by default.  For SAF format, please refer to");
 	SUBREADputs("                      Users Guide.");
 	SUBREADputs("");
-	SUBREADputs("  -t <string>         Specify feature type in GTF annotation. 'exon' by ");
-	SUBREADputs("                      default. Features used for read counting will be ");
-	SUBREADputs("                      extracted from annotation using the provided value.");
+	SUBREADputs("  -t <string>         Specify feature type(s) in a GTF annotation. If multiple");
+	SUBREADputs("                      types are provided, they should be separated by ',' with");
+	SUBREADputs("                      no space in between. 'exon' by default. Rows in the");
+	SUBREADputs("                      annotation with a matched feature will be extracted and");
+	SUBREADputs("                      used for read mapping. ");
 	SUBREADputs("");
 	SUBREADputs("  -g <string>         Specify attribute type in GTF annotation. 'gene_id' by ");
 	SUBREADputs("                      default. Meta-features used for read counting will be ");
@@ -6206,7 +6446,7 @@ HashTable * scRNA_copy_loaded_features(srInt_64 nexons, fc_feature_info_t* loade
 	return ret;
 }
 
-int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons,  int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junc_glob_tab, HashTable * splicing_glob_tab, HashTable * merged_RG_table);
+int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons,  int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junc_glob_tab, HashTable * splicing_glob_tab, HashTable * merged_RG_table, fc_feature_info_t * loaded_features);
 
 int Input_Files_And_Strand_Mode_Pair(char * fnames, char * smodes){
 	int ret = 0, ch, bad_fmt = 0, numbs = 0;
@@ -6737,7 +6977,7 @@ int readSummary(int argc,char *argv[]){
 		global_context.is_read_details_out = isReadSummaryReport;
 		global_context.max_M = max_M;
 
-		ret_int = ret_int || readSummary_single_file(& global_context, column_numbers, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start, block_max_end, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table);
+		ret_int = ret_int || readSummary_single_file(& global_context, column_numbers, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start, block_max_end, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features);
 		if(global_context.disk_is_full){
 			SUBREADprintf("ERROR: disk is full. Please check the free space in the output directory.\n");
 		}
@@ -6806,6 +7046,7 @@ int readSummary(int argc,char *argv[]){
 	if(global_context.is_input_bad_format){
 		SUBREADprintf("\nFATAL Error: The program has to terminate and no counting file is generated.\n\n");
 	}else if(!global_context.disk_is_full){
+		print_in_box(80,0,0,"Write the final count table.");
 		if(isGeneLevel){
 			char ** sorted_extra_columns = NULL;
 			if(global_context.reported_extra_columns != NULL){
@@ -6823,11 +7064,15 @@ int readSummary(int argc,char *argv[]){
 		} else
 			fc_write_final_results(&global_context, argv[3], nexons, table_columns, table_column_names, loaded_features, isCVersion);
 	}
-	if(global_context.do_junction_counting && !global_context.disk_is_full)
+	if(global_context.do_junction_counting && global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
+		print_in_box(80,0,0,"Write the junction count table.");
 		fc_write_final_junctions(&global_context, argv[3], table_column_names, junction_global_table_list, splicing_global_table_list);
+	}
 
-	if(!global_context.disk_is_full)
+	if(global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
+		print_in_box(80,0,0,"Write the read assignment summary.");
 		fc_write_final_counts(&global_context, argv[3], table_column_names,  read_counters, isCVersion);
+	}
 
 	ArrayListDestroy(table_columns);
 	ArrayListDestroy(table_column_names);
@@ -6962,7 +7207,7 @@ void sort_bucket_table(fc_thread_global_context_t * global_context){
 
 
 
-int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons,  int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * merged_RG_table)
+int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons,  int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * merged_RG_table, fc_feature_info_t * loaded_features)
 {
 	int read_length = 0;
 	int is_first_read_PE=0;
@@ -7000,7 +7245,7 @@ int readSummary_single_file(fc_thread_global_context_t * global_context, read_co
 	fc_thread_wait_threads(global_context);
 
 	srInt_64 nreads_mapped_to_exon = 0;
-	fc_thread_merge_results(global_context, column_numbers , &nreads_mapped_to_exon, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table);
+	fc_thread_merge_results(global_context, column_numbers , &nreads_mapped_to_exon, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features);
 	fc_thread_destroy_thread_context(global_context);
 
 	if(global_context -> sambam_chro_table) free(global_context -> sambam_chro_table);
@@ -7031,7 +7276,7 @@ int feature_count_main(int argc, char ** argv)
 	int cmd_rebuilt_size = 2000;
 	char * cmd_rebuilt = malloc(cmd_rebuilt_size);
 	char max_M_str[8];
-	char nameFeatureTypeColumn[66];
+	char nameFeatureTypeColumn[2000];
 	char nameGeneIDColumn[66];
 	int min_qual_score = 0;
 	int min_dist = 50;


=====================================
src/sambam-file.c
=====================================
@@ -557,7 +557,8 @@ int PBam_chunk_headers(char * chunk, int *chunk_ptr, int chunk_len, SamBam_Refer
 					}
 
 					SamBam_Reference_Info * new_event = (*bam_chro_table) + (* table_items);
-					strncpy(new_event->chro_name, chro_name, BAM_MAX_CHROMOSOME_NAME_LEN);
+					if(strlen(chro_name)>=BAM_MAX_CHROMOSOME_NAME_LEN) chro_name[BAM_MAX_CHROMOSOME_NAME_LEN-1]=0;
+					strcpy(new_event->chro_name, chro_name);
 					new_event -> chro_length = chro_len;
 
 					(* table_items)++;
@@ -1782,7 +1783,7 @@ int SamBam_unzip(char * out , char * in , int inlen)
 }
 
 
-int SamBam_writer_sort_buff_one_compare(void * Lbin, void * Rbin){
+int SamBam_writer_sort_buff_one_compare(void * Lbin, void * Rbin, ArrayList * me){
 	unsigned long long * Lv = Lbin, *Rv = Rbin;
 	if(*Lv > *Rv) return 1;
 	else if(*Lv < *Rv) return -1;
@@ -2052,7 +2053,7 @@ void SamBam_writer_one_thread_merge_sortedbins(SamBam_Writer * writer){
 
 int level_min_binno[] = {0, 1, 9, 73, 585, 4681};
 
-int SamBam_writer_merge_chunks_compare(void * vL, void * vR){
+int SamBam_writer_merge_chunks_compare(void * vL, void * vR, ArrayList * me){
 	long long * lL = vL;
 	long long * lR = vR;
 	if( (*lL) > (*lR)) return 1;


=====================================
src/sorted-hashtable.c
=====================================
@@ -125,9 +125,9 @@ int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_ta
 	if (current_bucket->space_size<1)
 	{
 		if(is_small_table)
-			new_bucket_length = (int)(GEHASH_BUCKET_LENGTH*1.5);
+			new_bucket_length = 5 ;
 		else
-			new_bucket_length = (int)(GEHASH_BUCKET_LENGTH);
+			new_bucket_length = 2 ;
 
 		if(the_table->version_number == SUBINDEX_VER0)
 			new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
@@ -167,7 +167,7 @@ int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_ta
 			//new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
 			new_bucket_length = (int)(current_bucket->space_size*5);
 		else
-			new_bucket_length = (int)(current_bucket->space_size*1.5);
+			new_bucket_length = max((int)(current_bucket->space_size*1.5), (int)(current_bucket->space_size+4));	
 
 		if(the_table->version_number == SUBINDEX_VER0)
 			current_bucket->item_keys = (gehash_key_t *) realloc(current_bucket->item_keys ,  sizeof(gehash_key_t) * new_bucket_length);
@@ -188,200 +188,6 @@ int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_ta
 	return 0;
 }
 
-int _gehash_resize_bucket_old(gehash_t * the_table , int bucket_no, char is_small_table)
-{
-	int new_bucket_length;
-	struct gehash_bucket * current_bucket = &(the_table -> buckets [bucket_no]);
-	gehash_key_t * new_item_keys = NULL;
-	short * new_new_item_keys = NULL;
-	gehash_data_t * new_item_values;
-
-	if (current_bucket->space_size<1)
-	{
-		int randv = myrand_rand();
-		if(is_small_table)
-			//new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
-			new_bucket_length = (int)(GEHASH_BUCKET_LENGTH*(1.5+3.*pow(randv * 1./RAND_MAX,30)));
-		else
-			new_bucket_length = (int)(GEHASH_BUCKET_LENGTH*(1. +3.*pow(randv * 1./RAND_MAX,30)));
-
-
-	//	printf("NS1=%d\nNS2=%d\n", new_bucket_length, new_bucket_length);
-		if(the_table->version_number == SUBINDEX_VER0)
-		{
-			new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
-		}
-		else
-		{
-			new_new_item_keys = (short *) malloc(sizeof(short)*new_bucket_length);
-		}
-
-		new_item_values = (gehash_data_t *) malloc(sizeof(gehash_data_t) * new_bucket_length);
-
-		if(!((new_item_keys|| new_new_item_keys) && new_item_values))
-		{
-			SUBREADputs(MESSAGE_OUT_OF_MEMORY);
-			return 1;
-		}
-
-		if(the_table->version_number == SUBINDEX_VER0)
-			memset(new_item_keys, 0, sizeof(gehash_key_t) * new_bucket_length);
-		else
-			memset(new_new_item_keys, 0, sizeof(short) * new_bucket_length);
-
-		memset(new_item_values, 0, sizeof(gehash_data_t) * new_bucket_length);
-
-		if(the_table->version_number == SUBINDEX_VER0)
-			current_bucket->item_keys = new_item_keys;
-		else
-			current_bucket->new_item_keys = new_new_item_keys;
-
-		current_bucket->item_values = new_item_values;
-		current_bucket->space_size = new_bucket_length;
-
-	}
-	else
-	{
-		int test_start = myrand_rand() % the_table-> buckets_number;
-		int i;
-		int need_allowc = 1;
-		for (i =test_start; i<test_start+10000; i++)
-		{
-			if (i==bucket_no)continue;
-			if (i>=the_table-> buckets_number)
-			{
-				test_start = myrand_rand() % the_table-> buckets_number;
-				i = test_start;
-				continue;
-			} 
-
-			if(the_table-> buckets[i].space_size > current_bucket->current_items+1 && current_bucket->space_size > the_table-> buckets[i].current_items+1 && current_bucket->current_items > the_table-> buckets[i].current_items)
-			{
-				int j;
-				gehash_data_t t_d;
-
-				for (j=0; j<current_bucket->current_items; j++)
-				{
-					if (j< the_table -> buckets[i].current_items)
-					{
-						if(the_table->version_number == SUBINDEX_VER0)
-						{
-							gehash_key_t t_key;
-							t_key = current_bucket->item_keys[j];
-							current_bucket->item_keys[j] = the_table-> buckets[i].item_keys[j];
-							the_table-> buckets[i].item_keys[j] = t_key;
-						}
-						else
-						{
-
-							short t_key;
-							t_key = current_bucket->new_item_keys[j];
-							current_bucket->new_item_keys[j] = the_table-> buckets[i].new_item_keys[j];
-							the_table-> buckets[i].new_item_keys[j] = t_key;
-						}
-
-
-						t_d = current_bucket->item_values[j];
-						current_bucket->item_values[j] = the_table-> buckets[i].item_values[j];
-						the_table-> buckets[i].item_values[j] = t_d;
-					}
-					else
-					{
-						if(the_table->version_number == SUBINDEX_VER0)
-							the_table-> buckets[i].item_keys[j] = current_bucket->item_keys[j];
-						else
-							the_table-> buckets[i].new_item_keys[j] = current_bucket->new_item_keys[j];
-						the_table-> buckets[i].item_values[j] = current_bucket->item_values[j];
-					}
-				}
-
-				gehash_key_t * p_key;
-				short * ps_key;
-				gehash_data_t * p_d;
-				int i_size;
-
-				if(the_table->version_number == SUBINDEX_VER0)
-				{
-					p_key = the_table-> buckets[i].item_keys ;
-					the_table-> buckets[i].item_keys = current_bucket->item_keys;
-					current_bucket->item_keys = p_key;
-				}else{
-					ps_key = the_table-> buckets[i].new_item_keys ;
-					the_table-> buckets[i].new_item_keys = current_bucket->new_item_keys;
-					current_bucket->new_item_keys = ps_key;
-				}
-
-				p_d = the_table-> buckets[i].item_values ;
-				the_table-> buckets[i].item_values = current_bucket->item_values;
-				current_bucket->item_values = p_d;
-
-				i_size = the_table-> buckets[i].space_size;
-				the_table-> buckets[i].space_size =current_bucket->space_size;
-				current_bucket->space_size=i_size;
-				
-				need_allowc = 0;
-				break;
-			}
-		} 
-		if(need_allowc)
-		{
-			if(is_small_table)
-				//new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
-				new_bucket_length = (int)(current_bucket->space_size*5);
-			else
-				new_bucket_length = (int)(current_bucket->space_size*1.5);
-
-	//		printf("NSS1=%d\nNSS2=%d\n", new_bucket_length, new_bucket_length);
-			if(the_table->version_number == SUBINDEX_VER0)
-				new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
-			else
-				new_new_item_keys = (short *) malloc(sizeof(short) * new_bucket_length);
-
-			new_item_values = (gehash_data_t *) malloc(sizeof(gehash_data_t) * new_bucket_length);
-
-
-			if(!((new_item_keys||new_new_item_keys) && new_item_values))
-			{
-				SUBREADputs(MESSAGE_OUT_OF_MEMORY);
-				return 1;
-			}
-
-			if(the_table->version_number == SUBINDEX_VER0)
-				memset(new_item_keys, 0, sizeof(gehash_key_t) * new_bucket_length);
-			else
-				memset(new_new_item_keys, 0, sizeof(short) * new_bucket_length);
-
-			memset(new_item_values, 0, sizeof(gehash_data_t) * new_bucket_length);
-
-			if(the_table->version_number == SUBINDEX_VER0)
-				memcpy(new_item_keys, current_bucket->item_keys, current_bucket->current_items*sizeof(gehash_key_t));
-			else
-				memcpy(new_new_item_keys, current_bucket->item_keys, current_bucket->current_items*sizeof(short));
-
-			memcpy(new_item_values, current_bucket->item_values, current_bucket->current_items*sizeof(gehash_data_t));
-
-			if(the_table->version_number == SUBINDEX_VER0)
-				free(current_bucket->item_keys);
-			else
-				free(current_bucket->new_item_keys);
-
-			free(current_bucket->item_values);
-
-			if(the_table->version_number == SUBINDEX_VER0)
-				current_bucket->item_keys = new_item_keys;
-			else
-				current_bucket->new_item_keys = new_new_item_keys;
-
-			current_bucket->item_values = new_item_values;
-			current_bucket->space_size = new_bucket_length;
-
-
-		}
-	}
-	return 0;
-
-}
-
 #define _gehash_get_bucket(tab, key)  ( (tab) -> buckets + _gehash_hash( key ) % (tab) -> buckets_number )
 #define _gehash_get_bucketNO(tab, key)  ( _gehash_hash( key ) % (tab) -> buckets_number )
 
@@ -419,13 +225,59 @@ int gehash_insert(gehash_t * the_table, gehash_key_t key, gehash_data_t data, un
 	{
 		short step_number = key/the_table -> buckets_number;
 		if(current_bucket-> new_item_keys == NULL && bucket_sizes){
-			unsigned int bkno = _gehash_get_bucketNO(the_table, key) ;
-			unsigned char * mem_block = malloc((sizeof(gehash_data_t) + sizeof(short)) * bucket_sizes[ bkno ] ); 
-			current_bucket->item_values = ( gehash_data_t * )mem_block;
-			current_bucket->new_item_keys = ( short*)(mem_block + sizeof(gehash_data_t) *  bucket_sizes[ bkno ]) ;
-			current_bucket->current_items = 0;
-			current_bucket->space_size = bucket_sizes[ bkno ];
-			the_table -> free_item_only = 1;
+			//this will go only once when all the bucktes are NULL.
+
+			unsigned int * memory_sizes = malloc(sizeof(int) * the_table -> buckets_number);
+			memset(memory_sizes, 0xff, sizeof(int) * the_table -> buckets_number);
+
+			unsigned int grp_bytes = 0, grps = 0, this_grp_bucks = 0, xk1, bucks_per_grp = the_table -> buckets_number / GEHASH_MEM_PTR_NO +2;
+			for(xk1=0; xk1<  the_table -> buckets_number ; xk1++){
+				unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
+				grp_bytes += buck_size;
+				this_grp_bucks ++;
+				if(  this_grp_bucks >= bucks_per_grp ){
+					memory_sizes[grps++] = grp_bytes;
+					grp_bytes=0;
+					this_grp_bucks=0;
+				}
+			}
+			if(this_grp_bucks)
+				memory_sizes[grps] = grp_bytes;
+
+			for(xk1=0; xk1 < GEHASH_MEM_PTR_NO; xk1++){
+				unsigned int current_bytes = memory_sizes[xk1];
+				if(current_bytes<0xff000000u){
+					if(the_table -> malloc_ptr[xk1]!=NULL){
+						SUBREADprintf("ERROR : no-NULL ptr : %p\n",the_table -> malloc_ptr[xk1]);
+					}
+					the_table -> malloc_ptr[xk1] = malloc(current_bytes);
+					if(!the_table -> malloc_ptr[xk1]){
+						SUBREADputs(MESSAGE_OUT_OF_MEMORY);
+						return 1;
+					}
+				}
+			}
+
+			grp_bytes = 0; grps = 0; this_grp_bucks = 0;
+			for(xk1=0; xk1<  the_table -> buckets_number ; xk1++){
+				struct gehash_bucket * mem_bucket = the_table -> buckets+xk1;
+				memset(mem_bucket, 0, sizeof(struct gehash_bucket));
+				mem_bucket -> new_item_keys = (short*)((char*)the_table -> malloc_ptr[grps] + grp_bytes);
+				mem_bucket -> item_values = (gehash_data_t*)((char *)mem_bucket -> new_item_keys + sizeof(short) * bucket_sizes[xk1]);
+				mem_bucket -> space_size = bucket_sizes[xk1];
+
+				unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
+				grp_bytes += buck_size;
+				this_grp_bucks ++;
+				if(  this_grp_bucks >= bucks_per_grp ){
+					grps++;
+					grp_bytes=0;
+					this_grp_bucks=0;
+				}
+			}
+
+			the_table -> free_item_only = 2;
+			free(memory_sizes);
 		}
 
 		if (current_bucket->current_items >= current_bucket->space_size) {
@@ -1548,7 +1400,7 @@ int gehash_load(gehash_t * the_table, const char fname [])
 		//return -1;
 	}
 	
-	the_table -> malloc_ptr = NULL;
+	memset(the_table -> malloc_ptr ,0, sizeof(void*) * GEHASH_MEM_PTR_NO);
 	the_table -> index_gap = 0;
 
 	FILE * fp = f_subr_open(fname, "rb");
@@ -1622,69 +1474,82 @@ int gehash_load(gehash_t * the_table, const char fname [])
 			SUBREADputs("ERROR: the index format is unrecognizable.");
 			assert(the_table -> current_items >0 && the_table -> current_items <=0xffffffffllu);
 		}
+
+		unsigned int * bucket_bytes = malloc(sizeof(int) * GEHASH_MEM_PTR_NO);
+		memset(bucket_bytes, 0xff, sizeof(int) * GEHASH_MEM_PTR_NO);
 		the_table -> buckets_number = load_int32(fp);
 		the_table -> buckets = (struct gehash_bucket * )malloc(sizeof(struct gehash_bucket) * the_table -> buckets_number);
 		if(!the_table -> buckets)
 		{
+			SUBREADputs("Creating buckets.");
 			SUBREADputs(MESSAGE_OUT_OF_MEMORY);
 			return 1;
 		}
 
-		size_t fp_curr = ftello(fp), fp_end=fp_curr;
-		 
-		fseeko(fp, 0, SEEK_END);
-		fp_end = ftello(fp); 	
-		fseeko(fp, fp_curr, SEEK_SET);
-//		SUBREADprintf("CACL_SIZE %zd %zd\n", fp_end, fp_curr);
-		
-		size_t rem_len = fp_end - fp_curr;
-		the_table -> malloc_ptr = malloc(rem_len);
-		if(!the_table -> malloc_ptr){
-			SUBREADputs(MESSAGE_OUT_OF_MEMORY);
-			return 1;
-		}
-
-		size_t rdbytes = 0;
-		while(!feof(fp) && rdbytes < rem_len){
-			size_t rrr = fread(the_table -> malloc_ptr + rdbytes, 1, rem_len - rdbytes, fp);
-			//SUBREADprintf("LOAD: %zd + %zd ==> %zd\n", rdbytes , rrr, rem_len);
-			rdbytes += rrr;
-		}
-		fclose(fp);
-		if(rdbytes != rem_len){
-			SUBREADprintf("Error: the index cannot be fully loaded (%ld != %ld). It may contain format errors or file '%s' may be truncated.\n", (long) fp_end, (long) fp_curr, fname);
-			return -1;
-		}
-		if(rdbytes<1) {
-			SUBREADprintf("Error: the index seem to contain no data at all. It may contain format errors or file '%s' may be truncated.\n", fname);
-			return -1;
+		srInt_64 fp_curr = ftello(fp);
+		unsigned int accued_bytes = 0, grp_i = 0, curr_bucks = 0, per_group_bucks = the_table -> buckets_number/ GEHASH_MEM_PTR_NO + 2;
+		for(i=0; i<the_table -> buckets_number ; i++){
+			unsigned int current_items = load_int32(fp);
+			load_int32(fp);//useless for loading : space size
+			unsigned int current_bytes = current_items*( sizeof(short) + sizeof(gehash_data_t) );
+			accued_bytes += current_bytes; 
+			curr_bucks ++;
+			if(curr_bucks >= per_group_bucks){
+				//SUBREADprintf("Allocating %d : %d buckets : %u\n", grp_i, i, accued_bytes);
+				bucket_bytes[grp_i++] = accued_bytes;
+				accued_bytes = 0;
+				curr_bucks = 0;
+			}
+			fseeko(fp, current_bytes, SEEK_CUR);
 		}
-		char * curr_ptr = the_table -> malloc_ptr;
-		for (i=0; i<the_table -> buckets_number; i++)
-		{
-			struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
-			memcpy(&current_bucket -> current_items, curr_ptr, 4);
-			curr_ptr+=4;
-
-			memcpy(&current_bucket -> space_size, curr_ptr, 4);
-			curr_ptr+=4;
-
-			current_bucket -> new_item_keys = (short*)curr_ptr;
-			curr_ptr += sizeof(short)*current_bucket -> current_items;
-			current_bucket -> item_values = (gehash_data_t*)curr_ptr;
-			curr_ptr += sizeof(gehash_data_t)*current_bucket -> current_items;
-			if( curr_ptr > the_table -> malloc_ptr + rdbytes){
-				SUBREADprintf("Error: the index cannot be fully loaded : %p > %p + %ld. It may contain format errors or file '%s' may be truncated.\n",curr_ptr,  the_table -> malloc_ptr, rdbytes , fname);
-				return -1;
+		if(curr_bucks)
+			bucket_bytes[grp_i++] = accued_bytes;
+		fseeko(fp, (off_t)fp_curr, SEEK_SET);
+
+		for(i=0; i<GEHASH_MEM_PTR_NO ; i++){
+			unsigned int current_bytes = bucket_bytes[i];
+			if(current_bytes<0xff000000u){
+				the_table -> malloc_ptr[i] = malloc(current_bytes);
+				//SUBREADprintf("Allocating %d buckets : %u\n", i, current_bytes);
+				if(!the_table -> malloc_ptr[i]){
+					SUBREADputs(MESSAGE_OUT_OF_MEMORY);
+					return 1;
+				}
 			}
 		}
-		the_table -> is_small_table = *curr_ptr;
-		curr_ptr ++;
-		if( curr_ptr != the_table -> malloc_ptr + rdbytes ){
-			SUBREADprintf("Error: the index cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
-			return -1;
+
+		grp_i = 0;
+		curr_bucks = 0;
+		accued_bytes = 0;
+		for(i=0; i<the_table -> buckets_number ; i++){
+			struct gehash_bucket * current_bucket = the_table -> buckets+i;
+			current_bucket -> current_items = load_int32(fp);
+			load_int32(fp);//useless for lo: space size
+			current_bucket -> space_size =  current_bucket -> current_items;
+			unsigned int current_bytes = current_bucket -> current_items*( sizeof(short) + sizeof(gehash_data_t) );
+
+			current_bucket -> new_item_keys = (short*)(the_table -> malloc_ptr[grp_i] + accued_bytes);
+			current_bucket -> item_values = (gehash_data_t*)(the_table -> malloc_ptr[grp_i] + accued_bytes + current_bucket -> current_items * sizeof(short));
+			read_length = fread(current_bucket -> new_item_keys, sizeof(short), current_bucket -> current_items, fp);
+			read_length += fread(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
+			if(read_length < 2* current_bucket -> current_items){
+				SUBREADprintf("ERROR: the index is incomplete.\n");
+				return 1;
+			}
+			
+			accued_bytes += current_bytes;
+			curr_bucks ++;
+			if(curr_bucks >= per_group_bucks){
+				curr_bucks=0;
+				accued_bytes=0;
+				grp_i++;
+			}
 		}
-		assert( curr_ptr == the_table -> malloc_ptr + rdbytes );
+
+		int rval = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
+		if (rval != 1)SUBREADprintf("ERROR: cannot find the table table.\n");
+		free(bucket_bytes);
+		fclose(fp);
 		return 0;
 
 	}
@@ -1736,7 +1601,10 @@ int gehash_load(gehash_t * the_table, const char fname [])
 		}
 
 		read_length = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
-		assert(read_length>0);
+		if(read_length!=1){
+			SUBREADprintf("ERROR: the index is incomplete.\n");
+			return 1;
+		}
 		fclose(fp);
 		return 0;
 	}
@@ -2027,10 +1895,14 @@ int gehash_dump(gehash_t * the_table, const char fname [])
 
 void gehash_destory(gehash_t * the_table)
 {
-	int i;
+	int i, is_ptr = 0;
+
+	for(i = 0; i < GEHASH_MEM_PTR_NO; i++) if(the_table -> malloc_ptr[i]){
+		free(the_table -> malloc_ptr[i]);
+		is_ptr=1;
+	}
 
-	if(the_table -> malloc_ptr) free(the_table -> malloc_ptr);
-	else for (i=0; i<the_table -> buckets_number; i++)
+	if(!is_ptr) for (i=0; i<the_table -> buckets_number; i++)
 	{
 		struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
 		if (current_bucket -> space_size > 0)
@@ -2039,6 +1911,7 @@ void gehash_destory(gehash_t * the_table)
 			free (current_bucket -> item_values);
 		}
 	}
+
 	free (the_table -> buckets);
 
 	the_table -> current_items = 0;


=====================================
src/subread.h
=====================================
@@ -270,6 +270,12 @@ struct gehash_bucket {
 	gehash_data_t * item_values;
 };
 
+
+#ifdef __MINGW32__
+#define GEHASH_MEM_PTR_NO (64) 
+#else
+#define GEHASH_MEM_PTR_NO (64*1024) 
+#endif
 typedef struct {
 	int version_number;
 	unsigned long long int current_items;
@@ -278,7 +284,7 @@ typedef struct {
 	struct gehash_bucket * buckets;
 	int index_gap;
 	int padding;
-	char * malloc_ptr;
+	char * malloc_ptr [GEHASH_MEM_PTR_NO];
 	int free_item_only;
 } gehash_t;
 


=====================================
src/tx-unique.c
=====================================
@@ -85,7 +85,7 @@ int txunique_load_annotation(txunique_context_t * context){
 	return 0;
 }
 
-int txunique_process_flat_comp( void * ex1p, void * ex2p ){
+int txunique_process_flat_comp( void * ex1p, void * ex2p, ArrayList *me ){
 	txunique_exon_t * ex1 = ex1p;
 	txunique_exon_t * ex2 = ex2p;
 
@@ -150,7 +150,7 @@ void debug_print_edges(ArrayList * exs){
 	}
 }
 
-int txunique_process_gene_edge_comp(void * e1p, void * e2p){
+int txunique_process_gene_edge_comp(void * e1p, void * e2p, ArrayList *me){
 	struct _txunique_tmp_edges * e1 = e1p;
 	struct _txunique_tmp_edges * e2 = e2p;
 	if(e1 -> base_open_end < e2 -> base_open_end) return -1;


=====================================
test/subread-align/data/test-err-mut-r1.fq.gz deleted
=====================================
Binary files a/test/subread-align/data/test-err-mut-r1.fq.gz and /dev/null differ


=====================================
test/subread-align/data/test-err-mut-r2.fq.gz deleted
=====================================
Binary files a/test/subread-align/data/test-err-mut-r2.fq.gz and /dev/null differ



View it on GitLab: https://salsa.debian.org/med-team/subread/-/commit/581d5ef26ce8e53c4a026feb096ee98e2510aec4

-- 
View it on GitLab: https://salsa.debian.org/med-team/subread/-/commit/581d5ef26ce8e53c4a026feb096ee98e2510aec4
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200522/f6808858/attachment-0001.html>


More information about the debian-med-commit mailing list