[med-svn] [Git][med-team/subread][upstream] New upstream version 2.0.1+dfsg
Steffen Möller
gitlab at salsa.debian.org
Fri May 22 18:59:40 BST 2020
Steffen Möller pushed to branch upstream at Debian Med / subread
Commits:
581d5ef2 by Steffen Moeller at 2020-05-22T19:30:00+02:00
New upstream version 2.0.1+dfsg
- - - - -
22 changed files:
- doc/SubreadUsersGuide.tex
- src/HelperFunctions.c
- src/core-interface-aligner.c
- src/core-junction.c
- src/core.c
- src/core.h
- src/detection-calls.c
- src/flattenAnnotations.c
- src/gene-algorithms.c
- src/hashtable.c
- src/hashtable.h
- src/index-builder.c
- src/input-files.c
- src/input-files.h
- src/makefile.version
- src/readSummary.c
- src/sambam-file.c
- src/sorted-hashtable.c
- src/subread.h
- src/tx-unique.c
- − test/subread-align/data/test-err-mut-r1.fq.gz
- − test/subread-align/data/test-err-mut-r2.fq.gz
Changes:
=====================================
doc/SubreadUsersGuide.tex
=====================================
@@ -36,19 +36,19 @@
\begin{center}
{\Huge\bf Rsubread/Subread Users Guide}\\
\vspace{1 cm}
-{\centering\large Rsubread v2.0.0/Subread v2.0.0\\}
+{\centering\large Subread v2.0.1/Rsubread v2.2.0\\}
\vspace{1 cm}
-\centering 2 September 2019\\
+\centering 13 May 2020\\
\vspace{5 cm}
\Large Wei Shi and Yang Liao\\
\vspace{1 cm}
\small
-{\large Bioinformatics Division\\
-The Walter and Eliza Hall Institute of Medical Research\\
-The University of Melbourne\\
-Melbourne, Australia\\}
+{\large Olivia Newton-John Cancer Research Institute\\
+Walter and Eliza Hall Institute of Medical Research\\
+University of Melbourne\\
+Australia\\}
\vspace{7 cm}
-\centering Copyright \small{\copyright} 2011 - 2019\\
+\centering Copyright \small{\copyright} 2011 - 2020\\
\end{center}
\end{titlepage}
@@ -1027,7 +1027,7 @@ read mapping that produced the provided SAM/BAM files. This optional argument ca
\hline
-s $<int or string>$ \newline (\code{isStrandSpecific}) & Indicate if strand-specific read counting should be performed. A single integer value (applied to all input files) or a string of comma-separated values (applied to each corresponding input file) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). For paired-end reads, strand of the first read is taken as the strand of the whole fragment. FLAG field is used to tell if a read is first or second read in a pair. Value of \code{isStrandSpecific} parameter in {\Rsubread} {\featureCounts} is a vector which has a length of either {\code{1}}, or the same with the total number of input files provided. \\
\hline
--t $<string>$ \newline (\code{GTF.featureType}) & Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default.\\
+-t $<string>$ \newline (\code{GTF.featureType}) & Specify the feature type(s). If more than one feature type is provided, they should be separated by `,' (no space). Only rows which have a matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default.\\
\hline
-T $<int>$ \newline (\code{nthreads}) & Number of the threads. The value should be between 1 and 32. 1 by default.\\
\hline
@@ -1190,6 +1190,17 @@ featureCounts(files="mapping_results_PE.bam",isPairedEnd=TRUE,requireBothEndsMap
featureCounts(files="mapping_results_PE.bam",isPairedEnd=TRUE,countChimericFragments=FALSE)
\end{Rcode}
+\chapter{Mapping and quantification of single-cell data}
+
+\section{cellCounts}
+
+The \emph{cellCounts} program was developed to process single-cell RNA-seq (scRNA-seq) data to generate UMI counts for genes, exons or other genomic features.
+It calls \emph{align} function to perform mapping of scRNA-seq reads and calls \emph{featureCounts} functions to generate UMI counts.
+\emph{cellCounts} demultiplexes cell barcodes and also deduplicates reads.
+It accepts BCL or FASTQ files as input.
+Quantification of a scRNA-seq dataset can be simply done by a single \emph{cellCounts} function call.
+It is much faster than other scRNA-seq quantification tools.
+
\chapter{SNP calling}
\section{Algorithm}
=====================================
src/HelperFunctions.c
=====================================
@@ -1235,10 +1235,12 @@ HashTable * load_alias_table(char * fname) {
if((!sam_chr)||(!anno_chr)) continue;
sam_chr[strlen(sam_chr)-1]=0;
+ if(sam_chr[strlen(sam_chr)-1]=='\r') sam_chr[strlen(sam_chr)-1]=0;
char * anno_chr_buf = malloc(strlen(anno_chr)+1);
strcpy(anno_chr_buf, anno_chr);
char * sam_chr_buf = malloc(strlen(sam_chr)+1);
strcpy(sam_chr_buf, sam_chr);
+// SUBREADprintf("ALIAS_LOAD : '%s' => '%s'\n", sam_chr_buf, anno_chr_buf);
HashTablePut(ret, sam_chr_buf, anno_chr_buf);
}
=====================================
src/core-interface-aligner.c
=====================================
@@ -68,6 +68,7 @@ static struct option long_options[] =
{"multiMapping", no_argument, 0, 0},
{"keepReadOrder", no_argument, 0, 0},
{"sortReadsByCoordinates", no_argument, 0, 0},
+ {"noTLENpreference", no_argument, 0, 0},
{0, 0, 0, 0}
};
@@ -583,6 +584,10 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
global_context -> config.use_memory_buffer = 1;
global_context -> config.reads_per_chunk = 600llu*1024*1024;
}
+ else if(strcmp("noTLENpreference", long_options[option_index].name) == 0)
+ {
+ global_context -> config.no_TLEN_preference = 1;
+ }
else if(strcmp("gtfFeature", long_options[option_index].name) == 0)
{
strncpy(global_context->config.exon_annotation_feature_name_column, optarg, MAX_READ_NAME_LEN - 1);
=====================================
src/core-junction.c
=====================================
@@ -1106,7 +1106,7 @@ void copy_vote_to_alignment_res(global_context_t * global_context, thread_contex
// if the covered region has a wrong arrangement to their relative positions.
int test_minor_res = test_junction_minor(global_context, thread_context, current_vote, vote_i, vote_j, i, j, dist);
//#warning " ============ DEBUG 1 ==================== "
- if(0 && FIXLENstrcmp("R002403247", read_name) == 0) {
+ if(0 && FIXLENstrcmp("R002403247", read_name) == 0) {
char posout2[100];
char posout1[100];
absoffset_to_posstr(global_context, current_vote -> pos[vote_i][vote_j], posout1);
@@ -2402,7 +2402,6 @@ int process_voting_junction_PE_topK(global_context_t * global_context, thread_co
simple_mapping_t * current_loc = is_second_read?comb_buffer[i].r2_loc:comb_buffer[i].r1_loc;
assert(current_loc);
unsigned int current_pos = current_loc->mapping_position;
- //SUBREADprintf("CLLL BUF %d R_%d : %u\n", i, 1+is_second_read, current_loc->mapping_position);
int is_exist = 0;
for(j = 0; j < *current_r_cursor; j++)
@@ -2412,6 +2411,7 @@ int process_voting_junction_PE_topK(global_context_t * global_context, thread_co
break;
}
}
+ //SUBREADprintf("CLLL BUF %d R_%d : %u ; EXIST %d. Written into the %d-th best location\n", i, 1+is_second_read, current_loc->mapping_position, is_exist, *current_r_cursor);
if(!is_exist){
if(current_loc -> is_vote_t_item)
@@ -2984,9 +2984,10 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
else
tail_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text + read_cursor, current_perfect_section_abs, tmp_int, 1, adj_coverage_end);
- //SUBREADprintf("SSTAIL:%d\n", tail_soft_clipped);
+ if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0)
+ SUBREADprintf("SSTAIL:%d\n", tail_soft_clipped);
- if(tail_soft_clipped == tmp_int){
+ if(1 && tail_soft_clipped == tmp_int){
tail_soft_clipped = 0;
if(full_section_clipped)(*full_section_clipped) = 1;
} else has_clipping_this_section_tail = 1;
@@ -3125,6 +3126,8 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
}
}
+ if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0)
+ SUBREADprintf("NEW_CIGAR_2 : %s\n", new_cigar_tmp);
strcpy(cigar_string, new_cigar_tmp);
}
@@ -3347,11 +3350,11 @@ unsigned int finalise_explain_CIGAR(global_context_t * global_context, thread_co
//#warning " ========== COMMENT THIS LINE !! ========="
- if(0 && FIXLENstrcmp("R00000110641", explain_context -> read_name) ==0){
+ if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", explain_context -> read_name) ==0){
char outpos1[100];
absoffset_to_posstr(global_context, final_position, outpos1);
SUBREADprintf("FINALQUAL %s : FINAL_POS=%s ( %u )\tCIGAR=%s\tMM=%d / MAPLEN=%d > %d?\tVOTE=%d > %0.2f x %d ? MASK=%d\tQUAL=%d\tBRNO=%d\nKNOWN_JUNCS=%d PENALTY=%d\n\n", explain_context -> read_name, outpos1 , final_position , tmp_cigar, mismatch_bases, non_clipped_length, applied_mismatch, result -> selected_votes, global_context -> config.minimum_exonic_subread_fraction,result-> used_subreads_in_vote, result->result_flags, final_qual, explain_context -> best_read_id, known_junction_supp, explain_context -> best_indel_penalty);
- exit(0);
+ //exit(0);
}
=====================================
src/core.c
=====================================
@@ -2676,7 +2676,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
if(current_result->selected_votes < global_context->config.minimum_subread_for_second_read || max_votes < global_context->config.minimum_subread_for_first_read)
{
// if(current_read_number == 111 || current_read_number == 112)
- // SUBREADprintf("RESET0 [%d] R_%d SEL=%d MAX=%d\n", current_read_number, is_second_read + 1, current_result->selected_votes, max_votes);
+ //SUBREADprintf("RESET0 [%lld] R_%d SEL=%d MAX=%d\n", current_read_number, is_second_read + 1, current_result->selected_votes, max_votes);
current_result -> selected_votes = 0;
continue;
}
@@ -2688,7 +2688,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
}
//#warning ">>>>>>> COMMENT THIS <<<<<<<"
- //printf("OCT27-STEP21-%s:%d-ALN%02d-THRE %d\n", current_read_name, is_second_read + 1, best_read_id, thread_context -> thread_id);
+ //printf("OCT27-STEP21-%s:%d-ALN%02d-THRE %d in MM=%d\n", current_read_name, is_second_read + 1, best_read_id, thread_context -> thread_id, global_context -> config.multi_best_reads);
int is_negative_strand = (current_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
@@ -2844,7 +2844,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
test_PE_and_same_chro_align(global_context , realignment_result_R1 , realignment_result_R2, &is_exonic_regions, &is_PE, &is_same_chro , read_len_1, read_len_2, read_name_1, &tlen);
unsigned long long TLEN_exp_score = 0;
- if(is_PE && global_context -> config.reported_multi_best_reads<2){
+ if(is_PE && global_context -> config.no_TLEN_preference == 0 && global_context -> config.reported_multi_best_reads<2){
TLEN_exp_score = (tlen > expected_TLEN)?(tlen - expected_TLEN):(expected_TLEN - tlen);
if(TLEN_exp_score>999) TLEN_exp_score=0;
else TLEN_exp_score = 999-TLEN_exp_score;
@@ -3017,6 +3017,7 @@ int do_iteration_two(global_context_t * global_context, thread_context_t * threa
}
free(final_realignments);
+ free(final_realignment_number);
free(final_MATCH_buffer1);
free(final_MISMATCH_buffer1);
free(final_PENALTY_buffer1);
@@ -3248,7 +3249,7 @@ int do_voting(global_context_t * global_context, thread_context_t * thread_conte
SUBREADprintf(" ======= PAIR %s = %llu ; NON_INFORMATIVE = %d, %d =======\n", read_name_1, current_read_number, vote_1 -> noninformative_subreads, vote_2 -> noninformative_subreads);
print_votes(vote_1, global_context -> config.index_prefix);
print_votes(vote_2, global_context -> config.index_prefix);
- if(is_reversed==1)exit(0);
+ //if(is_reversed==1)exit(0);
}
if(global_context -> input_reads.is_paired_end_reads)
@@ -3615,6 +3616,7 @@ int read_chunk_circles(global_context_t *global_context)
print_in_box(80,0,0, "Load the %d-th index block...",1+ global_context->current_index_block_number);
if(gehash_load(global_context -> current_index, tmp_fname)) return -1;
+ print_in_box(80,0,0, "The index block has been loaded.");
sprintf(tmp_fname, "%s.%02d.%c.array", global_context->config.index_prefix, global_context->current_index_block_number, global_context->config.space_type == GENE_SPACE_COLOR?'c':'b');
}
period_load_index = miltime() - time_load_index;
@@ -3628,6 +3630,7 @@ int read_chunk_circles(global_context_t *global_context)
global_context -> is_final_voting_run = 1;
else global_context -> is_final_voting_run = 0;
+ print_in_box(80,0,0, "Start read mapping in chunk.");
double time_start_voting = miltime();
ret = run_maybe_threads(global_context, STEP_VOTING);
double period_voting = miltime() - time_start_voting;
@@ -3980,12 +3983,46 @@ int do_anno_bitmap_add_feature(char * gene_name, char * tracnscript_id, char * c
return 0;
}
-void warning_anno_vs_index(HashTable * anno_chros_tab, gene_offset_t * index_chros_offset){
- HashTable * index_chros_tab = index_chros_offset -> read_name_to_index;
- warning_hash_hash(anno_chros_tab, index_chros_tab, "Chromosomes/contigs in annotation but not in index :");
- warning_hash_hash(index_chros_tab, anno_chros_tab, "Chromosomes/contigs in index but not in annotation :");
+void convert_table_key_to_alias(void * ky ,void * va, HashTable * tab){
+ HashTable * alias_index_to_annot = tab -> appendix2;
+ HashTable * result_annot_chros = tab -> appendix1;
+
+ char * index_name = ky;
+ char * annot_name = HashTableGet(alias_index_to_annot, index_name);
+ if(annot_name==NULL) annot_name = index_name;
+ HashTablePut( result_annot_chros, annot_name, va );
}
+void warning_anno_vs_index(HashTable * anno_chros_tab, gene_offset_t * index_chros_offset, HashTable * alias_index_to_annot){
+
+ HashTable * index_chros_tab = index_chros_offset -> read_name_to_index;
+ HashTable * chros_in_annot_using_index_names = anno_chros_tab ;
+ HashTable * chros_in_index_using_annot_names = index_chros_tab ;
+ HashTable * alias_annot_to_index = NULL;
+
+ if(alias_index_to_annot){
+ chros_in_annot_using_index_names = StringTableCreate(1000);
+ chros_in_index_using_annot_names = StringTableCreate(1000);
+
+ alias_annot_to_index = StringTableReverse(alias_index_to_annot);
+ anno_chros_tab -> appendix1 = chros_in_annot_using_index_names;
+ anno_chros_tab -> appendix2 = alias_annot_to_index;
+ HashTableIteration(anno_chros_tab, convert_table_key_to_alias);
+
+ index_chros_tab -> appendix1 = chros_in_index_using_annot_names;
+ index_chros_tab -> appendix2 = alias_index_to_annot;
+ HashTableIteration(index_chros_tab, convert_table_key_to_alias);
+ }
+
+ warning_hash_hash(anno_chros_tab, chros_in_index_using_annot_names, "Chromosomes/contigs in annotation but not in index :");
+ warning_hash_hash(index_chros_tab, chros_in_annot_using_index_names, "Chromosomes/contigs in index but not in annotation :");
+
+ if(alias_index_to_annot){
+ HashTableDestroy(alias_annot_to_index);
+ HashTableDestroy(chros_in_annot_using_index_names);
+ HashTableDestroy(chros_in_index_using_annot_names);
+ }
+}
int load_annotated_exon_regions(global_context_t * global_context){
int bitmap_size = (4096 / EXONIC_REGION_RESOLUTION / 8)*1024*1024;
@@ -3999,8 +4036,6 @@ int load_annotated_exon_regions(global_context_t * global_context){
int loaded_features = load_features_annotation(global_context->config.exon_annotation_file, global_context->config.exon_annotation_file_type, global_context->config.exon_annotation_gene_id_column, NULL, global_context->config.exon_annotation_feature_name_column, global_context, do_anno_bitmap_add_feature);
if(loaded_features < 0)return -1;
else print_in_box(80,0,0,"%d annotation records were loaded.\n", loaded_features);
- warning_anno_vs_index(global_context -> annotation_chro_table, &global_context -> chromosome_table);
- HashTableDestroy(global_context -> annotation_chro_table);
return 0;
}
@@ -4021,9 +4056,10 @@ int load_global_context(global_context_t * context)
context -> config.multi_best_reads = max(context -> config.multi_best_reads , context -> config.reported_multi_best_reads);
if(context->config.multi_best_reads>1) context -> config.reads_per_chunk /= context->config.multi_best_reads;
}
+ print_in_box(80,0,0,"Check the input reads.");
subread_init_lock(&context->input_reads.input_lock);
if(core_geinput_open(context, &context->input_reads.first_read_file, 1,1)) {
- //sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable to open '%s' as input. Please check if it exists, you have the permission to read it, and it is in the correct format.\n", context->config.first_read_file);
+ // sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable to open '%s' as input. Please check if it exists, you have the permission to read it, and it is in the correct format.\n", context->config.first_read_file);
return -1;
}
@@ -4061,6 +4097,7 @@ int load_global_context(global_context_t * context)
context -> config.max_vote_number_cutoff = 2;
}
+ print_in_box(80,0,0,"Initialise the memory objects.");
context -> config.max_vote_simples = max(context -> config.max_vote_simples , context -> config.reported_multi_best_reads);
context -> config.max_vote_combinations = max(context -> config.max_vote_combinations , context -> config.reported_multi_best_reads);
@@ -4075,6 +4112,7 @@ int load_global_context(global_context_t * context)
stat(context->config.first_read_file , &ginp1_stat);
context->input_reads.first_read_file_size = ginp1_stat.st_size;
+ print_in_box(80,0,0,"Estimate the mean read length.");
context -> input_reads.avg_read_length = guess_reads_density_format(context->config.first_read_file , context->config.is_SAM_file_input?1:0, &min_phred_score, &max_phred_score , &guess_tested_reads);
if(context -> input_reads.avg_read_length<0 )context -> input_reads.avg_read_length = 250;
// SUBREADprintf("QR=[%d,%d]; ALEN=%f\n", min_phred_score, max_phred_score, context -> input_reads.avg_read_length);
@@ -4110,6 +4148,7 @@ int load_global_context(global_context_t * context)
// Only the sam file is opened here; other files like bed, indel and etc are opened in init_modules()
sprintf(tmp_fname,"%s", context->config.output_prefix);
+ print_in_box(80,0,0,"Create the output %s file.", context -> config.is_BAM_output?"BAM":"SAM");
if(context -> config.is_BAM_output)
{
context -> output_bam_writer = malloc(sizeof(SamBam_Writer));
@@ -4140,6 +4179,7 @@ int load_global_context(global_context_t * context)
}
// ====== check index files, count blocks and load chro table ======
+ print_in_box(80,0,0,"Check the index.");
sprintf(tmp_fname, "%s.reads", context->config.index_prefix);
if(!does_file_exist(tmp_fname))
{
@@ -4163,7 +4203,6 @@ int load_global_context(global_context_t * context)
return -1;
}
-
context->index_block_number = 0;
while(1)
{
@@ -4181,6 +4220,7 @@ int load_global_context(global_context_t * context)
if(context->config.report_sam_file)
write_sam_headers(context);
+ print_in_box(80,0,0,"Init the voting space.");
// ====== init other variables ======
if(context -> config.all_threads>1)
subread_init_lock(&context -> thread_initial_lock);
@@ -4197,10 +4237,14 @@ int load_global_context(global_context_t * context)
context -> sam_chro_to_anno_chr_alias = NULL;
if(context->config.exon_annotation_file[0]){
+ print_in_box(80,0,0,"Load the annotation file.");
if( load_annotated_exon_regions( context ) ) return -1;
if(context->config.exon_annotation_alias_file[0])
context -> sam_chro_to_anno_chr_alias = load_alias_table(context->config.exon_annotation_alias_file);
+ warning_anno_vs_index(context -> annotation_chro_table, &context -> chromosome_table, context -> sam_chro_to_anno_chr_alias);
+ HashTableDestroy(context -> annotation_chro_table);
} else context -> exonic_region_bitmap = NULL;
+ print_in_box(80,0,0,"Global environment is initialised.");
return 0;
}
=====================================
src/core.h
=====================================
@@ -204,6 +204,7 @@ typedef struct{
int use_quality_score_break_ties;
int big_margin_record_size;
int PE_predominant_weight;
+ int no_TLEN_preference;
// subjunc
int entry_program_name;
=====================================
src/detection-calls.c
=====================================
@@ -12,8 +12,8 @@
#include "gene-algorithms.h"
typedef struct{
- char gene_name[FEATURE_NAME_LENGTH];
- char chro_name[MAX_CHROMOSOME_NAME_LEN];
+ char gene_name[FEATURE_NAME_LENGTH+2];
+ char chro_name[MAX_CHROMOSOME_NAME_LEN+2];
unsigned int start, end;
char is_negative_strand;
} DTCexon_t;
@@ -59,13 +59,13 @@ int DTCdo_add_feature(char * gene_name, char * transcript_name, char * chro_name
return 0;
}
-int DTCcompare_merge_genes(void * L_elem, void * R_elem){
+int DTCcompare_merge_genes(void * L_elem, void * R_elem, ArrayList *me){
unsigned int * le = L_elem, * re = R_elem;
if(le[1] > re[1]) return 1;
if(le[1] < re[1]) return -1;
return 0;
}
-int DTCcompare_exons(void * L_elem, void * R_elem){
+int DTCcompare_exons(void * L_elem, void * R_elem, ArrayList *me){
DTCexon_t * le = L_elem, * re = R_elem;
int chname_cmp = strcmp(le -> chro_name, re -> chro_name);
if(chname_cmp) return chname_cmp;
=====================================
src/flattenAnnotations.c
=====================================
@@ -145,7 +145,7 @@ int flatAnno_do_anno_1R(char * gene_name, char * transcript_name, char * chro_na
return 0;
}
-int flatAnno_do_anno_merge_one_array_compare(void * vL, void * vR){
+int flatAnno_do_anno_merge_one_array_compare(void * vL, void * vR, ArrayList *me){
int * iL = vL, *iR = vR;
if((*iL)>(*iR))return 1;
if((*iL)<(*iR))return -1;
@@ -246,12 +246,16 @@ void flatAnno_do_anno_merge_one_array(void * key, void * hashed_obj, HashTable *
this_list -> numOfElements = n1_items+1;
}
+int flatme_strcmp(void * L, void * R, ArrayList * me){
+ return strcmp(L, R);
+}
+
int flatAnno_do_anno_merge_and_write(flatAnno_context_t * context){
context -> gene_chro_strand_to_exons_table -> appendix1 = context;
HashTableIteration(context -> gene_chro_strand_to_exons_table, context -> merging_mode == MERGING_MODE_CHOPPING? flatAnno_do_anno_chop_one_array :flatAnno_do_anno_merge_one_array);
ArrayList * all_chro_st_list = HashTableKeyArray(context -> gene_chro_strand_to_exons_table);
- ArrayListSort(all_chro_st_list, (int(*)(void * , void*))strcmp);
+ ArrayListSort(all_chro_st_list, flatme_strcmp);
fprintf( context -> output_FP , "GeneID\tChr\tStart\tEnd\tStrand\n");
int i,j;
=====================================
src/gene-algorithms.c
=====================================
@@ -2551,6 +2551,7 @@ int get_base_error_prob64i(char v)
return PROB_QUAL_INT_TABLE[v-'@'];
}
+#ifdef SKIP_THIS_PART
void bad_reverse_cigar(char * cigar)
{
int cigar_cursor = 0;
@@ -2600,6 +2601,7 @@ int debug_main()
SUBREADprintf("%s\n",cg);
return 0;
}
+#endif
void remove_indel_neighbours(HashTable * indel_table)
{
=====================================
src/hashtable.c
=====================================
@@ -6,7 +6,7 @@
* Released to the public domain.
*
*--------------------------------------------------------------------------
- * $Id: hashtable.c,v 9999.39 2019/08/28 22:31:14 cvs Exp $
+ * $Id: hashtable.c,v 9999.42 2019/12/05 03:05:37 cvs Exp $
\*--------------------------------------------------------------------------*/
#include <stdio.h>
@@ -24,7 +24,7 @@ static int isProbablePrime(srInt_64 number);
static srInt_64 calculateIdealNumOfBuckets(HashTable *hashTable);
srInt_64 long_random_val(){
- long ret = 0;
+ srInt_64 ret = 0;
if(RAND_MAX<255){
SUBREADprintf("Is this a embedded computer????\n");
return -1;
@@ -37,6 +37,13 @@ srInt_64 long_random_val(){
return ret;
}
+HashTable * ArrayListToLookupTable_Int(ArrayList * arr){
+ srInt_64 x1;
+ HashTable * ret = HashTableCreate( max(1,arr -> numOfElements / 6) );
+ for(x1 = 0; x1 < arr->numOfElements; x1++) HashTablePut(ret, ArrayListGet(arr, x1)+1, NULL+1);
+ return ret;
+}
+
void * ArrayListShift(ArrayList * list){
if(list->numOfElements<1) return NULL;
void *ret = list->elementList [0];
@@ -116,7 +123,7 @@ void ArrayListSetDeallocationFunction(ArrayList * list, void (*elem_deallocator
list -> elemDeallocator = elem_deallocator;
}
-int ArrayListSort_comp_pntr( void * L_elem, void * R_elem ){
+int ArrayListSort_comp_pntr( void * L_elem, void * R_elem, ArrayList * me ){
if( L_elem > R_elem )return 1;
if( L_elem < R_elem )return -1;
return 0;
@@ -125,11 +132,11 @@ int ArrayListSort_comp_pntr( void * L_elem, void * R_elem ){
int ArrayListSort_compare(void * sortdata0, int L, int R){
void ** sortdata = sortdata0;
ArrayList * list = sortdata[0];
- int (*comp_elems)(void * L_elem, void * R_elem) = sortdata[1];
+ int (*comp_elems)(void * L_elem, void * R_elem, ArrayList * me) = sortdata[1];
void * L_elem = list -> elementList[L];
void * R_elem = list -> elementList[R];
- return comp_elems(L_elem, R_elem);
+ return comp_elems(L_elem, R_elem, list);
}
void ArrayListSort_exchange(void * sortdata0, int L, int R){
@@ -144,7 +151,7 @@ void ArrayListSort_exchange(void * sortdata0, int L, int R){
void ArrayListSort_merge(void * sortdata0, int start, int items, int items2){
void ** sortdata = sortdata0;
ArrayList * list = sortdata[0];
- int (*comp_elems)(void * L_elem, void * R_elem) = sortdata[1];
+ int (*comp_elems)(void * L_elem, void * R_elem, ArrayList * me) = sortdata[1];
void ** merged = malloc(sizeof(void *)*(items + items2));
int write_cursor, read1=start, read2=start+items;
@@ -153,7 +160,7 @@ void ArrayListSort_merge(void * sortdata0, int start, int items, int items2){
void * Elm1 = (read1 == start + items)? NULL:list -> elementList[read1];
void * Elm2 = (read2 == start + items + items2)?NULL:list -> elementList[read2];
- int select_1 = (read1 == start + items)?0:( read2 == start + items + items2 || comp_elems(Elm1, Elm2) < 0 );
+ int select_1 = (read1 == start + items)?0:( read2 == start + items + items2 || comp_elems(Elm1, Elm2, list) < 0 );
if(select_1) merged[write_cursor] = list -> elementList[read1++];
else merged[write_cursor] = list -> elementList[read2++];
if(read2 > start + items + items2){
@@ -167,7 +174,7 @@ void ArrayListSort_merge(void * sortdata0, int start, int items, int items2){
free(merged);
}
-void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem)){
+void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem, ArrayList * me)){
void * sortdata[2];
sortdata[0] = list;
sortdata[1] = compare_L_minus_R?compare_L_minus_R:ArrayListSort_comp_pntr;
@@ -239,6 +246,21 @@ HashTable * StringTableCreate(srInt_64 numOfBuckets){
return ret;
}
+void StringTableReverse_Run(void * ky, void * va, HashTable * tab){
+ HashTable * res = tab -> appendix1;
+ HashTablePut(res, va, ky);
+}
+
+HashTable * StringTableReverse(HashTable * ori){
+ HashTable * res = StringTableCreate(ori -> numOfBuckets);
+ void * oriap1 = ori -> appendix1;
+ ori -> appendix1 = res;
+
+ HashTableIteration(ori, StringTableReverse_Run);
+ ori -> appendix1 = oriap1;
+ return res;
+}
+
ArrayList * HashTableKeys(HashTable * tab){
int i;
ArrayList * ret = ArrayListCreate(tab -> numOfElements);
@@ -990,3 +1012,34 @@ void HashTableIteration(HashTable * tab, void process_item(void * key, void * ha
}
}
}
+
+void HashTableSortedIndexes_copy_idx( void *k, void *v, HashTable * k2int_tab ){
+ ArrayList * dsta = k2int_tab -> appendix1;
+ ArrayListPush(dsta, k);
+}
+
+int HashTableSortedIndexes_cmp_idx( void * Lv, void * Rv, ArrayList * me ){
+ void ** appdx = me -> appendix1;
+ HashTable * k2int_tab = appdx[0];
+ void * Lhash = HashTableGet(k2int_tab, Lv);
+ void * Rhash = HashTableGet(k2int_tab, Rv);
+ void * large_first = appdx[1];
+ if(large_first){
+ if(Lhash > Rhash) return -1;
+ if(Lhash < Rhash) return 1;
+ }else{
+ if(Lhash > Rhash) return 1;
+ if(Lhash < Rhash) return -1;
+ }
+ return 0;
+}
+
+ArrayList * HashTableSortedIndexes(HashTable * k2int_tab, int larger_value_first){
+ ArrayList *ret = HashTableKeys(k2int_tab);
+ void * appx[2];
+ ret -> appendix1 = appx;
+ appx[0]=k2int_tab;
+ appx[1]=NULL+larger_value_first;
+ ArrayListSort(ret, HashTableSortedIndexes_cmp_idx);
+ return ret;
+}
=====================================
src/hashtable.h
=====================================
@@ -6,7 +6,7 @@
* Released to the public domain.
*
*--------------------------------------------------------------------------
- * $Id: hashtable.h,v 9999.27 2019/08/25 10:12:06 cvs Exp $
+ * $Id: hashtable.h,v 9999.30 2019/12/05 03:05:37 cvs Exp $
\*--------------------------------------------------------------------------*/
#ifndef _HASHTABLE_H
@@ -71,7 +71,7 @@ int ArrayListPush_NoRepeatedPtr(ArrayList * list, void * new_elem);
void * ArrayListShift(ArrayList * list);
void * ArrayListPop(ArrayList * list);
void ArrayListSetDeallocationFunction(ArrayList * list, void (*elem_deallocator)(void *elem));
-void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem));
+void ArrayListSort(ArrayList * list, int compare_L_minus_R(void * L_elem, void * R_elem, ArrayList * me));
// A simple comparison function if you want to sort unsigned long long ints.
int ArrayListLLUComparison(void * L_elem, void * R_elem);
@@ -115,6 +115,7 @@ ArrayList * HashTableKeyArray(HashTable * tab);
\*--------------------------------------------------------------------------*/
HashTable * StringTableCreate(srInt_64 numOfBuckets);
+HashTable * StringTableReverse(HashTable * ori);
HashTable *HashTableCreate(srInt_64 numOfBuckets);
/*--------------------------------------------------------------------------*\
@@ -494,6 +495,11 @@ srUInt_64 HashTableStringHashFunction(const void *key);
void free_values_destroy(HashTable * tab);
int HashTablePutReplaceEx(HashTable *hashTable, const void *key, void *value, int replace_key, int dealloc_key, int dealloc_value);
+ArrayList * HashTableSortedIndexes(HashTable * k2int_tab, int larger_value_first);
+
+// NOTE: values are all added by 1.
+HashTable * ArrayListToLookupTable_Int(ArrayList * arr);
+
#endif /* _HASHTABLE_H */
=====================================
src/index-builder.c
=====================================
@@ -441,31 +441,25 @@ int build_gene_index(const char index_prefix [], char ** chro_files, int chro_fi
int add_repeated_subread(gehash_t * tab , unsigned int subr, unsigned char ** huge_index)
{
- int huge_byte = (subr>>2) &0x3fffffff;
- int huge_offset = (subr % 4) * 2;
- int ind_id = (huge_byte >> 20) & 1023;
+ int huge_byte = (subr>>1);
+ int huge_offset = (subr % 2) * 4;
+ int ind_id = (huge_byte >> 24) & 127;
- if(NULL == huge_index[ind_id]) huge_index[ind_id] = calloc(1024*1024, 1);
- if(NULL == huge_index[ind_id]){
- SUBREADprintf("ERROR: No memory can be allocated.\nThe program has to terminate\n");
- return -1;
- }
- unsigned int byte_value = huge_index[ ind_id ][huge_byte&0xfffff] ;
+ unsigned int byte_value = huge_index[ ind_id ][huge_byte&0xffffff] ;
- int huge_value = (byte_value>> huge_offset) & 0x3;
- if(huge_value <3){
+ int huge_value = (byte_value>> huge_offset) & 0xf;
+ if(huge_value <15){
huge_value ++;
- huge_index[ ind_id ][huge_byte&0xfffff] = (byte_value & (~(0x3 << huge_offset))) | (huge_value << huge_offset);
+ byte_value = (byte_value & (~(0xf << huge_offset))) | (huge_value << huge_offset);
+ huge_index[ ind_id ][huge_byte&0xffffff] = (unsigned char)(byte_value&0xff);
return 0;
}
unsigned int times = 0;
int matched = gehash_get(tab, subr, ×, 1);
if(matched)
- {
gehash_update(tab, subr, times+1);
- }
else
- if(gehash_insert(tab, subr,4, NULL)) return 1;
+ if(gehash_insert(tab, subr, 16, NULL)) return 1;
return 0;
}
@@ -480,26 +474,17 @@ int scan_gene_index(const char index_prefix [], char ** chro_files, int chro_fil
*actual_total_bases_inc_marging = padding_around_contigs;
gehash_t occurrence_table;
- unsigned char * huge_index[1024];
-
- for(i=0;i<1024;i++)
- {
- huge_index[i] = NULL;
+ unsigned char * huge_index[128];
- if(0){//memory is allocated when needed.
- huge_index[i] = (unsigned char *)malloc(1024*1024);
-
- if(!huge_index[i])
- {
- for(j=0;j<i;j++) free(huge_index[j]);
- SUBREADprintf("ERROR: You need at least one point five gigabytes of memory for building the index.\n");
- return -1;
- }
- memset(huge_index[i], 0 , 1024*1024);
+ for(i=0;i<128;i++) {
+ huge_index[i] = calloc(1024*1024*16,1);
+ if(NULL == huge_index[i]){
+ SUBREADprintf("ERROR: No memory can be allocated.\nThe program has to terminate\n");
+ return -1;
}
}
- if(gehash_create_ex(&occurrence_table, 150000000, 0, SUBINDEX_VER0, 1, 0)) return 1;
+ if(gehash_create_ex(&occurrence_table, 500000, 0, SUBINDEX_VER0, 1, 0)) return 1;
gene_input_t ginp;
@@ -677,7 +662,7 @@ int scan_gene_index(const char index_prefix [], char ** chro_files, int chro_fil
}
- for(i=0;i<1024;i++)
+ for(i=0;i<128;i++)
if(huge_index[i]) free(huge_index[i]);
gehash_destory(&occurrence_table);
=====================================
src/input-files.c
=====================================
@@ -2544,7 +2544,8 @@ int SAM_pairer_create(SAM_pairer_context_t * pairer, int all_threads, int bin_bu
pairer -> threads[x1].thread_id = x1;
pairer -> threads[x1].reads_in_SBAM = 0;
pairer -> threads[x1].input_buff_SBAM = malloc(pairer -> input_buff_SBAM_size);
- pairer -> threads[x1].input_buff_BIN = malloc(pairer -> input_buff_BIN_size);
+ pairer -> threads[x1].input_buff_BIN_capacity = pairer -> input_buff_BIN_size;
+ pairer -> threads[x1].input_buff_BIN = malloc(pairer -> threads[x1].input_buff_BIN_capacity );
pairer -> threads[x1].input_buff_BIN_used = 0;
pairer -> threads[x1].orphant_table = HashTableCreate(pairer -> input_buff_SBAM_size / 100);
@@ -2615,7 +2616,7 @@ int SAM_pairer_read_BAM_block(FILE * fp, int max_read_len, char * inbuff) {
return -1;
}
if(gz_header_12[0]!=31 || gz_header_12[1]!=139){
- SUBREADprintf("Unrecognized Gzip headers: %u, %u\nPlease make sure if the input file is in the BAM format.\n", gz_header_12[0], gz_header_12[1]);
+ //SUBREADprintf("Unrecognized Gzip headers: %u, %u\nPlease make sure if the input file is in the BAM format.\n", gz_header_12[0], gz_header_12[1]);
return -1;
}
unsigned short xlen = 0, bsize = 0;
@@ -2768,16 +2769,31 @@ int SAM_pairer_fetch_BAM_block(SAM_pairer_context_t * pairer , SAM_pairer_thread
inflateReset(&thread_context -> strm);
- thread_context -> strm.avail_in = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr);
+ int lin, lout;
+
+ lin=thread_context -> strm.avail_in = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr);
thread_context -> strm.next_in = (unsigned char *)thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;
- thread_context -> strm.avail_out = pairer -> input_buff_BIN_size - thread_context -> input_buff_BIN_used;
+
+ if( thread_context -> input_buff_BIN_capacity < thread_context -> input_buff_BIN_used + 128*1024){
+ thread_context -> input_buff_BIN_capacity = max(thread_context -> input_buff_BIN_used, thread_context -> input_buff_BIN_capacity )*1.5;
+ if(thread_context -> input_buff_BIN_capacity > 1024*1024*1024){
+ SUBREADprintf("ERROR: buffer size larger than 1GB\n");
+ return 1;
+ }else{
+ //SUBREADprintf("Resize Buffer of Th_%d to %d (used %d); In_ava=%d - %d\n", thread_context -> thread_id, thread_context -> input_buff_BIN_capacity, thread_context -> input_buff_BIN_used, thread_context -> input_buff_SBAM_used , thread_context -> input_buff_SBAM_ptr);
+ }
+ thread_context -> input_buff_BIN = realloc( thread_context -> input_buff_BIN , thread_context -> input_buff_BIN_capacity);
+ //SUBREADprintf(" PTR=%p\n",thread_context -> input_buff_BIN);
+ assert( thread_context -> input_buff_BIN );
+ }
+ lout=thread_context -> strm.avail_out = thread_context -> input_buff_BIN_capacity - thread_context -> input_buff_BIN_used;
thread_context -> strm.next_out = (unsigned char *)thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_used;
int ret = inflate(&thread_context ->strm, Z_FINISH);
if(ret == Z_OK || ret == Z_STREAM_END)
{
- int have = pairer -> input_buff_BIN_size - thread_context ->strm.avail_out - thread_context -> input_buff_BIN_used;
- int used_BAM = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr) - thread_context -> strm.avail_in;
+ int have = lout - thread_context ->strm.avail_out;
+ int used_BAM = lin - thread_context -> strm.avail_in;
//SUBREADprintf("ABBO TH %d : INFLATED BAM_CONSUMED: %d BIN_USED: %d => %d NEED_FIND_START=%d\n", thread_context -> thread_id, used_BAM, thread_context -> input_buff_BIN_used , thread_context -> input_buff_BIN_used+have, thread_context -> need_find_start);
thread_context -> input_buff_BIN_used += have;
@@ -2785,16 +2801,24 @@ int SAM_pairer_fetch_BAM_block(SAM_pairer_context_t * pairer , SAM_pairer_thread
if(thread_context -> need_find_start){
int test_read_bin = SAM_pairer_find_start(pairer, thread_context);
- //#warning "====================== FOR TESTINg FALL BACK ONLY, MUST REMOVE FROM RELEASE ============================"
if(test_read_bin<1 && thread_context -> input_buff_BIN_used >= 32 ){
pairer -> is_bad_format = 1;
- //SUBREADprintf("ABBO : BAD_FMT 01\n");
+ // #warning "BADFORMAT-DEBUG"
+ // SUBREADprintf("ABBO : BAD_FMT 01\n");
}
}
+ //SUBREADprintf("FETCHED BLOCK DECOMP=%d FROM COMP=%d\n", have, used_BAM);
} else {
- SUBREADprintf("GZIP ERROR:%d\n", ret);
+ if(ret == -5){
+ SUBREADprintf("Cannot parse the input BAM file. If the BAM file contains long reads, please run featureCounts on the long-read mode.\n");
+ }else{
+ SUBREADprintf("GZIP ERROR:%d\n", ret);
+ }
+ pairer -> is_bad_format = 1;
+ pairer -> is_internal_error = 1;
return 1;
}
+
return 0;
}
@@ -2834,7 +2858,7 @@ void SAM_pairer_reduce_BAM_bin(SAM_pairer_context_t * pairer, SAM_pairer_thread_
}
-#define MAX_BIN_RECORD_LENGTH ( 24*1024)
+#define MAX_BIN_RECORD_LENGTH ( 20*1024*1024)
int reduce_SAM_to_BAM(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, int include_sequence);
int is_read_bin(char * bin, int bin_len, int max_refID);
@@ -2886,7 +2910,8 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
ref_bin_len += 4;
}
- is_OK = is_OK || pairer -> output_header(pairer, thread_context -> thread_id, 0, pairer -> BAM_n_ref , header_txt , ref_bin_len );
+ is_OK = is_OK || pairer -> output_header(pairer, thread_context -> thread_id, 0, pairer -> BAM_n_ref , header_txt , ref_bin_len );
+ //SUBREADprintf("TFMT:HEADER REFS=%d TXTS=%d SIGN=%u\n", pairer -> BAM_n_ref, pairer->BAM_l_text, bam_signature);
if(header_txt) free(header_txt);
if(is_OK){
@@ -2895,11 +2920,7 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
}
pairer -> BAM_header_parsed = 1;
- //if(pairer -> display_progress)
- // SUBREADprintf("\nThe header was parsed: %d reference sequences were found.\nScanning the input file.\n", pairer -> BAM_n_ref);
SAM_pairer_fetch_BAM_block(pairer, thread_context);
-
- //SUBREADprintf("HEAD_FINISHED, BAD=%d\n", pairer -> is_bad_format);
}
if(pairer -> is_bad_format) return 0;
@@ -2935,15 +2956,19 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
}
}
+ //SUBREADprintf("TFMT:RLEN=%d\n", record_len);
+
if(!pairer -> is_bad_format){
unsigned int seq_len = 0;
thread_context -> input_buff_BIN_ptr += 4;
memcpy(&seq_len, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr + 16, 4);
//SUBREADprintf("REDUCE_2: record %u, %u\n", record_len, seq_len);
- if(record_len < 32 || record_len > min(MAX_BIN_RECORD_LENGTH,60000) || seq_len >= pairer -> long_read_minimum_length){
+ // #warning "=========== CHECK IF '0 && ' IS CORRECT ==========="
+ if(record_len < 32 || (0 && record_len > min(MAX_BIN_RECORD_LENGTH,60000))|| seq_len >= pairer -> long_read_minimum_length){
if(seq_len >= pairer -> long_read_minimum_length) pairer -> is_single_end_mode = 1;
- // SUBREADprintf("BADFMT: THID=%d; rlen %d; seqlen %d; room %d < %d\n", thread_context -> thread_id, record_len, seq_len);
+ // #warning "BADFORMAT-DEBUG"
+ // SUBREADprintf("BADFMT: THID=%d; rlen %d; seqlen %d; BIN_PTR=%d\n", thread_context -> thread_id, record_len, seq_len, thread_context -> input_buff_BIN_ptr);
pairer -> is_bad_format = 1;
return 0;
}
@@ -2951,16 +2976,6 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
(* bin_where) = thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr - 4;
(* bin_len) = record_len + 4;
- if(0){
- unsigned int Treclen=0, Tseqlen = 0;
- memcpy(&Treclen, *bin_where, 4);
- memcpy(&Tseqlen, (*bin_where)+20, 4);
- assert(record_len == Treclen);
- int is_bin_start = is_read_bin((char *)*bin_where, thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_used +4 , pairer -> BAM_n_ref);
- SUBREADprintf("TREC TH_%d: reclen=%u, seqlen=%u, IS_BIN=%d\n", thread_context -> thread_id, Treclen, Tseqlen, is_bin_start);
- assert(is_bin_start == 1);
- }
-
thread_context -> input_buff_BIN_ptr += record_len;
}
return 1;
@@ -4473,7 +4488,7 @@ int SAM_pairer_find_start(SAM_pairer_context_t * pairer , SAM_pairer_thread_t *
}
}
thread_context -> input_buff_BIN_ptr = start_pos;
- //SUBREADprintf("ABBO TH %d : FOUND START AT %d in %d\n", thread_context -> thread_id , start_pos, thread_context -> input_buff_BIN_used);
+// SUBREADprintf("ABBO TH %d : FOUND START AT %d in %d\n", thread_context -> thread_id , start_pos, thread_context -> input_buff_BIN_used);
return start_pos < min(MAX_BIN_RECORD_LENGTH, thread_context -> input_buff_BIN_used);
}
@@ -4700,70 +4715,81 @@ int fix_load_next_block(FILE * in, char * binbuf, z_stream * strm){
}
int fix_write_block(FILE * out, char * bin, int binlen, z_stream * strm){
- char * bam_buf = malloc(70000);
- int x1, bam_len = 0;
-
- if(binlen > 0){
- strm -> avail_in = binlen;
- strm -> next_in = (unsigned char*)bin;
- strm -> avail_out = 70000;
- strm -> next_out = (unsigned char*)bam_buf;
- deflate(strm , Z_FINISH);
- bam_len = 70000 - strm -> avail_out;
- deflateReset(strm);
- }else{
- z_stream nstrm;
- nstrm.zalloc = Z_NULL;
- nstrm.zfree = Z_NULL;
- nstrm.opaque = Z_NULL;
- nstrm.avail_in = 0;
- nstrm.next_in = Z_NULL;
-
- deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL, Z_DEFLATED,
- PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ int is_end_mode = binlen == 0, written=0;
+ //SUBREADprintf("FIX_WRTR : %d\n", binlen);
- nstrm.avail_in = 0;
- nstrm.next_in = (unsigned char*)bin;
- nstrm.avail_out = 70000;
- nstrm.next_out = (unsigned char*)bam_buf;
- deflate(&nstrm, Z_FINISH);
- bam_len = 70000 - nstrm.avail_out;
- deflateEnd(&nstrm);
- }
+ while(1){
+ if(binlen - written<1 && !is_end_mode) return 0;
- //SUBREADprintf("FIX_COMPR: %d -> %d RET=%d\n", binlen , bam_len, retbam);
-
- unsigned int crc0 = crc32(0, NULL, 0);
- unsigned int crc = crc32(crc0, (unsigned char *) bin , binlen);
-
- fputc(31, out);
- fputc(139, out);
- fputc(8, out);
- fputc(4, out);
- fputc(0, out);
- fputc(0, out);
- fputc(0, out);
- fputc(0, out);
-
- fputc(0, out);//XFL
- fputc(0xff, out);//OS
-
- x1 = 6;
- fwrite( &x1, 2, 1 , out );
- fputc( 66, out );
- fputc( 67, out );
- x1 = 2;
- fwrite( &x1, 2, 1 , out );
- x1 = bam_len + 19 + 6;
- fwrite( &x1, 2, 1 , out );
- int write_len = fwrite( bam_buf , 1,bam_len, out );
+ char * bam_buf = malloc(70000);
+ int x1, bam_len = 0, old_in= 0, this_sec_len = 0, old_start = written;
- fwrite( &crc, 4, 1, out );
- fwrite( &binlen, 4, 1, out );
-
- free(bam_buf);
-
- if(write_len < bam_len)return 1;
+ if(binlen - written > 0){
+ old_in = strm -> avail_in = binlen - written;
+ strm -> next_in = (unsigned char*)bin + written;
+ strm -> avail_out = 70000;
+ strm -> next_out = (unsigned char*)bam_buf;
+ deflate(strm , Z_FINISH);
+ bam_len = 70000 - strm -> avail_out;
+ this_sec_len = old_in - strm -> avail_in;
+ written += this_sec_len;
+
+ deflateReset(strm);
+ }else{
+ z_stream nstrm;
+ nstrm.zalloc = Z_NULL;
+ nstrm.zfree = Z_NULL;
+ nstrm.opaque = Z_NULL;
+ nstrm.avail_in = 0;
+ nstrm.next_in = Z_NULL;
+
+ deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL, Z_DEFLATED,
+ PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+
+ nstrm.avail_in = 0;
+ nstrm.next_in = (unsigned char*)bin;
+ nstrm.avail_out = 70000;
+ nstrm.next_out = (unsigned char*)bam_buf;
+ deflate(&nstrm, Z_FINISH);
+ bam_len = 70000 - nstrm.avail_out;
+ deflateEnd(&nstrm);
+ }
+
+ //SUBREADprintf("FIX_COMPR: %d -> %d RET=%d\n", binlen , bam_len, retbam);
+
+ unsigned int crc0 = crc32(0, NULL, 0);
+ unsigned int crc = crc32(crc0, (unsigned char *) bin + old_start, this_sec_len);
+
+ fputc(31, out);
+ fputc(139, out);
+ fputc(8, out);
+ fputc(4, out);
+ fputc(0, out);
+ fputc(0, out);
+ fputc(0, out);
+ fputc(0, out);
+
+ fputc(0, out);//XFL
+ fputc(0xff, out);//OS
+
+ x1 = 6;
+ fwrite( &x1, 2, 1 , out );
+ fputc( 66, out );
+ fputc( 67, out );
+ x1 = 2;
+ fwrite( &x1, 2, 1 , out );
+ x1 = bam_len + 19 + 6;
+ fwrite( &x1, 2, 1 , out );
+ int write_len = fwrite( bam_buf , 1,bam_len, out );
+
+ fwrite( &crc, 4, 1, out );
+ fwrite( &binlen, 4, 1, out );
+
+ free(bam_buf);
+
+ if(write_len < bam_len)return 1;
+ if(binlen<1) return 0;
+ }
return 0;
}
@@ -4775,7 +4801,7 @@ int fix_write_block(FILE * out, char * bin, int binlen, z_stream * strm){
#define FIX_FLASH_OUT { if(out_bin_ptr > 0)disk_is_full |= fix_write_block(new_fp, out_bin, out_bin_ptr, &out_strm); out_bin_ptr = 0; }
-#define FIX_APPEND_OUT(p, c) { if(out_bin_ptr > 60000){FIX_FLASH_OUT} ; memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
+#define FIX_APPEND_OUT(p, c) { if(out_bin_ptr > 60002){FIX_FLASH_OUT} ; memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
#define FIX_APPEND_READ(p, c){ memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
@@ -4786,8 +4812,8 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
sprintf(tmpfname, "%s.fixbam", pairer -> tmp_file_prefix);
FILE * new_fp = f_subr_open(tmpfname, "wb");
- char * in_bin = malloc(140000);
- char * out_bin = malloc(70000);
+ char * in_bin = malloc(1024*70);
+ char * out_bin = malloc(20*1024*1024);
z_stream in_strm;
z_stream out_strm;
@@ -4849,7 +4875,7 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
content_size += (nch << (8 * x1));
}
FIX_APPEND_OUT(&content_size, 4);
- //SUBREADprintf("FIX: CHROLEN=%d\n", content_size);
+ //SUBREADprintf("LONGFIX: CHROLEN=%d\n", content_size);
for(content_count = 0; content_count < content_size; content_count++){
int namelen = 0;
for(x1 = 0; x1 < 4; x1++){
@@ -4864,7 +4890,7 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
FIX_APPEND_READ(&nch, 1);
}
- if(out_bin_ptr > 60000){
+ if(out_bin_ptr > 60003){
FIX_FLASH_OUT;
}
}
@@ -4891,9 +4917,6 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
block_size += (nch << (8 * x1));
}
- if(block_size + out_bin_ptr > 60000 && !pairer -> tiny_mode)
- FIX_FLASH_OUT;
-
FIX_APPEND_READ(&block_size, 4);
if(pairer -> tiny_mode){
@@ -4927,7 +4950,9 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
}
break;
}
- if( x1 == 32 && block_size > 60000 ){
+
+ // #warning "================ THIS BLOCK WAS DISABLED ON 03OCT2019; MAKE SURE IT WORKS ON LONG READS/LONG READ RECORDS =============="
+ if(0 && x1 == 32 && block_size > 60000 ){
print_in_box(80,0,0,"");
print_in_box(80,0,0," ERROR: Alignment record is too long.");
print_in_box(80,0,0," Please use the long read mode.");
@@ -5054,10 +5079,9 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
}
}
- //#warning "========= COMMENT NEXT ============="
- //SUBREADprintf("OUTBIN_PTR=%d\n", out_bin_ptr);
reads ++;
if(out_bin_ptr > 60000){
+ // SUBREADprintf("WRIR3: TINY=%d\n", pairer -> tiny_mode);
FIX_FLASH_OUT;
}
}
=====================================
src/input-files.h
=====================================
@@ -102,6 +102,7 @@ typedef struct {
unsigned char * input_buff_BIN;
int input_buff_BIN_used;
int input_buff_BIN_ptr;
+ int input_buff_BIN_capacity;
int orphant_block_no;
int need_find_start;
srInt_64 orphant_space;
=====================================
src/makefile.version
=====================================
@@ -1,4 +1,4 @@
-SUBREAD_VERSION_BASE=2.0.0
+SUBREAD_VERSION_BASE=2.0.1
SUBREAD_VERSION_DATE=$(SUBREAD_VERSION_BASE)-$(shell date +"%d%b%Y")
SUBREAD_VERSION="$(SUBREAD_VERSION_DATE)"
SUBREAD_VERSION="$(SUBREAD_VERSION_BASE)"
=====================================
src/readSummary.c
=====================================
@@ -64,7 +64,7 @@
#define FC_FLIST_SPLITOR "\026"
typedef struct{
- char gene_name[FEATURE_NAME_LENGTH];
+ char * gene_name;
unsigned int pos_first_base;
unsigned int pos_last_base;
} fc_junction_gene_t;
@@ -97,7 +97,7 @@ typedef struct {
} fc_junction_info_t;
typedef struct {
- unsigned int feature_name_pos;
+ srInt_64 feature_name_pos;
unsigned int start;
unsigned int end;
unsigned int sorted_order;
@@ -280,8 +280,8 @@ typedef struct {
char * debug_command;
char * unistr_buffer_space;
srInt_64 max_BAM_header_size;
- unsigned int unistr_buffer_size;
- unsigned int unistr_buffer_used;
+ srInt_64 unistr_buffer_size;
+ srInt_64 unistr_buffer_used;
HashTable * scRNA_sample_sheet_table;
ArrayList * scRNA_sample_barcode_list;
ArrayList * scRNA_cell_barcodes_array;
@@ -323,7 +323,7 @@ typedef struct {
char ** exontable_chr;
srInt_64 * exontable_start;
srInt_64 * exontable_stop;
- char feature_name_column[100];
+ char feature_name_column[2000];
char gene_id_column[100];
srInt_64 * exontable_block_end_index;
@@ -581,19 +581,19 @@ int calc_junctions_from_cigar(fc_thread_global_context_t * global_context, int f
}
-unsigned int unistr_cpy(fc_thread_global_context_t * global_context, char * str, int strl)
+srInt_64 unistr_cpy(fc_thread_global_context_t * global_context, char * str, int strl)
{
- unsigned int ret;
+ srInt_64 ret;
if(global_context->unistr_buffer_used + strl >= global_context->unistr_buffer_size-1)
{
- if( global_context->unistr_buffer_size < (1000u*1000u*1000u*2)) // 2GB
+ if( global_context->unistr_buffer_size < (1000llu*1000u*1000u*32)) // 32GB
{
global_context -> unistr_buffer_size = global_context->unistr_buffer_size /2 *3;
global_context -> unistr_buffer_space = realloc(global_context -> unistr_buffer_space, global_context->unistr_buffer_size);
}
else
{
- SUBREADprintf("Error: exceed memory limit (4GB) for storing annotation data.\n");
+ SUBREADprintf("Error: exceed memory limit (32GB) for storing feature names.\n");
return 0xffffffffu;
}
}
@@ -823,12 +823,18 @@ int fc_strcmp(const void * s1, const void * s2)
return strcmp((char*)s1, (char*)s2);
}
+void junc_gene_free(void *vv){
+ fc_junction_gene_t *v = vv;
+ free(v -> gene_name);
+ free(v);
+}
+
void register_junc_feature(fc_thread_global_context_t *global_context, char * feature_name, char * chro, unsigned int start, unsigned int stop){
HashTable * gene_table = HashTableGet(global_context -> junction_features_table, chro);
//SUBREADprintf("REG %s : %p\n", chro, gene_table);
if(NULL == gene_table){
gene_table = HashTableCreate(48367);
- HashTableSetDeallocationFunctions(gene_table, NULL, free);
+ HashTableSetDeallocationFunctions(gene_table, NULL, junc_gene_free);
HashTableSetKeyComparisonFunction(gene_table, fc_strcmp);
HashTableSetHashFunction(gene_table, fc_chro_hash);
@@ -839,7 +845,7 @@ void register_junc_feature(fc_thread_global_context_t *global_context, char * fe
fc_junction_gene_t * gene_info = HashTableGet(gene_table, feature_name);
if(NULL == gene_info){
gene_info = malloc(sizeof(fc_junction_gene_t));
- strcpy(gene_info -> gene_name, feature_name);
+ gene_info -> gene_name = strdup(feature_name);
gene_info -> pos_first_base = start;
gene_info -> pos_last_base = stop;
@@ -856,6 +862,18 @@ void free_bucket_table_list(void * pv){
free(list);
}
+int match_feature_name_column(char * infile, char * needed){
+ char * ptt = NULL;
+ char lneeded[strlen(needed)+1];
+ strcpy(lneeded, needed);
+ char * t1 = strtok_r(lneeded, ",", &ptt);
+ while(t1){
+ if(strcmp(t1, infile)==0) return 1;
+ t1 = strtok_r(NULL,",", &ptt);
+ }
+ return 0;
+}
+
#define JUNCTION_BUCKET_STEP (128*1024)
int locate_junc_features(fc_thread_global_context_t *global_context, char * chro, unsigned int pos, fc_junction_gene_t ** ret_info, int max_ret_info_size){
@@ -906,10 +924,11 @@ int locate_junc_features(fc_thread_global_context_t *global_context, char * chro
// Memory will be allowcated in this function. The pointer is saved in *loaded_features.
// The invoker must release the memory itself.
+#define MAX_ANNOT_LINE_LENGTH 200000
int load_feature_info(fc_thread_global_context_t *global_context, const char * annotation_file, int file_type, fc_feature_info_t ** loaded_features)
{
unsigned int features = 0, xk1 = 0, lineno=0;
- char * file_line = malloc(MAX_LINE_LENGTH+1);
+ char * file_line = malloc(MAX_ANNOT_LINE_LENGTH+1);
autozip_fp anno_fp;
int apret = autozip_open(annotation_file, &anno_fp);
int is_GFF_warned = 0;
@@ -937,8 +956,8 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
// also create chro_name_table : chro_name => fc_chromosome_index_info
while(0)
{
- //char * fgets_ret = fgets(file_line, MAX_LINE_LENGTH, fp);
- int rchars = autozip_gets(&anno_fp, file_line, MAX_LINE_LENGTH);
+ //char * fgets_ret = fgets(file_line, MAX_ANNOT_LINE_LENGTH, fp);
+ int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
char * token_temp = NULL, *chro_name;
fc_chromosome_index_info * chro_stab;
unsigned int feature_pos = 0;
@@ -951,7 +970,7 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
chro_name = strtok_r(file_line,"\t",&token_temp);
strtok_r(NULL,"\t", &token_temp); // lib_name (not needed)
char * feature_type = strtok_r(NULL,"\t", &token_temp);
- if(strcmp(feature_type, global_context -> feature_name_column)==0)
+ if(match_feature_name_column(feature_type, global_context -> feature_name_column))
{
strtok_r(NULL,"\t", &token_temp); // feature_start
feature_pos = atoi(strtok_r(NULL,"\t", &token_temp));// feature_end
@@ -1004,8 +1023,12 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
while(1)
{
int is_gene_id_found = 0;
- int rchars = autozip_gets(&anno_fp, file_line, MAX_LINE_LENGTH);
+ int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
if(rchars < 1) break;
+ if(rchars >= MAX_ANNOT_LINE_LENGTH - 1){
+ SUBREADprintf("\nERROR: the %u-th line in your GTF file is extremely long (longer than %d bytes).\nThe program cannot parse this line.\n", lineno+1, MAX_ANNOT_LINE_LENGTH-1);
+ return -2;
+ }
lineno++;
char * token_temp = NULL;
@@ -1018,9 +1041,12 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
}
char * feature_name = strtok_r(file_line,"\t",&token_temp);
int feature_name_len = strlen(feature_name);
- if(feature_name_len > FEATURE_NAME_LENGTH) feature_name[FEATURE_NAME_LENGTH -1 ] = 0;
+ if(feature_name_len > FEATURE_NAME_LENGTH-2){
+ SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH -2);
+ feature_name[FEATURE_NAME_LENGTH -2 ] = 0;
+ }
- unsigned int genename_pos = unistr_cpy(global_context, (char *)feature_name, feature_name_len);
+ srInt_64 genename_pos = unistr_cpy(global_context, (char *)feature_name, feature_name_len);
// SUBREADprintf("REALL: '%s'=%d [%d] %p POS=%d\t\tOLD_NAME_POS=%d\n" , feature_name, feature_name_len , xk1, ret_features+xk1, genename_pos, xk1>0?ret_features[xk1-1].feature_name_pos:-1);
ret_features[xk1].feature_name_pos = genename_pos;
@@ -1028,7 +1054,7 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
char * seq_name = strtok_r(NULL,"\t", &token_temp);
int chro_name_len = strlen(seq_name);
if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
- unsigned int chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
+ srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
@@ -1125,7 +1151,7 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
char * seq_name = strtok_r(file_line,"\t",&token_temp);
strtok_r(NULL,"\t", &token_temp);// source
char * feature_type = strtok_r(NULL,"\t", &token_temp);// feature_type
- if(strcmp(feature_type, global_context -> feature_name_column)==0) {
+ if(match_feature_name_column(feature_type, global_context -> feature_name_column)) {
if(xk1 >= ret_features_size) {
ret_features_size *=2;
@@ -1221,12 +1247,15 @@ int load_feature_info(fc_thread_global_context_t *global_context, const char * a
}
int feature_name_len = strlen(feature_name_tmp);
- if(feature_name_len > FEATURE_NAME_LENGTH) feature_name_tmp[FEATURE_NAME_LENGTH -1 ] = 0;
+ if(feature_name_len > FEATURE_NAME_LENGTH-2){
+ SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH-2);
+ feature_name_tmp[FEATURE_NAME_LENGTH -2 ] = 0;
+ }
ret_features[xk1].feature_name_pos = unistr_cpy(global_context, (char *)feature_name_tmp, feature_name_len);
int chro_name_len = strlen(seq_name);
if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
- unsigned int chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
+ srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
ret_features[xk1].chro_name_pos_delta = chro_name_pos - ret_features[xk1].feature_name_pos;
@@ -3991,7 +4020,7 @@ void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_t
add_scRNA_read_to_pool(global_context, thread_context, assignment_target_number, read_name);
}
}else if(global_context -> is_multi_overlap_allowed) {
- #define GENE_NAME_LIST_BUFFER_SIZE 5000
+ #define GENE_NAME_LIST_BUFFER_SIZE (FEATURE_NAME_LENGTH * 50)
char final_feture_names[GENE_NAME_LIST_BUFFER_SIZE];
int assigned_no = 0, xk1;
@@ -4118,7 +4147,7 @@ void scRNA_merge_thread_umitables(void *ky, void *val, HashTable * tab){
}
#define MIN_EXPRESSED_UMIS_PER_CELL 100
-#define MIN_EXPRESSED_UMIS_PER_GENE 3
+#define MIN_EXPRESSED_UMIS_PER_GENE (3-2)
void scRNA_merge_write_copy_gene_nos(void * ky, void * va , HashTable *tab){
HashTable * used_gene_table = tab -> appendix2;
@@ -4127,10 +4156,10 @@ void scRNA_merge_write_copy_gene_nos(void * ky, void * va , HashTable *tab){
srInt_64 UMIs = HashTableGet(used_gene_table, ky)-NULL;
HashTablePut(used_gene_table, ky, NULL + UMIs + one_sampl_gene_to_cell_umis -> numOfElements);
}
-int scRNA_merge_write_zero_gene(fc_thread_global_context_t * global_context, char * linebuf, ArrayList * used_cell_barcode_list){
+int scRNA_merge_write_zero_gene(fc_thread_global_context_t * global_context, char * linebuf, ArrayList * high_confid_barcode_index_list){
int ret=0;
srInt_64 x1;
- for(x1=0;x1<used_cell_barcode_list->numOfElements;x1++)ret += sprintf(linebuf + ret,"\t0");
+ for(x1=0;x1<high_confid_barcode_index_list->numOfElements;x1++)ret += sprintf(linebuf + ret,"\t0");
return ret;
}
@@ -4259,7 +4288,7 @@ ArrayList * scRNA_reduce_cellno_umino_p1_list(fc_thread_global_context_t * globa
return ret;
}
-int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context, HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList ** used_cell_barcode_list , ArrayList * umi_merged_list, ArrayList * gen_no_list, srInt_64 used_gene_i, char * linebuf){
+int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context, HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList ** high_confid_barcode_index_list , ArrayList * umi_merged_list, ArrayList * gen_no_list, srInt_64 used_gene_i, char * linebuf){
srInt_64 gene_no = ArrayListGet(gen_no_list, used_gene_i) -NULL -1, x1;
srInt_64 line_number = global_context -> is_gene_level? gene_no : HashTableGet(global_context -> lineno_2_sortedno_tab, NULL+1+gene_no) -NULL -1; // convert line_no to sorted_order
int linebuf_ptr = 0;
@@ -4278,12 +4307,12 @@ int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context, Hash
for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
ArrayList* cellno_umino_p1_list = HashTableGet(merged_tables_gene_to_cell_umis[x1], NULL + gene_no + 1);
if(NULL == cellno_umino_p1_list){
- linebuf_ptr += scRNA_merge_write_zero_gene(global_context, linebuf+ linebuf_ptr, used_cell_barcode_list[x1]);
+ linebuf_ptr += scRNA_merge_write_zero_gene(global_context, linebuf+ linebuf_ptr, high_confid_barcode_index_list[x1]);
}else{
int tab_cell_ptr=0, used_cell_bc_ptr ;
- for(used_cell_bc_ptr = 0; used_cell_bc_ptr < used_cell_barcode_list[x1] -> numOfElements; used_cell_bc_ptr++){
- srInt_64 used_cell_no = ArrayListGet(used_cell_barcode_list[x1], used_cell_bc_ptr) - NULL -1;
+ for(used_cell_bc_ptr = 0; used_cell_bc_ptr < high_confid_barcode_index_list[x1] -> numOfElements; used_cell_bc_ptr++){
+ srInt_64 used_cell_no = ArrayListGet(high_confid_barcode_index_list[x1], used_cell_bc_ptr) - NULL -1;
int write_cnt=0;
if(tab_cell_ptr<cellno_umino_p1_list ->numOfElements)while(1){
@@ -4308,73 +4337,282 @@ int scRNA_merged_write_a_gene(fc_thread_global_context_t * global_context, Hash
return total_count_in_row;
}
-// this function writes a single count table.
-// Rows: genes
-// Cols: Cell_Barcode +"."+ SampleName
-void scRNA_merged_to_tables_write( fc_thread_global_context_t * global_context, HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList * merged_umi_list){
- char ofname[MAX_FILE_NAME_LENGTH + 20];
- sprintf(ofname,"%s.scRNA.table",global_context->input_file_name);
- FILE * ofp = fopen( ofname , "w" );
- int x1, cell_items_row = 0;
- srInt_64 xkk;
- ArrayList ** used_cell_barcode_list = malloc(sizeof(void*)* global_context -> scRNA_sample_sheet_table -> numOfElements );
+#define SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION 0.01
+void scRNA_merged_ambient_rescure(fc_thread_global_context_t * global_context, HashTable * merged_gene_to_cell_umis_tab, HashTable * used_cell_barcode_tab, ArrayList * this_sample_ambient_rescure_candi, ArrayList * this_sample_45k_90k_barcode_idx, ArrayList * highconf_list){
+ ArrayList * sorted_idx = HashTableSortedIndexes( used_cell_barcode_tab, 1);
+ HashTable * highconf_list_tab = ArrayListToLookupTable_Int(highconf_list);
+ srInt_64 x1;
+ for(x1=0; x1 < sorted_idx -> numOfElements; x1++){
+ void * this_bc_pnt = ArrayListGet(sorted_idx , x1);
+ if(!HashTableGet(highconf_list_tab, this_bc_pnt)) break; // assuming that all high-umi barcodes are high-confident, this makes x1 being the # of total high-confidence barcodes.
+ }
+ if(x1 >0){
+ srInt_64 median_umis = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx , (x1-1)/2))-NULL;
+ srInt_64 median_umis_001_cut = (srInt_64)(median_umis *1. *SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION);
+ //SUBREADprintf("MEDIANTEST : X1 = %lld, MID = %lld, MID_CUT = %lld\n", x1, median_umis, median_umis_001_cut);
+ for(x1=0; x1 < sorted_idx -> numOfElements; x1++){
+ void * this_bc_pnt = ArrayListGet(sorted_idx , x1);
+ if(HashTableGet(highconf_list_tab, this_bc_pnt)) continue; // it is in high-conf list
+ srInt_64 this_bc_umis = HashTableGet(used_cell_barcode_tab, this_bc_pnt) - NULL;
+ if(this_bc_umis < median_umis_001_cut) break;
+ ArrayListPush(this_sample_ambient_rescure_candi, this_bc_pnt-1);
+ }
+ }
+ for(x1=45000; x1 < sorted_idx -> numOfElements; x1++){
+ if(x1 >= 90000) break;
+ ArrayListPush(this_sample_45k_90k_barcode_idx, ArrayListGet(sorted_idx , x1) );
+ }
+ ArrayListDestroy(sorted_idx);
+ //SUBREADprintf("AMBIENT_CANDIDATES = %lld 45K-90K = %lld\n", this_sample_ambient_rescure_candi -> numOfElements, this_sample_45k_90k_barcode_idx-> numOfElements);
+ HashTableDestroy(highconf_list_tab);
+}
- fprintf(ofp, global_context -> is_gene_level?"Geneid":"FeatureNo");
- for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
-// SUBREADprintf("CELL BARCODE USED TAB = %ld ; GENE_TO_CELL&UMI TAB = %ld ; for %d sample\n", used_cell_barcode_tabs[x1]-> numOfElements, merged_tables_gene_to_cell_umis[x1]-> numOfElements, x1);
- ArrayList * raw_used_cells = HashTableKeys(used_cell_barcode_tabs[x1]);
- ArrayListSort(raw_used_cells, NULL);
- used_cell_barcode_list[x1] = ArrayListCreate(raw_used_cells -> numOfElements);
- for(xkk = 0; xkk < raw_used_cells -> numOfElements; xkk++){
- srInt_64 cell_no = ArrayListGet(raw_used_cells, xkk) - NULL - 1;
- srInt_64 umis_of_cell = HashTableGet(used_cell_barcode_tabs[x1], NULL + 1 + cell_no)-NULL;
- if(umis_of_cell >= MIN_EXPRESSED_UMIS_PER_CELL) ArrayListPush( used_cell_barcode_list[x1], NULL + 1 + cell_no );
+
+#define SCRNA_BOOTSTRAP_HIGH_INDEX 30
+#define SCRNA_BOOTSTRAP_SAMPLING_TIMES 100
+
+void scRNA_merged_bootstrap_a_sample(fc_thread_global_context_t * global_context, HashTable * merged_gene_to_cell_umis_tab, HashTable * used_cell_barcode_tab, ArrayList * merged_umi_list, ArrayList * highconf_list){
+ ArrayList * sorted_idx = HashTableSortedIndexes( used_cell_barcode_tab, 1);
+ srInt_64 UMIs_30th_high = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx , SCRNA_BOOTSTRAP_HIGH_INDEX -1 ))-NULL;
+
+ if(0){
+ SUBREADprintf("HIGHEST 30 TH UMIs = %lld\n", UMIs_30th_high);
+ UMIs_30th_high = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx , SCRNA_BOOTSTRAP_HIGH_INDEX -0 ))-NULL;
+ SUBREADprintf("HIGHEST 31 TH UMIs = %lld\n", UMIs_30th_high);
+ UMIs_30th_high = HashTableGet(used_cell_barcode_tab, ArrayListGet(sorted_idx , SCRNA_BOOTSTRAP_HIGH_INDEX -2 ))-NULL;
+ SUBREADprintf("HIGHEST 29 TH UMIs = %lld\n", UMIs_30th_high);
+ }
+
+ srInt_64 x2,x1;
+ srInt_64 this_total = 0;
+ for(x1 = 0; x1 < SCRNA_BOOTSTRAP_SAMPLING_TIMES; x1++){
+ for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
+ void * barcode_idx = ArrayListRandom(sorted_idx);
+ srInt_64 this_umis = HashTableGet( used_cell_barcode_tab, barcode_idx )-NULL;
+ if(this_umis >= UMIs_30th_high/10) this_total ++;
}
- ArrayListDestroy(raw_used_cells);
- char * sample_name = ArrayListGet(global_context -> scRNA_sample_id_to_name, x1);
+ }
+ this_total /= SCRNA_BOOTSTRAP_SAMPLING_TIMES;
+ if(0) SUBREADprintf("FINAL SELECTION_IDX = %lld\n",this_total);
+ for(x1 = 0; x1 < min(sorted_idx -> numOfElements, this_total) ; x1++) ArrayListPush(highconf_list, ArrayListGet( sorted_idx, x1 ) -1 );
+}
+
+void build_exon_name(fc_thread_global_context_t * global_context, fc_feature_info_t * loaded_features, int i, char * exon_name){
+ sprintf(exon_name, "%s:fc,spl:%s:fc,spl:%u:fc,spl:%u:fc,spl:%c", global_context -> unistr_buffer_space + loaded_features[i].feature_name_pos,
+ global_context-> unistr_buffer_space + loaded_features[i].feature_name_pos + loaded_features[i].chro_name_pos_delta,
+ loaded_features[i].start, loaded_features[i].end, loaded_features[i].is_negative_strand == 1?'N':( loaded_features[i].is_negative_strand == 0? 'P':'X'));
+}
- for(xkk = 0; xkk < used_cell_barcode_list[x1]->numOfElements; xkk++){
- srInt_64 cell_bcno = ArrayListGet(used_cell_barcode_list[x1], xkk)-NULL-1;
- char * cell_name = ArrayListGet( global_context -> scRNA_cell_barcodes_array, cell_bcno );
- fprintf(ofp, "\t%s.%s", cell_name, sample_name);
+int scRNA_merged_write_sparse_matrix(fc_thread_global_context_t * global_context, HashTable * merged_tab_gene_to_cell_umis, HashTable * cell_barcode_tab, ArrayList * used_cell_barcodes, int sample_index, char * tabtype, fc_feature_info_t* loaded_features){
+ int x1,x2;
+
+ char ofname[MAX_FILE_NAME_LENGTH + 100];
+ sprintf(ofname,"%s.scRNA.%03d.%s.BCtab",global_context->input_file_name, sample_index+1,tabtype);
+ FILE * ofp_bcs = fopen( ofname , "w" );
+ sprintf(ofname,"%s.scRNA.%03d.%s.GENEtab",global_context->input_file_name, sample_index+1,tabtype);
+ FILE * ofp_genes = fopen( ofname , "w" );
+ sprintf(ofname,"%s.scRNA.%03d.%s.spmtx",global_context->input_file_name, sample_index+1,tabtype);
+ FILE * ofp_mtx = fopen( ofname , "w" );
+ fprintf(ofp_mtx,"%%%%MatrixMarket matrix coordinate integer general\n");
+
+ HashTable * used_cell_barcodes_tab = ArrayListToLookupTable_Int( used_cell_barcodes );
+
+ ArrayList * output_gene_idxs = HashTableKeys(merged_tab_gene_to_cell_umis);
+ ArrayListSort(output_gene_idxs, NULL);
+ unsigned int nonzero_items = 0, nonozero_genes = 0;
+ HashTable * bc_no_to_output_no_tab = HashTableCreate( global_context -> scRNA_cell_barcodes_array->numOfElements/8 );
+ ArrayList * output_no_tab_to_bcno_arr = ArrayListCreate( global_context -> scRNA_cell_barcodes_array->numOfElements);
+ for(x1 = 0; x1 < output_gene_idxs -> numOfElements; x1++){
+ srInt_64 gene_index = ArrayListGet(output_gene_idxs, x1)-NULL-1;
+ ArrayList * bc_umi_p1_list = HashTableGet(merged_tab_gene_to_cell_umis, NULL+1+ gene_index );
+ assert(bc_umi_p1_list!=NULL);
+ assert(bc_umi_p1_list -> numOfElements >0);
+
+ int old_bc_no = -1, this_gene_has_bc = 0;
+ for(x2=0; x2 < bc_umi_p1_list -> numOfElements; x2++){
+ srInt_64 bc_umi = ArrayListGet(bc_umi_p1_list, x2) - NULL - 1;
+ int bc_no = (int)(bc_umi >>32);
+ void * this_bc_needed = HashTableGet(used_cell_barcodes_tab, NULL+1+bc_no);
+ if(!this_bc_needed) continue;
+ int out_bc_no = HashTableGet( bc_no_to_output_no_tab , NULL+1+bc_no ) -NULL;
+ if( out_bc_no <1 ){
+ out_bc_no = bc_no_to_output_no_tab -> numOfElements +1;
+ HashTablePut( bc_no_to_output_no_tab , NULL+1+bc_no, NULL + out_bc_no );
+ ArrayListPush(output_no_tab_to_bcno_arr, NULL+bc_no);
+ fprintf(ofp_bcs,"%s\n", (char*)ArrayListGet(global_context -> scRNA_cell_barcodes_array, bc_no));
+ }
+
+ this_gene_has_bc = 1;
+ if(old_bc_no != bc_no){
+ nonzero_items++;
+ old_bc_no = bc_no;
+ }
+ }
+ if(this_gene_has_bc){
+ nonozero_genes ++;
}
- cell_items_row += xkk;
}
- fprintf(ofp, "\n");
-
- HashTable * uniq_gene_nos_in_samples = HashTableCreate(10000);
- for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
- merged_tables_gene_to_cell_umis[x1]->appendix1 = global_context;
- merged_tables_gene_to_cell_umis[x1]->appendix2 = uniq_gene_nos_in_samples;
- HashTableIteration( merged_tables_gene_to_cell_umis[x1], scRNA_merge_write_copy_gene_nos);
+ fprintf(ofp_mtx, "%u %u %u\n", nonozero_genes, (unsigned int)bc_no_to_output_no_tab->numOfElements, nonzero_items );
+
+ nonozero_genes = 0;
+ for(x1 = 0; x1 < output_gene_idxs -> numOfElements; x1++){
+ srInt_64 gene_index = ArrayListGet(output_gene_idxs, x1)-NULL-1;
+ ArrayList * bc_umi_p1_list = HashTableGet(merged_tab_gene_to_cell_umis, NULL+1+ gene_index );
+ assert(bc_umi_p1_list!=NULL);
+
+ unsigned int mybc_UMIs = 0;
+ int old_bc_no = -1, old_out_bc_no = -1, this_gene_has_bc = 0;
+ for(x2=0; x2 < bc_umi_p1_list -> numOfElements; x2++){
+ srInt_64 bc_umi = ArrayListGet(bc_umi_p1_list, x2) - NULL - 1;
+ int bc_no = (int)(bc_umi >>32);
+ int out_bc_no = HashTableGet( bc_no_to_output_no_tab , NULL+1+bc_no ) -NULL;
+ void * this_bc_needed = HashTableGet(used_cell_barcodes_tab, NULL+1+bc_no);
+ if(!this_bc_needed) continue;
+
+ this_gene_has_bc = 1;
+ if(old_bc_no == bc_no){
+ mybc_UMIs++;
+ }else{
+ if( old_out_bc_no >=0 )fprintf(ofp_mtx,"%d %d %d\n", nonozero_genes+1, old_out_bc_no, mybc_UMIs);
+ old_bc_no = bc_no;
+ old_out_bc_no = out_bc_no;
+ mybc_UMIs = 1;
+ }
+ }
+ if(mybc_UMIs>0) fprintf(ofp_mtx,"%d %d %d\n", nonozero_genes+1, old_out_bc_no, mybc_UMIs);
+
+ if(this_gene_has_bc){
+ if(global_context->is_gene_level){
+ char* gene_name = (char*)global_context -> gene_name_array [gene_index];
+ fprintf(ofp_genes,"%s\n", gene_name);
+ }else{
+ char exon_name[FEATURE_NAME_LENGTH+60];
+ build_exon_name(global_context, loaded_features, gene_index, exon_name);
+ fprintf(ofp_genes,"%s\n", exon_name);
+ }
+ //if(x1 < 7) SUBREADprintf("GOT A GENE SMP_%d: [%d] %s\n", sample_index+1, x1, gene_name);
+ }
+ nonozero_genes += this_gene_has_bc;
}
+ HashTableDestroy(bc_no_to_output_no_tab);
+ HashTableDestroy(used_cell_barcodes_tab);
+ ArrayListDestroy(output_gene_idxs);
+ fclose(ofp_bcs);
+ fclose(ofp_genes);
+ fclose(ofp_mtx);
- ArrayList * gen_no_list_raw = HashTableKeys(uniq_gene_nos_in_samples);
- ArrayList * gen_no_list = ArrayListCreate(gen_no_list_raw -> numOfElements);
- for(x1 = 0; x1<gen_no_list_raw->numOfElements;x1++){
- srInt_64 gene_no = ArrayListGet(gen_no_list_raw, x1) - NULL - 1;
- srInt_64 UMIs = HashTableGet( uniq_gene_nos_in_samples, NULL+ gene_no+1 )-NULL;
- if(UMIs >= MIN_EXPRESSED_UMIS_PER_GENE) ArrayListPush( gen_no_list, NULL+1+gene_no );
+ return 0;
+}
+
+void scRNA_merged_45K_to_90K_sum_SUM(void * kyGeneID, void * Vcb_umi_arr, HashTable * me){
+ HashTable * gene_to_umis = me -> appendix1;
+ HashTable * bcid_look_tab = me -> appendix2;
+ ArrayList * cb_umi_arr = Vcb_umi_arr;
+
+ srInt_64 x1, this_gene_added = HashTableGet(gene_to_umis, kyGeneID)-NULL;
+ int has_adding = 0;
+ for(x1 = 0; x1< cb_umi_arr -> numOfElements; x1++){
+ srInt_64 cb_umi = ArrayListGet(cb_umi_arr,x1) - NULL-1;
+ srInt_64 cbno = cb_umi >>32;
+ if(!HashTableGet( bcid_look_tab, NULL+1+cbno ))continue;
+ this_gene_added ++;
+ has_adding = 1;
+ }
+ if(has_adding)HashTablePut(gene_to_umis, kyGeneID, NULL+this_gene_added);
+}
+
+void scRNA_merged_45K_to_90K_sum_WRT(void * kyGeneID, void * valUMIs, HashTable * me){
+ fc_thread_global_context_t * global_context = me -> appendix1;
+ FILE * ofp = me -> appendix2;
+ fc_feature_info_t * loaded_features = me->appendix3;
+
+ if(global_context -> is_gene_level){
+ unsigned char * gene_name = global_context -> gene_name_array[ kyGeneID - NULL-1 ];
+ fprintf(ofp, "%s\t%u\n", gene_name, (unsigned int) (valUMIs-NULL));
+ }else{
+ char exon_name[FEATURE_NAME_LENGTH+60];
+ build_exon_name(global_context, loaded_features, kyGeneID-NULL-1, exon_name);
+ fprintf(ofp,"%s\t%u\n", exon_name, (unsigned int) (valUMIs-NULL));
}
- ArrayListDestroy(gen_no_list_raw);
- ArrayListSort(gen_no_list, NULL);
+}
- char * linebuf = malloc(15*cell_items_row+MAX_GENE_NAME_LEN * 6);
- for(xkk=0; xkk < gen_no_list -> numOfElements; xkk ++){
- int should_write = scRNA_merged_write_a_gene( global_context, merged_tables_gene_to_cell_umis, used_cell_barcode_tabs, used_cell_barcode_list, merged_umi_list, gen_no_list, xkk, linebuf);
- if(should_write)
- fprintf(ofp,"%s\n", linebuf);
+void scRNA_merged_45K_to_90K_sum(fc_thread_global_context_t * global_context, HashTable * gene_to_cell_umis_tab, ArrayList * bcid_arr, int sample_no, fc_feature_info_t * loaded_features ){
+ HashTable * ret = HashTableCreate( gene_to_cell_umis_tab->numOfElements/6 );
+ HashTable * bcid_look_tab = ArrayListToLookupTable_Int(bcid_arr);
+ gene_to_cell_umis_tab -> appendix1 = ret;
+ gene_to_cell_umis_tab -> appendix2 = bcid_look_tab;
+ HashTableIteration( gene_to_cell_umis_tab, scRNA_merged_45K_to_90K_sum_SUM );
+
+ char ofname[MAX_FILE_NAME_LENGTH + 100];
+ sprintf(ofname,"%s.scRNA.%03d.AmbSum",global_context->input_file_name, sample_no+1);
+ FILE * write_fp = fopen(ofname,"w");
+ fprintf(write_fp,"GeneID\tUMIs\n");
+ ret -> appendix1 = global_context;
+ ret -> appendix2 = write_fp;
+ ret -> appendix3 = loaded_features;
+ ret -> counter1 = sample_no;
+ HashTableIteration( ret, scRNA_merged_45K_to_90K_sum_WRT );
+ HashTableDestroy(bcid_look_tab);
+ HashTableDestroy(ret);
+ fclose(write_fp);
+}
+void scRNA_merged_write_nozero_geneids_WRT(void *k, void *v, HashTable* me){
+ FILE * fp = me->appendix1;
+ fc_thread_global_context_t * global_context = me->appendix2;
+ fc_feature_info_t * loaded_features = me->appendix3;
+ if(global_context -> is_gene_level){
+ unsigned char* gene_symbol = global_context -> gene_name_array [k-NULL-1];
+ fprintf(fp, "%s\n", gene_symbol);
+ }else{
+ char exon_name[FEATURE_NAME_LENGTH+60];
+ build_exon_name(global_context, loaded_features, k-NULL-1, exon_name);
+ fprintf(fp,"%s\n", exon_name);
}
- free(linebuf);
+}
- for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++)
- ArrayListDestroy(used_cell_barcode_list[x1]);
- free(used_cell_barcode_list);
+void scRNA_merged_write_nozero_geneids( fc_thread_global_context_t * global_context, HashTable * gene_to_cell_umis_tab, int samplenno, fc_feature_info_t * loaded_features ){
+ char ofname[MAX_FILE_NAME_LENGTH + 100];
+ sprintf(ofname,"%s.scRNA.%03d.no0Genes",global_context->input_file_name, samplenno+1);
+ FILE * fp = fopen( ofname , "w" );
+ gene_to_cell_umis_tab -> appendix1 =fp;
+ gene_to_cell_umis_tab -> appendix2 =global_context;
+ gene_to_cell_umis_tab -> appendix3 =loaded_features;
+ HashTableIteration(gene_to_cell_umis_tab, scRNA_merged_write_nozero_geneids_WRT);
+ fclose(fp);
+}
- HashTableDestroy(uniq_gene_nos_in_samples);
- ArrayListDestroy(gen_no_list);
- fclose(ofp);
+// this function writes a single count table.
+// Rows: genes
+// Cols: Cell_Barcode +"."+ SampleName
+void scRNA_merged_to_tables_write( fc_thread_global_context_t * global_context, HashTable ** merged_tables_gene_to_cell_umis , HashTable ** used_cell_barcode_tabs , ArrayList * merged_umi_list, fc_feature_info_t * loaded_features){
+ char ofname[MAX_FILE_NAME_LENGTH + 20];
+ sprintf(ofname,"%s.scRNA.SampleTable",global_context->input_file_name);
+ FILE * sample_tab_fp = fopen( ofname , "w" );
+ int x1;
+
+ fprintf(sample_tab_fp,"SampleName\tIndex\n");
+ for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
+ char * this_sample_name = ArrayListGet(global_context -> scRNA_sample_id_to_name, x1);
+ fprintf(sample_tab_fp,"%s\t%d\n", this_sample_name, 1+x1);
+ ArrayList * high_confid_barcode_index_list = ArrayListCreate(20000);
+ ArrayList * this_sample_ambient_rescure_candi = ArrayListCreate(10000);
+ ArrayList * this_sample_45k_90k_barcode_idx = ArrayListCreate(90000 - 45000 + 100);
+ scRNA_merged_bootstrap_a_sample(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1],merged_umi_list, high_confid_barcode_index_list);
+ scRNA_merged_ambient_rescure(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], this_sample_ambient_rescure_candi, this_sample_45k_90k_barcode_idx, high_confid_barcode_index_list);
+
+ //SUBREADprintf("HAVING_HIGHCONF_TOTAL %lld\n", high_confid_barcode_index_list -> numOfElements);
+ scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], high_confid_barcode_index_list, x1, "HighConf", loaded_features);
+ scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], this_sample_ambient_rescure_candi, x1, "RescCand", loaded_features);
+ scRNA_merged_45K_to_90K_sum( global_context, merged_tables_gene_to_cell_umis[x1], this_sample_45k_90k_barcode_idx, x1 , loaded_features);
+ scRNA_merged_write_nozero_geneids( global_context, merged_tables_gene_to_cell_umis[x1], x1, loaded_features );
+
+ ArrayListDestroy(this_sample_ambient_rescure_candi);
+ ArrayListDestroy(this_sample_45k_90k_barcode_idx);
+ ArrayListDestroy(high_confid_barcode_index_list);
+ }
+
+ fclose(sample_tab_fp);
}
void scRNA_merge_merge_UMIs(void * gno_ky, void * cell_umi_2_reads_list_va , HashTable * tab){
@@ -4389,7 +4627,7 @@ void scRNA_merge_merge_UMIs(void * gno_ky, void * cell_umi_2_reads_list_va , Ha
// return the number of RG result sets
-int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_count_type_t * nreads , srInt_64 *nreads_mapped_to_exon, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * RGmerged_table)
+int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_count_type_t * nreads , srInt_64 *nreads_mapped_to_exon, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * RGmerged_table, fc_feature_info_t * loaded_features)
{
int xk1, xk2, ret = 0;
@@ -4442,7 +4680,7 @@ int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_co
HashTableIteration( merged_sample_cell_umi_tables[xk1], scRNA_merge_merge_UMIs);
}
- scRNA_merged_to_tables_write(global_context , merged_sample_cell_umi_tables , used_cell_no_tables, merged_umi_list);
+ scRNA_merged_to_tables_write(global_context , merged_sample_cell_umi_tables , used_cell_no_tables, merged_umi_list, loaded_features);
// SUBREADprintf("MERGED UMI TABLE = %ld items\n", merged_umi_table -> numOfElements);
ArrayListDestroy(merged_umi_list);
@@ -5629,9 +5867,11 @@ void print_usage()
SUBREADputs(" 'SAF'. 'GTF' by default. For SAF format, please refer to");
SUBREADputs(" Users Guide.");
SUBREADputs("");
- SUBREADputs(" -t <string> Specify feature type in GTF annotation. 'exon' by ");
- SUBREADputs(" default. Features used for read counting will be ");
- SUBREADputs(" extracted from annotation using the provided value.");
+ SUBREADputs(" -t <string> Specify feature type(s) in a GTF annotation. If multiple");
+ SUBREADputs(" types are provided, they should be separated by ',' with");
+ SUBREADputs(" no space in between. 'exon' by default. Rows in the");
+ SUBREADputs(" annotation with a matched feature will be extracted and");
+ SUBREADputs(" used for read mapping. ");
SUBREADputs("");
SUBREADputs(" -g <string> Specify attribute type in GTF annotation. 'gene_id' by ");
SUBREADputs(" default. Meta-features used for read counting will be ");
@@ -6206,7 +6446,7 @@ HashTable * scRNA_copy_loaded_features(srInt_64 nexons, fc_feature_info_t* loade
return ret;
}
-int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons, int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junc_glob_tab, HashTable * splicing_glob_tab, HashTable * merged_RG_table);
+int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons, int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junc_glob_tab, HashTable * splicing_glob_tab, HashTable * merged_RG_table, fc_feature_info_t * loaded_features);
int Input_Files_And_Strand_Mode_Pair(char * fnames, char * smodes){
int ret = 0, ch, bad_fmt = 0, numbs = 0;
@@ -6737,7 +6977,7 @@ int readSummary(int argc,char *argv[]){
global_context.is_read_details_out = isReadSummaryReport;
global_context.max_M = max_M;
- ret_int = ret_int || readSummary_single_file(& global_context, column_numbers, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start, block_max_end, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table);
+ ret_int = ret_int || readSummary_single_file(& global_context, column_numbers, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start, block_max_end, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features);
if(global_context.disk_is_full){
SUBREADprintf("ERROR: disk is full. Please check the free space in the output directory.\n");
}
@@ -6806,6 +7046,7 @@ int readSummary(int argc,char *argv[]){
if(global_context.is_input_bad_format){
SUBREADprintf("\nFATAL Error: The program has to terminate and no counting file is generated.\n\n");
}else if(!global_context.disk_is_full){
+ print_in_box(80,0,0,"Write the final count table.");
if(isGeneLevel){
char ** sorted_extra_columns = NULL;
if(global_context.reported_extra_columns != NULL){
@@ -6823,11 +7064,15 @@ int readSummary(int argc,char *argv[]){
} else
fc_write_final_results(&global_context, argv[3], nexons, table_columns, table_column_names, loaded_features, isCVersion);
}
- if(global_context.do_junction_counting && !global_context.disk_is_full)
+ if(global_context.do_junction_counting && global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
+ print_in_box(80,0,0,"Write the junction count table.");
fc_write_final_junctions(&global_context, argv[3], table_column_names, junction_global_table_list, splicing_global_table_list);
+ }
- if(!global_context.disk_is_full)
+ if(global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
+ print_in_box(80,0,0,"Write the read assignment summary.");
fc_write_final_counts(&global_context, argv[3], table_column_names, read_counters, isCVersion);
+ }
ArrayListDestroy(table_columns);
ArrayListDestroy(table_column_names);
@@ -6962,7 +7207,7 @@ void sort_bucket_table(fc_thread_global_context_t * global_context){
-int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons, int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * merged_RG_table)
+int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons, int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * merged_RG_table, fc_feature_info_t * loaded_features)
{
int read_length = 0;
int is_first_read_PE=0;
@@ -7000,7 +7245,7 @@ int readSummary_single_file(fc_thread_global_context_t * global_context, read_co
fc_thread_wait_threads(global_context);
srInt_64 nreads_mapped_to_exon = 0;
- fc_thread_merge_results(global_context, column_numbers , &nreads_mapped_to_exon, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table);
+ fc_thread_merge_results(global_context, column_numbers , &nreads_mapped_to_exon, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features);
fc_thread_destroy_thread_context(global_context);
if(global_context -> sambam_chro_table) free(global_context -> sambam_chro_table);
@@ -7031,7 +7276,7 @@ int feature_count_main(int argc, char ** argv)
int cmd_rebuilt_size = 2000;
char * cmd_rebuilt = malloc(cmd_rebuilt_size);
char max_M_str[8];
- char nameFeatureTypeColumn[66];
+ char nameFeatureTypeColumn[2000];
char nameGeneIDColumn[66];
int min_qual_score = 0;
int min_dist = 50;
=====================================
src/sambam-file.c
=====================================
@@ -557,7 +557,8 @@ int PBam_chunk_headers(char * chunk, int *chunk_ptr, int chunk_len, SamBam_Refer
}
SamBam_Reference_Info * new_event = (*bam_chro_table) + (* table_items);
- strncpy(new_event->chro_name, chro_name, BAM_MAX_CHROMOSOME_NAME_LEN);
+ if(strlen(chro_name)>=BAM_MAX_CHROMOSOME_NAME_LEN) chro_name[BAM_MAX_CHROMOSOME_NAME_LEN-1]=0;
+ strcpy(new_event->chro_name, chro_name);
new_event -> chro_length = chro_len;
(* table_items)++;
@@ -1782,7 +1783,7 @@ int SamBam_unzip(char * out , char * in , int inlen)
}
-int SamBam_writer_sort_buff_one_compare(void * Lbin, void * Rbin){
+int SamBam_writer_sort_buff_one_compare(void * Lbin, void * Rbin, ArrayList * me){
unsigned long long * Lv = Lbin, *Rv = Rbin;
if(*Lv > *Rv) return 1;
else if(*Lv < *Rv) return -1;
@@ -2052,7 +2053,7 @@ void SamBam_writer_one_thread_merge_sortedbins(SamBam_Writer * writer){
int level_min_binno[] = {0, 1, 9, 73, 585, 4681};
-int SamBam_writer_merge_chunks_compare(void * vL, void * vR){
+int SamBam_writer_merge_chunks_compare(void * vL, void * vR, ArrayList * me){
long long * lL = vL;
long long * lR = vR;
if( (*lL) > (*lR)) return 1;
=====================================
src/sorted-hashtable.c
=====================================
@@ -125,9 +125,9 @@ int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_ta
if (current_bucket->space_size<1)
{
if(is_small_table)
- new_bucket_length = (int)(GEHASH_BUCKET_LENGTH*1.5);
+ new_bucket_length = 5 ;
else
- new_bucket_length = (int)(GEHASH_BUCKET_LENGTH);
+ new_bucket_length = 2 ;
if(the_table->version_number == SUBINDEX_VER0)
new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
@@ -167,7 +167,7 @@ int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_ta
//new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
new_bucket_length = (int)(current_bucket->space_size*5);
else
- new_bucket_length = (int)(current_bucket->space_size*1.5);
+ new_bucket_length = max((int)(current_bucket->space_size*1.5), (int)(current_bucket->space_size+4));
if(the_table->version_number == SUBINDEX_VER0)
current_bucket->item_keys = (gehash_key_t *) realloc(current_bucket->item_keys , sizeof(gehash_key_t) * new_bucket_length);
@@ -188,200 +188,6 @@ int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_ta
return 0;
}
-int _gehash_resize_bucket_old(gehash_t * the_table , int bucket_no, char is_small_table)
-{
- int new_bucket_length;
- struct gehash_bucket * current_bucket = &(the_table -> buckets [bucket_no]);
- gehash_key_t * new_item_keys = NULL;
- short * new_new_item_keys = NULL;
- gehash_data_t * new_item_values;
-
- if (current_bucket->space_size<1)
- {
- int randv = myrand_rand();
- if(is_small_table)
- //new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
- new_bucket_length = (int)(GEHASH_BUCKET_LENGTH*(1.5+3.*pow(randv * 1./RAND_MAX,30)));
- else
- new_bucket_length = (int)(GEHASH_BUCKET_LENGTH*(1. +3.*pow(randv * 1./RAND_MAX,30)));
-
-
- // printf("NS1=%d\nNS2=%d\n", new_bucket_length, new_bucket_length);
- if(the_table->version_number == SUBINDEX_VER0)
- {
- new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
- }
- else
- {
- new_new_item_keys = (short *) malloc(sizeof(short)*new_bucket_length);
- }
-
- new_item_values = (gehash_data_t *) malloc(sizeof(gehash_data_t) * new_bucket_length);
-
- if(!((new_item_keys|| new_new_item_keys) && new_item_values))
- {
- SUBREADputs(MESSAGE_OUT_OF_MEMORY);
- return 1;
- }
-
- if(the_table->version_number == SUBINDEX_VER0)
- memset(new_item_keys, 0, sizeof(gehash_key_t) * new_bucket_length);
- else
- memset(new_new_item_keys, 0, sizeof(short) * new_bucket_length);
-
- memset(new_item_values, 0, sizeof(gehash_data_t) * new_bucket_length);
-
- if(the_table->version_number == SUBINDEX_VER0)
- current_bucket->item_keys = new_item_keys;
- else
- current_bucket->new_item_keys = new_new_item_keys;
-
- current_bucket->item_values = new_item_values;
- current_bucket->space_size = new_bucket_length;
-
- }
- else
- {
- int test_start = myrand_rand() % the_table-> buckets_number;
- int i;
- int need_allowc = 1;
- for (i =test_start; i<test_start+10000; i++)
- {
- if (i==bucket_no)continue;
- if (i>=the_table-> buckets_number)
- {
- test_start = myrand_rand() % the_table-> buckets_number;
- i = test_start;
- continue;
- }
-
- if(the_table-> buckets[i].space_size > current_bucket->current_items+1 && current_bucket->space_size > the_table-> buckets[i].current_items+1 && current_bucket->current_items > the_table-> buckets[i].current_items)
- {
- int j;
- gehash_data_t t_d;
-
- for (j=0; j<current_bucket->current_items; j++)
- {
- if (j< the_table -> buckets[i].current_items)
- {
- if(the_table->version_number == SUBINDEX_VER0)
- {
- gehash_key_t t_key;
- t_key = current_bucket->item_keys[j];
- current_bucket->item_keys[j] = the_table-> buckets[i].item_keys[j];
- the_table-> buckets[i].item_keys[j] = t_key;
- }
- else
- {
-
- short t_key;
- t_key = current_bucket->new_item_keys[j];
- current_bucket->new_item_keys[j] = the_table-> buckets[i].new_item_keys[j];
- the_table-> buckets[i].new_item_keys[j] = t_key;
- }
-
-
- t_d = current_bucket->item_values[j];
- current_bucket->item_values[j] = the_table-> buckets[i].item_values[j];
- the_table-> buckets[i].item_values[j] = t_d;
- }
- else
- {
- if(the_table->version_number == SUBINDEX_VER0)
- the_table-> buckets[i].item_keys[j] = current_bucket->item_keys[j];
- else
- the_table-> buckets[i].new_item_keys[j] = current_bucket->new_item_keys[j];
- the_table-> buckets[i].item_values[j] = current_bucket->item_values[j];
- }
- }
-
- gehash_key_t * p_key;
- short * ps_key;
- gehash_data_t * p_d;
- int i_size;
-
- if(the_table->version_number == SUBINDEX_VER0)
- {
- p_key = the_table-> buckets[i].item_keys ;
- the_table-> buckets[i].item_keys = current_bucket->item_keys;
- current_bucket->item_keys = p_key;
- }else{
- ps_key = the_table-> buckets[i].new_item_keys ;
- the_table-> buckets[i].new_item_keys = current_bucket->new_item_keys;
- current_bucket->new_item_keys = ps_key;
- }
-
- p_d = the_table-> buckets[i].item_values ;
- the_table-> buckets[i].item_values = current_bucket->item_values;
- current_bucket->item_values = p_d;
-
- i_size = the_table-> buckets[i].space_size;
- the_table-> buckets[i].space_size =current_bucket->space_size;
- current_bucket->space_size=i_size;
-
- need_allowc = 0;
- break;
- }
- }
- if(need_allowc)
- {
- if(is_small_table)
- //new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
- new_bucket_length = (int)(current_bucket->space_size*5);
- else
- new_bucket_length = (int)(current_bucket->space_size*1.5);
-
- // printf("NSS1=%d\nNSS2=%d\n", new_bucket_length, new_bucket_length);
- if(the_table->version_number == SUBINDEX_VER0)
- new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
- else
- new_new_item_keys = (short *) malloc(sizeof(short) * new_bucket_length);
-
- new_item_values = (gehash_data_t *) malloc(sizeof(gehash_data_t) * new_bucket_length);
-
-
- if(!((new_item_keys||new_new_item_keys) && new_item_values))
- {
- SUBREADputs(MESSAGE_OUT_OF_MEMORY);
- return 1;
- }
-
- if(the_table->version_number == SUBINDEX_VER0)
- memset(new_item_keys, 0, sizeof(gehash_key_t) * new_bucket_length);
- else
- memset(new_new_item_keys, 0, sizeof(short) * new_bucket_length);
-
- memset(new_item_values, 0, sizeof(gehash_data_t) * new_bucket_length);
-
- if(the_table->version_number == SUBINDEX_VER0)
- memcpy(new_item_keys, current_bucket->item_keys, current_bucket->current_items*sizeof(gehash_key_t));
- else
- memcpy(new_new_item_keys, current_bucket->item_keys, current_bucket->current_items*sizeof(short));
-
- memcpy(new_item_values, current_bucket->item_values, current_bucket->current_items*sizeof(gehash_data_t));
-
- if(the_table->version_number == SUBINDEX_VER0)
- free(current_bucket->item_keys);
- else
- free(current_bucket->new_item_keys);
-
- free(current_bucket->item_values);
-
- if(the_table->version_number == SUBINDEX_VER0)
- current_bucket->item_keys = new_item_keys;
- else
- current_bucket->new_item_keys = new_new_item_keys;
-
- current_bucket->item_values = new_item_values;
- current_bucket->space_size = new_bucket_length;
-
-
- }
- }
- return 0;
-
-}
-
#define _gehash_get_bucket(tab, key) ( (tab) -> buckets + _gehash_hash( key ) % (tab) -> buckets_number )
#define _gehash_get_bucketNO(tab, key) ( _gehash_hash( key ) % (tab) -> buckets_number )
@@ -419,13 +225,59 @@ int gehash_insert(gehash_t * the_table, gehash_key_t key, gehash_data_t data, un
{
short step_number = key/the_table -> buckets_number;
if(current_bucket-> new_item_keys == NULL && bucket_sizes){
- unsigned int bkno = _gehash_get_bucketNO(the_table, key) ;
- unsigned char * mem_block = malloc((sizeof(gehash_data_t) + sizeof(short)) * bucket_sizes[ bkno ] );
- current_bucket->item_values = ( gehash_data_t * )mem_block;
- current_bucket->new_item_keys = ( short*)(mem_block + sizeof(gehash_data_t) * bucket_sizes[ bkno ]) ;
- current_bucket->current_items = 0;
- current_bucket->space_size = bucket_sizes[ bkno ];
- the_table -> free_item_only = 1;
+ //this will go only once when all the bucktes are NULL.
+
+ unsigned int * memory_sizes = malloc(sizeof(int) * the_table -> buckets_number);
+ memset(memory_sizes, 0xff, sizeof(int) * the_table -> buckets_number);
+
+ unsigned int grp_bytes = 0, grps = 0, this_grp_bucks = 0, xk1, bucks_per_grp = the_table -> buckets_number / GEHASH_MEM_PTR_NO +2;
+ for(xk1=0; xk1< the_table -> buckets_number ; xk1++){
+ unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
+ grp_bytes += buck_size;
+ this_grp_bucks ++;
+ if( this_grp_bucks >= bucks_per_grp ){
+ memory_sizes[grps++] = grp_bytes;
+ grp_bytes=0;
+ this_grp_bucks=0;
+ }
+ }
+ if(this_grp_bucks)
+ memory_sizes[grps] = grp_bytes;
+
+ for(xk1=0; xk1 < GEHASH_MEM_PTR_NO; xk1++){
+ unsigned int current_bytes = memory_sizes[xk1];
+ if(current_bytes<0xff000000u){
+ if(the_table -> malloc_ptr[xk1]!=NULL){
+ SUBREADprintf("ERROR : no-NULL ptr : %p\n",the_table -> malloc_ptr[xk1]);
+ }
+ the_table -> malloc_ptr[xk1] = malloc(current_bytes);
+ if(!the_table -> malloc_ptr[xk1]){
+ SUBREADputs(MESSAGE_OUT_OF_MEMORY);
+ return 1;
+ }
+ }
+ }
+
+ grp_bytes = 0; grps = 0; this_grp_bucks = 0;
+ for(xk1=0; xk1< the_table -> buckets_number ; xk1++){
+ struct gehash_bucket * mem_bucket = the_table -> buckets+xk1;
+ memset(mem_bucket, 0, sizeof(struct gehash_bucket));
+ mem_bucket -> new_item_keys = (short*)((char*)the_table -> malloc_ptr[grps] + grp_bytes);
+ mem_bucket -> item_values = (gehash_data_t*)((char *)mem_bucket -> new_item_keys + sizeof(short) * bucket_sizes[xk1]);
+ mem_bucket -> space_size = bucket_sizes[xk1];
+
+ unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
+ grp_bytes += buck_size;
+ this_grp_bucks ++;
+ if( this_grp_bucks >= bucks_per_grp ){
+ grps++;
+ grp_bytes=0;
+ this_grp_bucks=0;
+ }
+ }
+
+ the_table -> free_item_only = 2;
+ free(memory_sizes);
}
if (current_bucket->current_items >= current_bucket->space_size) {
@@ -1548,7 +1400,7 @@ int gehash_load(gehash_t * the_table, const char fname [])
//return -1;
}
- the_table -> malloc_ptr = NULL;
+ memset(the_table -> malloc_ptr ,0, sizeof(void*) * GEHASH_MEM_PTR_NO);
the_table -> index_gap = 0;
FILE * fp = f_subr_open(fname, "rb");
@@ -1622,69 +1474,82 @@ int gehash_load(gehash_t * the_table, const char fname [])
SUBREADputs("ERROR: the index format is unrecognizable.");
assert(the_table -> current_items >0 && the_table -> current_items <=0xffffffffllu);
}
+
+ unsigned int * bucket_bytes = malloc(sizeof(int) * GEHASH_MEM_PTR_NO);
+ memset(bucket_bytes, 0xff, sizeof(int) * GEHASH_MEM_PTR_NO);
the_table -> buckets_number = load_int32(fp);
the_table -> buckets = (struct gehash_bucket * )malloc(sizeof(struct gehash_bucket) * the_table -> buckets_number);
if(!the_table -> buckets)
{
+ SUBREADputs("Creating buckets.");
SUBREADputs(MESSAGE_OUT_OF_MEMORY);
return 1;
}
- size_t fp_curr = ftello(fp), fp_end=fp_curr;
-
- fseeko(fp, 0, SEEK_END);
- fp_end = ftello(fp);
- fseeko(fp, fp_curr, SEEK_SET);
-// SUBREADprintf("CACL_SIZE %zd %zd\n", fp_end, fp_curr);
-
- size_t rem_len = fp_end - fp_curr;
- the_table -> malloc_ptr = malloc(rem_len);
- if(!the_table -> malloc_ptr){
- SUBREADputs(MESSAGE_OUT_OF_MEMORY);
- return 1;
- }
-
- size_t rdbytes = 0;
- while(!feof(fp) && rdbytes < rem_len){
- size_t rrr = fread(the_table -> malloc_ptr + rdbytes, 1, rem_len - rdbytes, fp);
- //SUBREADprintf("LOAD: %zd + %zd ==> %zd\n", rdbytes , rrr, rem_len);
- rdbytes += rrr;
- }
- fclose(fp);
- if(rdbytes != rem_len){
- SUBREADprintf("Error: the index cannot be fully loaded (%ld != %ld). It may contain format errors or file '%s' may be truncated.\n", (long) fp_end, (long) fp_curr, fname);
- return -1;
- }
- if(rdbytes<1) {
- SUBREADprintf("Error: the index seem to contain no data at all. It may contain format errors or file '%s' may be truncated.\n", fname);
- return -1;
+ srInt_64 fp_curr = ftello(fp);
+ unsigned int accued_bytes = 0, grp_i = 0, curr_bucks = 0, per_group_bucks = the_table -> buckets_number/ GEHASH_MEM_PTR_NO + 2;
+ for(i=0; i<the_table -> buckets_number ; i++){
+ unsigned int current_items = load_int32(fp);
+ load_int32(fp);//useless for loading : space size
+ unsigned int current_bytes = current_items*( sizeof(short) + sizeof(gehash_data_t) );
+ accued_bytes += current_bytes;
+ curr_bucks ++;
+ if(curr_bucks >= per_group_bucks){
+ //SUBREADprintf("Allocating %d : %d buckets : %u\n", grp_i, i, accued_bytes);
+ bucket_bytes[grp_i++] = accued_bytes;
+ accued_bytes = 0;
+ curr_bucks = 0;
+ }
+ fseeko(fp, current_bytes, SEEK_CUR);
}
- char * curr_ptr = the_table -> malloc_ptr;
- for (i=0; i<the_table -> buckets_number; i++)
- {
- struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
- memcpy(¤t_bucket -> current_items, curr_ptr, 4);
- curr_ptr+=4;
-
- memcpy(¤t_bucket -> space_size, curr_ptr, 4);
- curr_ptr+=4;
-
- current_bucket -> new_item_keys = (short*)curr_ptr;
- curr_ptr += sizeof(short)*current_bucket -> current_items;
- current_bucket -> item_values = (gehash_data_t*)curr_ptr;
- curr_ptr += sizeof(gehash_data_t)*current_bucket -> current_items;
- if( curr_ptr > the_table -> malloc_ptr + rdbytes){
- SUBREADprintf("Error: the index cannot be fully loaded : %p > %p + %ld. It may contain format errors or file '%s' may be truncated.\n",curr_ptr, the_table -> malloc_ptr, rdbytes , fname);
- return -1;
+ if(curr_bucks)
+ bucket_bytes[grp_i++] = accued_bytes;
+ fseeko(fp, (off_t)fp_curr, SEEK_SET);
+
+ for(i=0; i<GEHASH_MEM_PTR_NO ; i++){
+ unsigned int current_bytes = bucket_bytes[i];
+ if(current_bytes<0xff000000u){
+ the_table -> malloc_ptr[i] = malloc(current_bytes);
+ //SUBREADprintf("Allocating %d buckets : %u\n", i, current_bytes);
+ if(!the_table -> malloc_ptr[i]){
+ SUBREADputs(MESSAGE_OUT_OF_MEMORY);
+ return 1;
+ }
}
}
- the_table -> is_small_table = *curr_ptr;
- curr_ptr ++;
- if( curr_ptr != the_table -> malloc_ptr + rdbytes ){
- SUBREADprintf("Error: the index cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
- return -1;
+
+ grp_i = 0;
+ curr_bucks = 0;
+ accued_bytes = 0;
+ for(i=0; i<the_table -> buckets_number ; i++){
+ struct gehash_bucket * current_bucket = the_table -> buckets+i;
+ current_bucket -> current_items = load_int32(fp);
+ load_int32(fp);//useless for lo: space size
+ current_bucket -> space_size = current_bucket -> current_items;
+ unsigned int current_bytes = current_bucket -> current_items*( sizeof(short) + sizeof(gehash_data_t) );
+
+ current_bucket -> new_item_keys = (short*)(the_table -> malloc_ptr[grp_i] + accued_bytes);
+ current_bucket -> item_values = (gehash_data_t*)(the_table -> malloc_ptr[grp_i] + accued_bytes + current_bucket -> current_items * sizeof(short));
+ read_length = fread(current_bucket -> new_item_keys, sizeof(short), current_bucket -> current_items, fp);
+ read_length += fread(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
+ if(read_length < 2* current_bucket -> current_items){
+ SUBREADprintf("ERROR: the index is incomplete.\n");
+ return 1;
+ }
+
+ accued_bytes += current_bytes;
+ curr_bucks ++;
+ if(curr_bucks >= per_group_bucks){
+ curr_bucks=0;
+ accued_bytes=0;
+ grp_i++;
+ }
}
- assert( curr_ptr == the_table -> malloc_ptr + rdbytes );
+
+ int rval = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
+ if (rval != 1)SUBREADprintf("ERROR: cannot find the table table.\n");
+ free(bucket_bytes);
+ fclose(fp);
return 0;
}
@@ -1736,7 +1601,10 @@ int gehash_load(gehash_t * the_table, const char fname [])
}
read_length = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
- assert(read_length>0);
+ if(read_length!=1){
+ SUBREADprintf("ERROR: the index is incomplete.\n");
+ return 1;
+ }
fclose(fp);
return 0;
}
@@ -2027,10 +1895,14 @@ int gehash_dump(gehash_t * the_table, const char fname [])
void gehash_destory(gehash_t * the_table)
{
- int i;
+ int i, is_ptr = 0;
+
+ for(i = 0; i < GEHASH_MEM_PTR_NO; i++) if(the_table -> malloc_ptr[i]){
+ free(the_table -> malloc_ptr[i]);
+ is_ptr=1;
+ }
- if(the_table -> malloc_ptr) free(the_table -> malloc_ptr);
- else for (i=0; i<the_table -> buckets_number; i++)
+ if(!is_ptr) for (i=0; i<the_table -> buckets_number; i++)
{
struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
if (current_bucket -> space_size > 0)
@@ -2039,6 +1911,7 @@ void gehash_destory(gehash_t * the_table)
free (current_bucket -> item_values);
}
}
+
free (the_table -> buckets);
the_table -> current_items = 0;
=====================================
src/subread.h
=====================================
@@ -270,6 +270,12 @@ struct gehash_bucket {
gehash_data_t * item_values;
};
+
+#ifdef __MINGW32__
+#define GEHASH_MEM_PTR_NO (64)
+#else
+#define GEHASH_MEM_PTR_NO (64*1024)
+#endif
typedef struct {
int version_number;
unsigned long long int current_items;
@@ -278,7 +284,7 @@ typedef struct {
struct gehash_bucket * buckets;
int index_gap;
int padding;
- char * malloc_ptr;
+ char * malloc_ptr [GEHASH_MEM_PTR_NO];
int free_item_only;
} gehash_t;
=====================================
src/tx-unique.c
=====================================
@@ -85,7 +85,7 @@ int txunique_load_annotation(txunique_context_t * context){
return 0;
}
-int txunique_process_flat_comp( void * ex1p, void * ex2p ){
+int txunique_process_flat_comp( void * ex1p, void * ex2p, ArrayList *me ){
txunique_exon_t * ex1 = ex1p;
txunique_exon_t * ex2 = ex2p;
@@ -150,7 +150,7 @@ void debug_print_edges(ArrayList * exs){
}
}
-int txunique_process_gene_edge_comp(void * e1p, void * e2p){
+int txunique_process_gene_edge_comp(void * e1p, void * e2p, ArrayList *me){
struct _txunique_tmp_edges * e1 = e1p;
struct _txunique_tmp_edges * e2 = e2p;
if(e1 -> base_open_end < e2 -> base_open_end) return -1;
=====================================
test/subread-align/data/test-err-mut-r1.fq.gz deleted
=====================================
Binary files a/test/subread-align/data/test-err-mut-r1.fq.gz and /dev/null differ
=====================================
test/subread-align/data/test-err-mut-r2.fq.gz deleted
=====================================
Binary files a/test/subread-align/data/test-err-mut-r2.fq.gz and /dev/null differ
View it on GitLab: https://salsa.debian.org/med-team/subread/-/commit/581d5ef26ce8e53c4a026feb096ee98e2510aec4
--
View it on GitLab: https://salsa.debian.org/med-team/subread/-/commit/581d5ef26ce8e53c4a026feb096ee98e2510aec4
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200522/f6808858/attachment-0001.html>
More information about the debian-med-commit
mailing list