[med-svn] [subread] 03/10: New upstream version 1.6.0+dfsg
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Fri Nov 17 16:05:04 UTC 2017
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository subread.
commit e99d951c933b130df37f4eb82473f333d1ba44c9
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Fri Nov 17 13:30:54 2017 +0100
New upstream version 1.6.0+dfsg
---
doc/SubreadUsersGuide.tex | 53 +-
src/Makefile.FreeBSD | 13 +-
src/Makefile.Linux | 11 +-
src/Makefile.MacOS | 10 +-
src/core-interface-aligner.c | 5 +-
src/core-interface-subjunc.c | 3 +-
src/core-junction.c | 66 +-
src/core.c | 57 +-
src/gene-algorithms.c | 29 +-
src/gene-value-index.c | 48 +-
src/input-files.c | 131 +-
src/longread-mapping/LRMbase-index.c | 285 ++++
src/longread-mapping/LRMbase-index.h | 38 +
src/longread-mapping/LRMchro-event.c | 1241 +++++++++++++++++
src/longread-mapping/LRMchro-event.h | 40 +
src/longread-mapping/LRMconfig.h | 389 ++++++
src/longread-mapping/LRMfile-io.c | 673 +++++++++
src/longread-mapping/LRMfile-io.h | 55 +
src/longread-mapping/LRMhelper.c | 77 +
src/longread-mapping/LRMhelper.h | 28 +
src/longread-mapping/LRMsorted-hashtable.c | 534 +++++++
src/longread-mapping/LRMsorted-hashtable.h | 44 +
src/longread-mapping/Makefile | 23 +
src/longread-mapping/hashtable.c | 797 +++++++++++
src/longread-mapping/hashtable.h | 471 +++++++
src/longread-mapping/longread-mapping-one.c | 1231 ++++++++++++++++
src/longread-mapping/longread-mapping.c | 2012 +++++++++++++++++++++++++++
src/longread-mapping/seek-zlib.c | 363 +++++
src/longread-mapping/seek-zlib.h | 86 ++
src/makefile.version | 2 +-
src/propmapped.c | 13 +-
src/readSummary.c | 257 +++-
src/subread.h | 23 +-
33 files changed, 8933 insertions(+), 175 deletions(-)
diff --git a/doc/SubreadUsersGuide.tex b/doc/SubreadUsersGuide.tex
index f4698e3..956c018 100644
--- a/doc/SubreadUsersGuide.tex
+++ b/doc/SubreadUsersGuide.tex
@@ -14,6 +14,7 @@
\newcommand{\code}[1]{{\small\texttt{#1}}}
\newcommand{\Subread}{\textsf{Subread}}
\newcommand{\Subjunc}{\textsf{Subjunc}}
+\newcommand{\Sublong}{\textsf{Sublong}}
\newcommand{\Rsubread}{\textsf{Rsubread}}
\newcommand{\ExactSNP}{\textsf{ExactSNP}}
\newcommand{\limma}{\textsf{limma}}
@@ -35,9 +36,9 @@
\begin{center}
{\Huge\bf Subread/Rsubread Users Guide}\\
\vspace{1 cm}
-{\centering\large Subread v1.5.3/Rsubread v1.26.1\\}
+{\centering\large Subread v1.6.0/Rsubread v1.28.0\\}
\vspace{1 cm}
-\centering 11 July 2017\\
+\centering 14 Nov 2017\\
\vspace{5 cm}
\Large Wei Shi and Yang Liao\\
\vspace{1 cm}
@@ -272,7 +273,7 @@ For RNA-seq data, a read is called as a multi-mapping read if it has two or more
For genomic DNA-seq data, a read is called as a multi-mapping read if it has two or more candidate locations that have the same number of matched bases and this number is the largest among all candidate locations being considered.
Note that for both RNA-seq and genomic DNA-seq data, any alignment reported for a multi-mapping read must not have more than threshold number of mis-matched bases (as specified in `-M' parameter).
-For the reporting of a multi-mapping read, users can choose to not report any alignment for the read (`-u' option) or report up to a pre-defined number of alignments (`-B' option).
+For the reporting of a multi-mapping read, users may choose to not report any alignments for the read (by default) or report up to a pre-defined number of alignments (`--multiMapping' and `-B' options).
\section{Mapping of paired-end reads}
@@ -409,16 +410,16 @@ chr1.fa, chr2.fa, ... \newline (\code{reference}) & Give names of chromosome fil
The {\Subread} aligner (\texttt{subread-align} program in SourceForge {\Subread} package or \code{align} function in Bioconductor {\Rsubread} package) extracts a number of subreads from each read and then uses these subreads to vote for the mapping location of the read.
It uses the the ``seed-and-vote'' paradigm for read mapping and reports the largest mappable region for each read.
-Table 2 describes the arguments used by {\Subread} aligner (and also \code{Subjunc} aligner).
+Table 2 describes the arguments used by {\Subread} aligner (also {\Subjunc} and {\Sublong} aligners).
Arguments used in Bioconductor \code{Rsubread} package are included in parenthesis.\\
-
\begin{longtable}{|p{4cm}|p{12cm}|}
-\multicolumn{2}{p{16cm}}{Table 2: Arguments used by the \code{subread-align}/\code{subjunc} programs included in the SourceForge {\Subread} package in alphabetical order.
+\multicolumn{2}{p{16cm}}{Table 2: Arguments used by the \code{subread-align}/\code{subjunc}/\code{sublong} programs included in the SourceForge {\Subread} package in alphabetical order.
Arguments in parenthesis in the first column are the equivalent arguments used in Bioconductor {\Rsubread} package.
Arguments used by \code{subread-align} only are marked with $^*$.
Arguments used by \code{subjunc} only are marked with $^{**}$.
+Arguments used by \code{sublong} only are marked with $^{***}$.
\newline
}
\endfirsthead
@@ -439,15 +440,15 @@ Arguments & Description \\
\hline
-F $<string>$ \newline (\code{isGTF}) & Specify format of the provided annotation file. Acceptable formats include `GTF' (or compatible GFF format) and `SAF'. Default format in SourceForge {\Subread} is `GTF'. Default format in {\Rsubread} is `SAF'. \\
\hline
--i $<string> \newline (\code{index}) $ & Specify the base name of the index.\\
+-i $<string> \newline (\code{index}) $ & Specify the base name of the index. A full index (not a gapped index) must be provided for the \code{sublong} aligner, i.e., the `-F' option must be specified when running \code{subread-buildindex} for index building.\\
\hline
-I $<int>$ \newline (\code{indels}) & Specify the number of INDEL bases allowed in the mapping. 5 by default. Indels of up to 200bp long can be detected.\\
\hline
--m $<int>$ \newline (\code{TH1}) & Specify the consensus threshold, which is the minimal number of consensus subreads required for reporting a hit. The consensus subreads are those subreads which vote for the same location in the reference genome for the read. If pair-end read data are provided, at least one of the two reads from the same pair must satisfy this criteria. 3 by default.\\
+-m $<int>$ \newline (\code{TH1}) & Specify the consensus threshold, which is the minimal number of consensus subreads required for reporting a hit. The consensus subreads are those subreads which vote for the same location in the reference genome for the read. If pair-end read data are provided, at least one of the two reads from the same pair must satisfy this criteria. 3 by default. For \code{sublong}, this is the consensus threshold for mapping a readlet (1 by default). A readlet is [...]
\hline
-M $<int>$ \newline (\code{maxMismatches}) & Specify the maximum number of mis-matched bases allowed in the alignment. 3 by default. Mis-matches found in soft-clipped bases are not counted.\\
\hline
--n $<int>$ \newline (\code{nsubreads}) & Specify the number of subreads extracted from each read, 10 by default.\\
+-n $<int>$ \newline (\code{nsubreads}) & Specify the number of subreads extracted from each read, 10 by default. For \code{sublong}, this is the number of subreads (85 by default) extracted from each readlet. A readlet is a 100bp sequence extracted from a long read.\\
\hline
-o $<string>$ \newline (\code{output\_file}) & Give the name of output file. The default output format is BAM. All reads are included in mapping output, including both mapped and unmapped reads, and they are in the same order as in the input file.\\
\hline
@@ -467,6 +468,8 @@ $^*$ -t $<int>$ \newline (\code{type}) & Specify the type of input sequencing da
%\hline
%-u \newline (\code{unique=TRUE}) & Output uniquely mapped reads only. Reads that were found to have more than one best mapping location will not be reported.\\
\hline
+$^{***}$-X $<int>$ & Specify the maximum number of mis-matched bases allowed in the mapping of each subread. 0 by default. This parameter is only applicable for \code{sublong}. \\
+\hline
$^{**}$$--$allJunctions \newline (\code{reportAllJunctions} \newline \code{=TRUE}) & This option should be used with \code{subjunc} for detecting canonical exon-exon junctions (with `GT/AG' donor/receptor sites), non-canonical exon-exon junctions and structural variants (SVs) in RNA-seq data. detected junctions will be saved to a file with suffix name ``.junction.bed". Detected SV breakpoints will be saved to a file with suffix name ``.breakpoints.txt", which includes chromosomal coordin [...]
\hline
$--$BAMinput \newline (\code{input\_format="BAM"}) & Specify that the input read data are in BAM format.\\
@@ -544,6 +547,7 @@ $N_{mm}$ is the number of mismatches present in the final reported alignment for
Read mapping results for each library will be saved to a BAM or SAM format file.
Short indels detected from the read data will be saved to a text file (`.indel').
If `$--$sv' is specified when running \code{subread-align}, breakpoints detected from structural variant events will be output to a text file for each library as well (`.breakpoints.txt').
+Screen output includes a brief mapping summary, including percentage of uniquely mapped reads, percentage of multi-mapping reads and percentage of unmapped reads.
\newpage
@@ -650,9 +654,8 @@ Table 2 describes the arguments used by the {\Subjunc} program.\\
Read mapping results for each library will be saved to a BAM/SAM file.
Detected exon-exon junctions will be saved to a BED file for each library (`.junction.bed').
-Detected short indels will be saved to a text file (`.indel').\\
-
-
+Detected short indels will be saved to a text file (`.indel').
+Screen output includes a brief mapping summary, including percentage of uniquely mapped reads, percentage of multi-mapping reads and percentage of unmapped reads.
\section{Mapping microRNA sequencing reads (miRNA-seq)}
@@ -700,6 +703,17 @@ The {\featureCounts} program can be readily used for summarizing reads to miRNA
+\chapter{Mapping long sequence reads}
+
+\section{Short description}
+
+We developed a new long-read aligner called {\Sublong}, which is also based on the seed-and-vote mapping strategy.
+{\Sublong} is an order of magnitude faster than existing long-read aligners.
+Our simulation results also show that {\Sublong} performs better in mapping accuracy.
+Parameters of the {\Sublong} program can be found in Table 2.
+
+
+
\chapter{Read summarization}
\section{Introduction}
@@ -883,10 +897,9 @@ Below lists all the filters supported by {\featureCounts}:
\item Unassigned\_Ambiguity: overlapping with two or more features (feature-level summarization) or meta-features (meta-feature-level) summarization.
\end{itemize}
-These filters are listed in the order that they are applied (same order with that shown in Section~\ref{sec:read_filtering}).
-Which of the filters are applied during read counting depend on the parameter setting.
-Usually only a subset of filters are applied in the counting.
-Unassigned reads are counted for each filter and each unassigned read will only be counted for one filter (the first filter that filters the read out).
+In the counting summary these filters are listed in the same order as they were applied in the counting process (see Section~\ref{sec:read_filtering}).
+All categories are mutually exclusive, i.e., no read is assigned to more than one category.
+If a read can be filtered out by more than one filter, it is always assigned to the first filter it encounters.
\subsection{Program usage}
@@ -963,6 +976,8 @@ $--$fraction \newline (\code{fraction}) & Assign fractional counts to features.
\hline
$--$fracOverlap $<float>$ \newline (\code{fracOverlap}) & Minimum fraction of overlapping bases in a read that is required for read assignment. Value should be a float number in the range [0,1]. 0 by default. If paired end, number of overlapping bases is counted from both reads. Soft-clipped bases are counted when calculating total read length (but ignored when counting overlapping bases). Both this option and `--minOverlap' option need to be satisfied for read assignment. \\
\hline
+$--$fracOverlapFeature $<float>$ \newline (\code{fracOverlapFeature}) & Minimum fraction of bases included in a feature that is required for overlapping with a read or a read pair. Value should be within range [0,1]. 0 by default. \\
+\hline
$--$ignoreDup \newline (\code{ignoreDup}) & If specified, reads that were marked as duplicates will be ignored. Bit Ox400 in FLAG field of SAM/BAM file is used for identifying duplicate reads. In paired end data, the entire read pair will be ignored if at least one end is found to be a duplicate read.\\
\hline
$--$largestOverlap \newline (\code{largestOverlap}) & If specified, reads (or fragments) will be assigned to the target that has the largest number of overlapping bases.\\
@@ -1180,6 +1195,10 @@ It takes only about half a minute to re-order a location-sorted BAM file includi
Compute the read coverage for each chromosomal location in the genome.
+\section{promoterRegions}
+
+This function is only implemented in {\Rsubread}. It generates a SAF format annotation that includes coordinates of promoter regions for each gene.
+
\section{propmapped}
Get number of mapped reads from a BAM/SAM file.
@@ -1198,7 +1217,7 @@ Get all chromosomal locations that contain a genomic sequence sharing high homol
\section{txUnique}
-This function is only implemented in {\Rsubread} and it counts the number of bases unique to each transcript.
+This function is only implemented in {\Rsubread}. It counts the number of bases unique to each transcript.
\chapter{Case studies}
diff --git a/src/Makefile.FreeBSD b/src/Makefile.FreeBSD
index b901a3f..6715ae4 100644
--- a/src/Makefile.FreeBSD
+++ b/src/Makefile.FreeBSD
@@ -14,22 +14,27 @@ ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
ALL_H=$(addsuffix .h, ${ALL_LIBS})
ALL_C=$(addsuffix .c, ${ALL_LIBS})
-all: repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc subtools qualityScores subread-fullscan propmapped coverageCount
+all: sublong repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc subtools qualityScores subread-fullscan propmapped coverageCount
mkdir -p ../bin/utilities
- mv subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
+ mv sublong subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
mv repair coverageCount subtools qualityScores propmapped subread-fullscan removeDup ../bin/utilities
@echo
@echo "###########################################################"
@echo "# #"
- @echo "# Installation successfully complete. #"
+ @echo "# Installation successfully completed. #"
@echo "# #"
@echo "# Generated executables were copied to directory ../bin/ #"
@echo "# #"
@echo "###########################################################"
@echo
+sublong: longread-mapping/longread-mapping.c ${ALL_OBJECTS}
+ echo "MACOS= -D FREEBSD " > longread-mapping/make.version
+ rm -f longread-mapping/*.o
+ cd longread-mapping && $(MAKE)
+
repair: read-repair.c ${ALL_OBJECTS}
- ${CC} -o repair read-repair.c ${ALL_OBJECTS} ${LDFLAGS}
+ ${CC} -o repair read-repair.c ${ALL_OBJECTS} ${LDFLAGS}
propmapped: propmapped.c ${ALL_OBJECTS}
${CC} -o propmapped propmapped.c ${ALL_OBJECTS} ${LDFLAGS}
diff --git a/src/Makefile.Linux b/src/Makefile.Linux
index 4a251f7..aa43ffc 100644
--- a/src/Makefile.Linux
+++ b/src/Makefile.Linux
@@ -14,20 +14,25 @@ ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
ALL_H=$(addsuffix .h, ${ALL_LIBS})
ALL_C=$(addsuffix .c, ${ALL_LIBS})
-all: repair txUnique featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # samMappedBases mergeVCF testZlib
+all: sublong repair txUnique featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # samMappedBases mergeVCF testZlib
mkdir -p ../bin/utilities
- mv subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
+ mv sublong subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
mv repair coverageCount propmapped qualityScores removeDup subread-fullscan txUnique ../bin/utilities
@echo
@echo "###########################################################"
@echo "# #"
- @echo "# Installation successfully complete. #"
+ @echo "# Installation successfully completed. #"
@echo "# #"
@echo "# Generated executables were copied to directory ../bin/ #"
@echo "# #"
@echo "###########################################################"
@echo
+sublong: longread-mapping/longread-mapping.c ${ALL_OBJECTS}
+ echo " " > longread-mapping/make.version
+ rm -f longread-mapping/*.o
+ cd longread-mapping && $(MAKE)
+
repair: read-repair.c ${ALL_OBJECTS}
${CC} -o repair read-repair.c ${ALL_OBJECTS} ${LDFLAGS}
diff --git a/src/Makefile.MacOS b/src/Makefile.MacOS
index 122b4d8..9036b1f 100644
--- a/src/Makefile.MacOS
+++ b/src/Makefile.MacOS
@@ -11,20 +11,24 @@ ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
ALL_H=$(addsuffix .h, ${ALL_LIBS})
ALL_C=$(addsuffix .c, ${ALL_LIBS})
-all: repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # globalReassembly testZlib
+all: sublong repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # globalReassembly testZlib
mkdir -p ../bin/utilities
- mv subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
+ mv sublong subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
mv repair coverageCount subread-fullscan qualityScores removeDup propmapped ../bin/utilities
@echo
@echo "###########################################################"
@echo "# #"
- @echo "# Installation successfully complete. #"
+ @echo "# Installation successfully completed. #"
@echo "# #"
@echo "# Generated executables were copied to directory ../bin/ #"
@echo "# #"
@echo "###########################################################"
@echo
+sublong: longread-mapping/longread-mapping.c ${ALL_OBJECTS}
+ echo "MACOS= -D MACOS " > longread-mapping/make.version
+ rm -f longread-mapping/*.o
+ cd longread-mapping && $(MAKE)
repair: read-repair.c ${ALL_OBJECTS}
${CC} -o repair read-repair.c ${ALL_OBJECTS} ${LDFLAGS}
diff --git a/src/core-interface-aligner.c b/src/core-interface-aligner.c
index 8488535..7999280 100644
--- a/src/core-interface-aligner.c
+++ b/src/core-interface-aligner.c
@@ -314,10 +314,8 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
global_context->config.multi_best_reads=1;
global_context->config.reported_multi_best_reads = global_context->config.multi_best_reads;
-
global_context->config.max_vote_combinations = max(global_context->config.max_vote_combinations, global_context->config.reported_multi_best_reads + 1);
global_context->config.max_vote_simples = max(global_context->config.max_vote_simples, global_context->config.reported_multi_best_reads + 1);
- global_context->config.report_multi_mapping_reads = 1;
break;
case 'H':
global_context->config.use_hamming_distance_break_ties = 1;
@@ -576,6 +574,9 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
return -1;
}
+ if(global_context->config.reported_multi_best_reads > 1 && ! global_context->config.report_multi_mapping_reads)
+ SUBREADprintf("WARNING: You required multi best alignments, but disallowed multi-mapping reads. You need to turn on the multi-mapping option.\n");
+
global_context->config.more_accurate_fusions = global_context->config.more_accurate_fusions && global_context->config.do_fusion_detection;
if(global_context->config.more_accurate_fusions)
{
diff --git a/src/core-interface-subjunc.c b/src/core-interface-subjunc.c
index c4b66e1..918c420 100644
--- a/src/core-interface-subjunc.c
+++ b/src/core-interface-subjunc.c
@@ -465,7 +465,6 @@ int parse_opts_subjunc(int argc , char ** argv, global_context_t * global_contex
global_context->config.max_vote_combinations = max(global_context->config.max_vote_combinations, global_context->config.reported_multi_best_reads + 1);
global_context->config.max_vote_simples = max(global_context->config.max_vote_simples, global_context->config.reported_multi_best_reads + 1);
-
break;
case 'c':
global_context->config.space_type = GENE_SPACE_COLOR;
@@ -618,6 +617,8 @@ int parse_opts_subjunc(int argc , char ** argv, global_context_t * global_contex
return -1;
}
+ if(global_context->config.reported_multi_best_reads > 1 && ! global_context->config.report_multi_mapping_reads)
+ SUBREADprintf("WARNING: You required multi best alignments, but disallowed multi-mapping reads. You need to turn on the multi-mapping option.\n");
if(global_context->config.is_SAM_file_input) global_context->config.phred_score_format = FASTQ_PHRED33;
global_context->config.more_accurate_fusions = global_context->config.more_accurate_fusions && global_context->config.do_fusion_detection;
diff --git a/src/core-junction.c b/src/core-junction.c
index ff6a3d0..9663859 100644
--- a/src/core-junction.c
+++ b/src/core-junction.c
@@ -110,6 +110,41 @@ void debug_show_event(global_context_t* global_context, chromosome_event_t * eve
SUBREADprintf("Event between %s and %s\n", outpos1, outpos2);
}
+int get_offset_maximum_chro_pos(global_context_t * global_context, thread_context_t * thread_context, unsigned int linear){ // Finds the first chromosome_table.read_offsets entry greater than `linear`; returns that entry's span (minus 2*padding, plus 16), -1 if `linear` lies outside [padding, padding+span) relative to the entry start, or -2 if no entry is greater. thread_context is not used here.
+ gene_offset_t * chros =& global_context -> chromosome_table; // offset table held in the global context
+ int n = 0;
+ int total_offsets = chros -> total_offsets;
+
+ int LL = 0, RR = total_offsets-1; // bisection bounds over read_offsets indices
+
+ while(1){ // bisection; presumably read_offsets is ascending -- TODO confirm
+ if(LL >= RR-1) break; // stop once the bounds are at most one apart
+ int MM = (LL+RR)/2;
+ if( linear > chros->read_offsets[MM]) LL = MM;
+ else if(linear < chros->read_offsets[MM]) RR = MM;
+ else break; // exact hit on read_offsets[MM]; LL keeps its previous value
+ }
+
+ n = max(0, LL - 2); // start the scan up to two entries below LL, clamped at index 0
+
+ for (; n < chros -> total_offsets; n++) { // linear scan for the first offset strictly greater than linear
+ if (chros->read_offsets[n] > linear) {
+ int ret;
+ unsigned int last_linear = 0;
+ if(n==0)
+ ret = chros->read_offsets[0] - chros -> padding *2 +16; // first entry: span is measured from 0
+ else{
+ ret = ( chros->read_offsets[n] - chros->read_offsets[n-1] ) - chros -> padding *2 +16; // span between this entry and the previous one
+ last_linear = chros->read_offsets[n-1];
+ }
+ linear -= last_linear; // make linear relative to the start of the matched entry
+ if(linear < chros -> padding || linear >= chros -> padding + ret) return -1; // position falls outside [padding, padding + ret)
+ return ret;
+ }
+ }
+ return -2; // no offset in the table exceeds linear
+}
+
// read_head_abs_pos is the offset of the FIRST WANTED base.
void search_events_to_front(global_context_t * global_context, thread_context_t * thread_context, explain_context_t * explain_context, char * read_text , char * qual_text, unsigned int read_head_abs_offset, short remainder_len, short sofar_matched, int suggested_movement, int do_not_jump)
@@ -2547,7 +2582,7 @@ unsigned int explain_read(global_context_t * global_context, thread_context_t *
back_search_tail_position = current_result -> selected_position + back_search_read_tail + current_result -> indels_in_confident_coverage;
//if( back_search_read_tail > 102)
- // SUBREADprintf("MAX back_search_read_tail : MIN %d , %d\n", explain_context.full_read_len , current_result -> confident_coverage_end);
+ //SUBREADprintf("MAX back_search_read_tail : MIN %d , %d\n", explain_context.full_read_len , current_result -> confident_coverage_end);
explain_context.tmp_search_junctions[0].read_pos_end = back_search_read_tail;
explain_context.tmp_search_junctions[0].abs_offset_for_start = back_search_tail_position;
@@ -2762,8 +2797,11 @@ int find_soft_clipping(global_context_t * global_context, thread_context_t * th
// add the new base
char reference_base = gvindex_get(current_value_index, added_base_index + mapped_pos);
-
- //SUBREADprintf("CHMAT [%s] ref:read = %c:%c\n", search_to_tail?"T":"H", reference_base, read_text[added_base_index]);
+ if(0){
+ char outpos1[100];
+ absoffset_to_posstr(global_context, added_base_index + mapped_pos, outpos1);
+ SUBREADprintf("CHMAT [%s] %s (%u) ref:read = %c:%c\n", search_to_tail?"T":"H" ,outpos1, added_base_index + mapped_pos, reference_base, read_text[added_base_index]);
+ }
int added_is_matched = (reference_base == read_text[added_base_index]);
matched_in_window += added_is_matched;
if(added_is_matched)
@@ -2820,6 +2858,13 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
//SUBREADprintf("Coverage : %d ~ %d\n", covered_start, covered_end);
+ if(0){
+ char posout1[100];
+ int chro_max = get_offset_maximum_chro_pos(global_context,thread_context,read_head_abs_offset);
+ absoffset_to_posstr(global_context, read_head_abs_offset, posout1);
+ SUBREADprintf("READ %s : mapped to %s ; max_pos=%d\n", read_name, posout1, chro_max);
+ }
+
while(1)
{
char nch = cigar_string[cigar_cursor++];
@@ -2841,6 +2886,18 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
int has_clipping_this_section_head = 0, has_clipping_this_section_tail = 0;
char * reversed_first_section_text = NULL;
+ if(0){
+ int is_head_in_chro = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs );
+ int is_end_in_chro = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs + tmp_int );
+ char posout1[100];
+ char posout2[100];
+ int chro_max = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs );
+ absoffset_to_posstr(global_context, current_perfect_section_abs, posout1);
+ absoffset_to_posstr(global_context, current_perfect_section_abs + tmp_int, posout2);
+ SUBREADprintf(" %dM SECTION : mapped to %s ~ %s ; max_pos=%d ; Hin=%d, Ein=%d\n", tmp_int, posout1, posout2, chro_max, is_head_in_chro, is_end_in_chro);
+ SUBREADprintf(" %dM SECTION : Hin=%d, Ein=%d\n", tmp_int, is_head_in_chro, is_end_in_chro);
+ }
+
// find "J" sections if it is the first M
if(is_First_M && global_context -> config.show_soft_cliping)
{
@@ -2859,6 +2916,7 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
}
else
head_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text, current_perfect_section_abs, tmp_int, 0, adj_coverage_start);
+ //SUBREADprintf("SSHEAD:%d\n", head_soft_clipped);
if(head_soft_clipped == tmp_int){
(*full_section_clipped) = 1;
@@ -2891,6 +2949,8 @@ int final_CIGAR_quality(global_context_t * global_context, thread_context_t * th
else
tail_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text + read_cursor, current_perfect_section_abs, tmp_int, 1, adj_coverage_end);
+ //SUBREADprintf("SSTAIL:%d\n", tail_soft_clipped);
+
if(tail_soft_clipped == tmp_int){
tail_soft_clipped = 0;
if(full_section_clipped)(*full_section_clipped) = 1;
diff --git a/src/core.c b/src/core.c
index eeff7f9..ca1de2d 100644
--- a/src/core.c
+++ b/src/core.c
@@ -1191,11 +1191,13 @@ unsigned int move_to_read_head(unsigned int tailpos, char * cigar){
// This function returns 1 if the cut was added.
// It returns 0 if the head or tail cut is not able to be added (e.g., the cigar ends up like "50M4I10S" or "10S30N90M")
-int add_head_tail_cut_softclipping(char * cigar, int rlen, int head_cut, int tail_cut){
+int add_head_tail_cut_softclipping(global_context_t * global_context, unsigned int linear, char * cigar, int rlen, int head_cut, int tail_cut){
+ //SUBREADprintf("ADD_SOFT: %s , %d, %d\n", cigar,head_cut,tail_cut);
char cigar_added [CORE_MAX_CIGAR_STR_LEN];
int cigar_cursor = 0, read_cursor = 0, next_read_cursor = 0;
int tmpi = 0, nch, has_M = 0;
+ unsigned int linear_cursor = linear;
cigar_added[0]=0;
@@ -1209,6 +1211,13 @@ int add_head_tail_cut_softclipping(char * cigar, int rlen, int head_cut, int tai
if('M' == nch || 'S' == nch || 'I' == nch)
next_read_cursor = read_cursor + tmpi;
+ if('M' == nch || 'D' == nch || 'S' == nch || 'N' == nch){
+ int is_start_in_chro, is_end_in_chro;
+ is_start_in_chro = get_offset_maximum_chro_pos(global_context,linear_cursor);
+ is_end_in_chro = get_offset_maximum_chro_pos(global_context,linear_cursor + tmpi);
+ linear_cursor += tmpi;
+ }
+
int head_S = 0, tail_S =0, remainder_tmpi = tmpi, is_skip = 0;
if(next_read_cursor <= head_cut) is_skip = 1;
@@ -1322,26 +1331,12 @@ int convert_read_to_tmp(global_context_t * global_context , subread_output_conte
{
int head_cut = 0 , tail_cut = 0;
- if(0 && FIXLENstrcmp("V0112_0155:7:1302:9507:32993", read_name)==0){
- char posout1[100];
- absoffset_to_posstr(global_context, r->linear_position, posout1);
- SUBREADprintf("PERR : CIGAR=%s, READLEN=%d, POS=%s\n", r->cigar , read_len, posout1);
- }
-
- if(locate_gene_position_max(r->linear_position,& global_context -> chromosome_table, &r-> chro , &r -> offset, &head_cut, &tail_cut, global_context->config.do_fusion_detection?read_len:current_result->chromosomal_length)) {
+ if(locate_gene_position_max(r->linear_position + r->soft_clipping_movements,& global_context -> chromosome_table, &r-> chro , &r -> offset, &head_cut, &tail_cut, global_context->config.do_fusion_detection?read_len:(current_result->chromosomal_length - r->soft_clipping_movements))) {
is_r_OK = 0;
} else {
-
-
- if(0 && FIXLENstrcmp("V0112_0155:7:1302:9507:32993", read_name)==0){
- char posout1[100];
- absoffset_to_posstr(global_context, r->linear_position, posout1);
- SUBREADprintf("CUTT : CIGAR=%s, READLEN=%d, CATS=%d %d\n", r->cigar , read_len, head_cut, tail_cut);
- }
-
int is_added_OK = 1;
if(head_cut!=0 || tail_cut!=0)
- is_added_OK = add_head_tail_cut_softclipping(r->cigar , read_len, head_cut, tail_cut);
+ is_added_OK = add_head_tail_cut_softclipping(global_context, r->linear_position, r->cigar , read_len, head_cut, tail_cut);
if(is_added_OK){
r -> offset++;
@@ -1922,13 +1917,13 @@ void write_single_fragment(global_context_t * global_context, thread_context_t *
}
if(1)if(is_funky & FUNKY_FRAGMENT_BC){
//#warning "LOGIC WRONG: R1 AND R2 SHOULD BE DECIDED BY THEIR MAPPING POSITIONS"
- bktable_append(&global_context -> funky_table_BC, rec1 -> chro, rec1 -> offset + rec1 -> soft_clipping_movements, NULL + (2*pair_number));
- bktable_append(&global_context -> funky_table_BC, rec2 -> chro, rec2 -> offset + rec2 -> soft_clipping_movements, NULL + (2*pair_number+1));
+ bktable_append(&global_context -> funky_table_BC, rec1 -> chro, rec1 -> offset , NULL + (2*pair_number));
+ bktable_append(&global_context -> funky_table_BC, rec2 -> chro, rec2 -> offset , NULL + (2*pair_number+1));
}
if(1)if(is_funky & FUNKY_FRAGMENT_DE){
fraglist_append(&global_context -> funky_list_DE, pair_number);
- bktable_append(&global_context -> funky_table_DE, rec1 -> chro, rec1 -> offset + rec1 -> soft_clipping_movements, NULL + (2*pair_number + (rec1 -> offset > rec2 -> offset ? 1:0)));
- bktable_append(&global_context -> funky_table_DE, rec2 -> chro, rec2 -> offset + rec2 -> soft_clipping_movements, NULL + (2*pair_number + (rec1 -> offset < rec2 -> offset ? 1:0)));
+ bktable_append(&global_context -> funky_table_DE, rec1 -> chro, rec1 -> offset , NULL + (2*pair_number + (rec1 -> offset > rec2 -> offset ? 1:0)));
+ bktable_append(&global_context -> funky_table_DE, rec2 -> chro, rec2 -> offset , NULL + (2*pair_number + (rec1 -> offset < rec2 -> offset ? 1:0)));
}
}
@@ -2120,7 +2115,11 @@ void write_single_fragment(global_context_t * global_context, thread_context_t *
if(is_R1_OK && is_R2_OK)
{
if( rec1->offset > rec2->offset) out_tlen1 = - out_tlen1;
- else out_tlen2 = -out_tlen2;
+ else if(rec2->offset > rec1->offset) out_tlen2 = -out_tlen2;
+ else{
+ if( rec1 -> strand ) out_tlen1 = - out_tlen1;
+ else out_tlen2 = -out_tlen2;
+ }
}
if(0==current_location)
@@ -2144,11 +2143,11 @@ void write_single_fragment(global_context_t * global_context, thread_context_t *
}
if(is_R1_OK){
- out_offset1 = max(1, rec1->offset + rec1 -> soft_clipping_movements);
+ out_offset1 = max(1, rec1->offset);
out_mapping_quality1 = rec1->mapping_quality;
}
if(is_R2_OK){
- out_offset2 = max(1, rec2->offset + rec2 -> soft_clipping_movements);
+ out_offset2 = max(1, rec2->offset);
out_mapping_quality2 = rec2->mapping_quality;
}
@@ -3506,6 +3505,11 @@ int run_maybe_threads(global_context_t *global_context, int task)
void * thr_parameters [5];
int ret_value =0;
+ if(task == STEP_ITERATION_TWO){
+ global_context -> last_written_fragment_number = 0;
+ }
+
+
if(global_context->config.all_threads<2) {
thr_parameters[0] = global_context;
thr_parameters[1] = NULL;
@@ -3528,8 +3532,6 @@ int run_maybe_threads(global_context_t *global_context, int task)
memset(thread_contexts, 0, sizeof(thread_context_t)*64);
global_context -> all_thread_contexts = thread_contexts;
- if(task == STEP_ITERATION_TWO)
- global_context -> last_written_fragment_number = 0;
for(current_thread_no = 0 ; current_thread_no < global_context->config.all_threads ; current_thread_no ++)
{
@@ -3563,11 +3565,11 @@ int run_maybe_threads(global_context_t *global_context, int task)
global_context -> not_properly_pairs_only_one_end_mapped += thread_contexts[current_thread_no].not_properly_pairs_only_one_end_mapped;
global_context -> all_multimapping_reads += thread_contexts[current_thread_no].all_multimapping_reads;
global_context -> all_uniquely_mapped_reads += thread_contexts[current_thread_no].all_uniquely_mapped_reads;
+
}
ret_value += *(ret_values + current_thread_no);
if(ret_value)break;
}
-
for(current_thread_no = 0 ; current_thread_no < global_context->config.all_threads ; current_thread_no ++){
if(thread_contexts[current_thread_no].output_buffer_item > 0)
SUBREADprintf("ERROR: UNFINISHED OUTPUT!\n");
@@ -3685,7 +3687,6 @@ int read_chunk_circles(global_context_t *global_context)
global_context -> timecost_load_index += period_load_index;
global_context -> current_value_index = global_context -> all_value_indexes + global_context->current_index_block_number;
-
if(global_context->current_index_block_number ==0 && global_context -> all_processed_reads==0)
global_context->align_start_time = miltime();
diff --git a/src/gene-algorithms.c b/src/gene-algorithms.c
index cabadf9..58ff8aa 100644
--- a/src/gene-algorithms.c
+++ b/src/gene-algorithms.c
@@ -444,13 +444,13 @@ int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets ,
int total_offsets = offsets -> total_offsets;
- int GENE_LOCATE_JUMP = total_offsets/4;
+ int GENE_LOCATE_JUMP = total_offsets/3;
- while (GENE_LOCATE_JUMP > 5)
+ while (GENE_LOCATE_JUMP >3)
{
while(n+GENE_LOCATE_JUMP < total_offsets && offsets->read_offsets[n+GENE_LOCATE_JUMP] <= linear)
n+=GENE_LOCATE_JUMP;
- GENE_LOCATE_JUMP /=4;
+ GENE_LOCATE_JUMP /=3;
}
for (; offsets->read_offsets[n]; n++)
@@ -462,18 +462,29 @@ int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets ,
//SUBREADprintf("max=%u <= lim=%u : ACCEPTED.\n", rl + linear , offsets->read_offsets[n] + 16);
// the end of the read should not excess the end of the chromosome
- if(tail_cut_length == NULL){
- if(rl + linear > offsets->read_offsets[n] + 15 - offsets -> padding) return 1;
- } else {
- (*tail_cut_length) = linear + rl - ( offsets->read_offsets[n] + 15 - offsets -> padding);
- if( (*tail_cut_length) >= rl )return 1;
- }
if (n==0)
*pos = linear;
else
*pos = linear - offsets->read_offsets[n-1];
+
+ if(tail_cut_length == NULL){
+ if(rl + linear > offsets->read_offsets[n] + 15 - offsets -> padding) return 1;
+ } else {
+ unsigned int posn1 = 0;
+ if(n>0) posn1 = offsets->read_offsets[n-1];
+ long long int tct = ( linear + rl - posn1 - offsets -> padding );
+ if(tct < rl)tct = rl;
+ long long int chro_leng = (offsets->read_offsets[n] - posn1 - 2*offsets -> padding + 16);
+
+ //SUBREADprintf("CHRO_LEN : %lld, READ_TAIL %lld , RL=%d\n", chro_leng, tct, rl);
+ tct -= chro_leng;
+ if( tct >= rl )return 1;
+ if( tct <0 )tct=0;
+ (*tail_cut_length) = tct;
+ }
+
if( (*pos) < offsets -> padding ) {
if(head_cut_length == NULL || (*pos) + rl <= offsets -> padding){
return 1;
diff --git a/src/gene-value-index.c b/src/gene-value-index.c
index 7c51bf7..297388e 100644
--- a/src/gene-value-index.c
+++ b/src/gene-value-index.c
@@ -53,17 +53,56 @@ void gvindex_baseno2offset(unsigned int base_number, gene_value_index_t * index,
* offset_bit = base_number % 4 * 2;
}
+// Reports whether the linear position `linear` lies on real chromosome sequence, that is,
+// outside the padding at either end of the chromosome that contains it. Returns 1 when it
+// does, and also when no chromosome table is attached or no chromosome ends beyond `linear`.
+int is_offset_in_chro(gene_value_index_t * offsets, gehash_data_t linear){
+    int ret = 1;
+    if(offsets -> appendix1 && offsets -> appendix2){
+        gene_offset_t * chros = offsets -> appendix1;
+        int n = 0;
+        int total_offsets = chros -> total_offsets;
+
+        int LL = 0, RR = total_offsets-1;
+
+        // Bisect read_offsets to land near the chromosome that holds `linear`.
+        while(1){
+            if(LL >= RR-1) break;
+            int MM = (LL+RR)/2;
+            if( linear > chros->read_offsets[MM]) LL = MM;
+            else if(linear < chros->read_offsets[MM]) RR = MM;
+            else break;
+        }
+
+        // Step back two entries, then scan forward to the first boundary above `linear`.
+        n = max(0, LL - 2);
+
+        for (; n < chros -> total_offsets; n++) {
+            if (chros->read_offsets[n] > linear) {
+                // read_offsets[n-1] is where chromosome n begins; the first chromosome begins
+                // at 0, so read_offsets[-1] must never be read when n is 0.
+                unsigned int chro_start = n > 0 ? chros->read_offsets[n-1] : 0;
+                unsigned int pos = linear - chro_start;
+
+                if( pos < chros -> padding || pos >= ( chros->read_offsets[n] - chro_start - chros -> padding ))
+                    ret = 0;
+                SUBREADprintf("INCHRO:%d ; POS:%d\n", ret, pos);
+                break;
+            }
+        }
+    }
+    return ret;
+}
+
// return 'A', 'G', 'T' and 'C'
int gvindex_get(gene_value_index_t * index, gehash_data_t offset)
{
unsigned int offset_byte, offset_bit;
- gvindex_baseno2offset_m(offset, index , offset_byte, offset_bit);
+ //if(!is_offset_in_chro( index, offset ))return -1;
+ gvindex_baseno2offset_m(offset, index , offset_byte, offset_bit);
if(offset_byte >= index-> values_bytes -1)return 'N';
-
unsigned int one_base_value = (index->values [offset_byte]) >> (offset_bit);
-
-
//SUBREADprintf("RECV_BASE=%d (%d - %d)\n",one_base_value & 3, offset_byte , offset_bit);
return int2base(one_base_value & 3);
@@ -151,6 +190,7 @@ int gvindex_dump(gene_value_index_t * index, const char filename [])
int gvindex_load(gene_value_index_t * index, const char filename [])
{
+ memset(index,0, sizeof(gene_value_index_t));
FILE * fp = f_subr_open(filename, "rb");
int read_length;
read_length = fread(&index->start_point,4,1, fp);
diff --git a/src/input-files.c b/src/input-files.c
index 0a35d1b..9950f50 100644
--- a/src/input-files.c
+++ b/src/input-files.c
@@ -2510,6 +2510,7 @@ void SAM_pairer_destroy(SAM_pairer_context_t * pairer){
subread_destroy_lock(&pairer -> unsorted_notification_lock);
subread_destroy_lock(&pairer -> input_fp_lock);
subread_destroy_lock(&pairer -> output_header_lock);
+
delete_with_prefix(pairer -> tmp_file_prefix);
fclose(pairer -> input_fp);
free(pairer -> threads);
@@ -2706,6 +2707,7 @@ int SAM_pairer_fetch_BAM_block(SAM_pairer_context_t * pairer , SAM_pairer_thread
if(thread_context -> need_find_start){
int test_read_bin = SAM_pairer_find_start(pairer, thread_context);
+ //SUBREADprintf("THREAD [%d] IS_READ_BIN_OK=%d, %d\n", thread_context -> thread_id, test_read_bin, thread_context -> input_buff_BIN_used);
if(test_read_bin<1 && thread_context -> input_buff_BIN_used >= 32 ){
pairer -> is_bad_format = 1;
//SUBREADprintf("BIN REMAIN=%d, BAM USED=%d, BIN GENERATED=%d, BAM REMAIN=%d, TEST_READ_BIN=%d\n", remained_BIN, used_BAM, have, thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr, test_read_bin);
@@ -2838,6 +2840,7 @@ int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thr
if(record_len < 32 || record_len > min(MAX_BIN_RECORD_LENGTH,60000) || seq_len >= pairer -> long_read_minimum_length || thread_context -> input_buff_BIN_used < thread_context -> input_buff_BIN_ptr + record_len ){
if(seq_len >= pairer -> long_read_minimum_length) pairer -> is_single_end_mode = 1;
+ //SUBREADprintf("BADFMT: rlen %d; seqlen %d; room %d < %d\n", record_len, seq_len, thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr , record_len);
pairer -> is_bad_format = 1;
return 0;
}
@@ -3324,6 +3327,7 @@ int SAM_pairer_get_read_full_name( SAM_pairer_context_t * pairer , SAM_pairer_th
memcpy(&next_refID, bin + 24, 4);
memcpy(&next_pos, bin + 28, 4);
memcpy(full_name, bin+36, l_read_name);
+ assert(l_read_name > 0);
unsigned int r1_refID, r1_pos, r2_refID, r2_pos;
if(FLAG & 4){
@@ -3404,47 +3408,70 @@ void SAM_pairer_make_dummy(char * rname, char * bin1, char * out_bin2){
char * tmptr = NULL;
//SUBREADprintf("S=%s ", rname);
- char * realname = strtok_r(rname, "\027", &tmptr);
+ char * realname = bin1 + 36;
+ int block1len =-1;
int len_name = strlen(realname);
- int r1_chro = atoi(strtok_r(NULL, "\027", &tmptr));
- int r1_pos = atoi(strtok_r(NULL, "\027", &tmptr));
- int r2_chro = atoi(strtok_r(NULL, "\027", &tmptr));
- int r2_pos = atoi(strtok_r(NULL, "\027", &tmptr));
- int HItag = atoi(strtok_r(NULL, "\027", &tmptr));
- int mate_FLAG = 0;
- memcpy(&mate_FLAG, bin1 + 16, 4);
- mate_FLAG = 0xffff&(mate_FLAG >>16);
+ int r1_chro =-1;
+ int r1_pos =-1;
+ int r2_chro =-1;
+ int r2_pos =-1;
+
+ memcpy(&block1len, bin1, 4);
+ memcpy(&r1_chro, bin1 + 4, 4);
+ memcpy(&r1_pos, bin1 + 8, 4);
+
+ memcpy(&r2_chro, bin1 + 24, 4);
+ memcpy(&r2_pos, bin1 + 28, 4);
+
+ int HItag =-1;
+ int NHtag =-1;
+
+ int seq_len = -1;
+ int cigar_opts = -1;
+ memcpy(&seq_len, bin1+20,4);
+ int r1_FLAG = -1;
+ memcpy(&r1_FLAG, bin1 + 16, 4);
+ cigar_opts = r1_FLAG & 0xffff;
+
+ int bin1ptr = 36 + len_name +1 + seq_len + (seq_len+1)/2 + 4 * cigar_opts;
+ //SUBREADprintf("MAKE_DUMMY: %s\n", realname);
+ if( block1len + 4 > bin1ptr + 3 ){
+ SAM_pairer_iterate_int_tags(bin1+bin1ptr,block1len + 4 - bin1ptr, "NH", &NHtag);
+ SAM_pairer_iterate_int_tags(bin1+bin1ptr,block1len + 4 - bin1ptr, "HI", &HItag);
+ }
+
+ r1_FLAG = 0xffff&(r1_FLAG >>16);
int mate_tlen = 0;
memcpy(&mate_tlen, bin1 + 32, 4);
if(r1_chro<0) r1_pos=-1;
if(r2_chro<0) r2_pos=-1;
- int my_chro = (mate_FLAG&0x40)? r2_chro : r1_chro;
- int my_pos = (mate_FLAG&0x40)? r2_pos : r1_pos;
- int mate_chro = (mate_FLAG&0x40)? r1_chro : r2_chro;
- int mate_pos = (mate_FLAG&0x40)? r1_pos : r2_pos;
+ int my_chro = (r1_FLAG&0x40)? r2_chro : r1_chro;
+ int my_pos = (r1_FLAG&0x40)? r2_pos : r1_pos;
+ int mate_chro = (r1_FLAG&0x40)? r1_chro : r2_chro;
+ int mate_pos = (r1_FLAG&0x40)? r1_pos : r2_pos;
int bin_mq_nl = (len_name+1);
- int my_flag = (mate_FLAG&0x40)? 0x80:0x40;
- my_flag |= 1;
+ int r2_FLAG = (r1_FLAG&0x40)? 0x80:0x40;
+ r2_FLAG |= 1;
// Dummy reads should always be unmapped!
- //if(mate_FLAG & 8)my_flag |=4;
+ //if(r1_FLAG & 8)r2_FLAG |=4;
- if(mate_FLAG & 4)my_flag |=8;
- if(mate_FLAG & 8)my_flag |=4;
- if(mate_FLAG & 0x10) my_flag |= 0x20;
- if(mate_FLAG & 0x20) my_flag |= 0x10;
- my_flag = my_flag << 16;
+ if(r1_FLAG & 4)r2_FLAG |=8;
+ if(r1_FLAG & 8)r2_FLAG |=4;
+ if(r1_FLAG & 0x10) r2_FLAG |= 0x20;
+ if(r1_FLAG & 0x20) r2_FLAG |= 0x10;
+ r2_FLAG = r2_FLAG << 16;
memcpy(out_bin2+4, &my_chro,4);
memcpy(out_bin2+8, &my_pos,4);
memcpy(out_bin2+12, &bin_mq_nl, 4);
- memcpy(out_bin2+16, &my_flag, 4);
+ memcpy(out_bin2+16, &r2_FLAG, 4);
- my_flag = 1;
- memcpy(out_bin2+20, &my_flag, 4);
+ r2_FLAG = 1;
+ memcpy(out_bin2+20, &r2_FLAG, 4);
memcpy(out_bin2+24, &mate_chro, 4);
memcpy(out_bin2+28, &mate_pos, 4);
@@ -3455,24 +3482,43 @@ void SAM_pairer_make_dummy(char * rname, char * bin1, char * out_bin2){
out_bin2[36 + len_name+2] = 0x20;
int all_len = 36 + len_name + 3 - 4;
+ int tag_ptr = 36 + len_name + 3;
//SUBREADprintf("HI=%d\n", HItag);
- if(HItag>=0){
- out_bin2[36 + len_name+3]='H';
- out_bin2[36 + len_name+4]='I';
+ if(HItag>0){
+ out_bin2[tag_ptr++]='H';
+ out_bin2[tag_ptr++]='I';
if(HItag<128){
- out_bin2[36 + len_name+5]='C';
- memcpy(out_bin2 + 36 + len_name+6, &HItag, 1);
+ out_bin2[tag_ptr++]='C';
+ memcpy(out_bin2 + (tag_ptr++), &HItag, 1);
all_len += 4;
}else if(HItag<32767){
- out_bin2[36 + len_name+5]='S';
+ out_bin2[(tag_ptr+=2)]='S';
memcpy(out_bin2 + 36 + len_name+6, &HItag, 2);
all_len += 5;
}else {
- out_bin2[36 + len_name+5]='I';
+ out_bin2[(tag_ptr+=4)]='I';
memcpy(out_bin2 + 36 + len_name+6, &HItag, 4);
all_len += 7;
}
}
+ if(NHtag>0){
+ out_bin2[tag_ptr++]='N';
+ out_bin2[tag_ptr++]='H';
+ if(NHtag<128){
+ out_bin2[tag_ptr++]='C';
+ memcpy(out_bin2 + (tag_ptr++), &NHtag, 1);
+ all_len += 4;
+ }else if(NHtag<32767){
+ out_bin2[(tag_ptr+=2)]='S';
+ memcpy(out_bin2 + 36 + len_name+6, &NHtag, 2);
+ all_len += 5;
+ }else {
+ out_bin2[(tag_ptr+=4)]='I';
+ memcpy(out_bin2 + 36 + len_name+6, &NHtag, 4);
+ all_len += 7;
+ }
+ }
+
memcpy(out_bin2,&all_len,4);
}
@@ -3608,11 +3654,11 @@ int SAM_pairer_do_next_read( SAM_pairer_context_t * pairer , SAM_pairer_thread_t
int bin_len = 0, this_flags = 0;
int has_next_read = SAM_pairer_get_next_read_BIN(pairer, thread_context, &bin, &bin_len);
- //#warning "============COMMENT NEXT =================="
- //SUBREADprintf("GOT READ: BINLEN=%d\n", bin_len);
- if(has_next_read){
+ if(has_next_read && !pairer -> is_bad_format){
int name_len = SAM_pairer_get_read_full_name(pairer, thread_context, bin, bin_len, read_full_name, & this_flags);
+ //SUBREADprintf("GOT READ %s, : BINLEN=%d\n", read_full_name , bin_len);
+
if(pairer -> is_single_end_mode == 0 && ( this_flags & 1 ) == 1){ // if the reads are PE
if(strcmp(read_full_name , thread_context -> immediate_last_read_full_name) == 0){
if(pairer -> output_function)
@@ -4178,6 +4224,7 @@ int is_read_bin(char * bin, int bin_len, int max_refID){
int min_mq_nl;
memcpy(&min_mq_nl, bin + 12, 4);
int name_len = min_mq_nl & 0xff;
+ if(name_len < 1) return -20;
int flag_nc;
memcpy(&flag_nc, bin + 16, 4);
int cigar_opts = flag_nc & 0xffff;
@@ -4195,7 +4242,6 @@ int is_read_bin(char * bin, int bin_len, int max_refID){
if(block_len < 32 + name_len + 4*cigar_opts + l_seq + (l_seq+1)/2) return -11;
-
int cigar_i;
for(cigar_i = 0; cigar_i < cigar_opts ; cigar_i++){
int cigar_v;
@@ -4239,8 +4285,8 @@ int is_read_bin(char * bin, int bin_len, int max_refID){
}
int SAM_pairer_find_start(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context ){
- thread_context -> need_find_start = 0;
if(FAST_PICARD_BAM_PROCESSING){
+ thread_context -> need_find_start = 0;
int start_pos = 0;
for(start_pos = 0; start_pos < min(MAX_BIN_RECORD_LENGTH, thread_context -> input_buff_BIN_used); start_pos++){
if(is_read_bin((char *)thread_context -> input_buff_BIN + start_pos, thread_context -> input_buff_SBAM_used - start_pos , pairer -> BAM_n_ref)){
@@ -4267,8 +4313,8 @@ void * SAM_pairer_thread_run( void * params ){
while(1){
subread_lock_occupy(&pairer -> input_fp_lock);
if(pairer -> BAM_header_parsed || thread_no == 0){
- SAM_pairer_fill_BIN_buff(pairer, thread_context, &is_finished);
thread_context -> need_find_start = pairer -> BAM_header_parsed;
+ SAM_pairer_fill_BIN_buff(pairer, thread_context, &is_finished);
thread_context -> chunk_number = pairer -> input_chunk_no;
pairer -> input_chunk_no ++;
}
@@ -4668,8 +4714,9 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
if((( etag_name0 == 'H' && etag_name1 == 'I' ) ||
( etag_name0 == 'N' && etag_name1 == 'H' ) ||
+ ( etag_name0 == 'R' && etag_name1 == 'G' ) ||
( etag_name0 == 'N' && etag_name1 == 'M' )
- ) && ( etag_type == 'c' || etag_type == 'C'||etag_type == 's'||etag_type == 'S'||etag_type == 'i'||etag_type == 'I')
+ ) && ( etag_type == 'c' || etag_type=='Z' || etag_type == 'C'||etag_type == 's'||etag_type == 'S'||etag_type == 'i'||etag_type == 'I')
){
FIX_APPEND_READ(&etag_name0,1);
FIX_APPEND_READ(&etag_name1,1);
@@ -4678,10 +4725,16 @@ int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
// SUBREADprintf("ADDED INTO BAM\n");
}
if(etag_type == 'Z'||etag_type =='H'){
+ if(this_tag_output) extag_new_len +=3;
while(1){
FIX_GET_NEXT_NCH;
if(nch < 0) return -1;
+ if(this_tag_output){
+ FIX_APPEND_READ(&nch, 1);
+ extag_new_len++;
+ }
x1++;
+ assert(x1 < 20000);
if(nch == 0)break;
}
}else if(etag_type == 'A'){
@@ -5301,8 +5354,6 @@ int SAM_pairer_run( SAM_pairer_context_t * pairer){
assert(1 != corrected_run);
delete_with_prefix(pairer -> tmp_file_prefix);
pairer -> is_internal_error |= SAM_pairer_fix_format(pairer);
- //#warning ">>>>>> COMMENT NEXT LINE IN RELEASE <<<<<<"
- //SUBREADprintf("Retrying with the corrected format... (%d)\n", pairer -> is_bad_format);
if(pairer -> is_bad_format || pairer -> is_internal_error)
return -1;
diff --git a/src/longread-mapping/LRMbase-index.c b/src/longread-mapping/LRMbase-index.c
new file mode 100644
index 0000000..f9e951d
--- /dev/null
+++ b/src/longread-mapping/LRMbase-index.c
@@ -0,0 +1,285 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "LRMconfig.h"
+#include "LRMbase-index.h"
+#include "LRMfile-io.h"
+
+
+#define LRMgvindex_baseno2offset_m(base_number, index, offset_byte, offset_bit) {offset_byte = (base_number - index -> start_base_offset) >>2; offset_bit = base_number % 4 * 2;}
+
+
+// Locates base `base_number` in the packed value array: four bases per byte, two bits per base.
+void LRMgvindex_baseno2offset(unsigned int base_number, LRMgene_value_index_t * index, unsigned int * offset_byte, unsigned int * offset_bit)
+{
+    // start_base_offset is the base number that corresponds to bit 0 of the whole array.
+    unsigned int bases_from_start = base_number - index -> start_base_offset;
+
+    *offset_byte = bases_from_start / 4;
+    *offset_bit = (base_number & 3u) << 1;
+}
+
+// Returns the letter LRMint2base gives for the 2-bit code stored at base position `offset`,
+// or 'N' when that position maps to the last byte of the value array or beyond it.
+int LRMgvindex_get(LRMgene_value_index_t * index, LRMgehash_data_t offset)
+{
+    unsigned int byte_no, bit_no;
+
+    LRMgvindex_baseno2offset_m(offset, index , byte_no, bit_no);
+    if(index-> values_bytes -1 <= byte_no)
+        return 'N';
+
+    // Bring the wanted pair of bits down to the low end; the mask is applied in the call.
+    unsigned int two_bits = (index->values [byte_no]) >> (bit_no);
+
+    return LRMint2base(two_bits & 3);
+}
+
+// Compares the 16 reference bases starting at `offset` with the 16 two-bit codes packed in
+// `base_values` (code i is read from bits 31-2i and 30-2i); bit i of the result is set on a match.
+int LRMgvindex_match(LRMgene_value_index_t * index, LRMgehash_data_t offset, LRMgehash_key_t base_values)
+{
+    unsigned int offset_byte, offset_bit;
+    int i, ret = 0;
+
+    LRMgvindex_baseno2offset_m(offset, index , offset_byte, offset_bit);
+    for (i=0; i<16; i++)
+    {
+        // A base occupies bits offset_bit and offset_bit+1 of its byte, so it is extracted by
+        // shifting right by offset_bit, exactly as LRMgvindex_get does; any other shift misreads it.
+        unsigned char one_base_value = (index->values [offset_byte] >> offset_bit) & 0x3;
+        if ( ((base_values >> (30 - i*2)) & 0x3) == one_base_value)
+            ret |= 1 << i;
+
+        offset_bit +=2;
+        if(offset_bit >=8)
+        {
+            offset_bit = 0;
+            offset_byte ++;
+        }
+    }
+    return ret;
+}
+// Loads the packed base-value index stored in `filename` into `index`. Returns 0 on success
+// and 1 on any failure; once the file has been opened it is closed on every path.
+int LRMgvindex_load(LRMgene_value_index_t * index, const char filename [])
+{
+    int read_length, ret = 1;
+    unsigned int useful_bytes, useful_bits;
+    FILE * fp = fopen(filename, "rb");
+    if(!fp){
+        LRMprintf("ERROR: unable to open the array index '%s'\n", filename);
+        return 1;
+    }
+    read_length = fread(&index->start_point,4,1, fp);   // header field 1: 4 bytes
+    if(read_length<1){
+        LRMprintf("ERROR: the array index is incomplete : %d", read_length );
+        goto finish;
+    }
+    read_length = fread(&index->length,4,1, fp);   // header field 2: 4 bytes
+    if(read_length<1){
+        LRMprintf("Bad index\n");
+        goto finish;
+    }
+    index -> start_base_offset = index -> start_point - index -> start_point%4;   // round down to a multiple of 4
+    LRMgvindex_baseno2offset (index -> length+ index -> start_point, index ,&useful_bytes,&useful_bits);
+    index -> values = malloc(useful_bytes+1);
+    index -> values_bytes = useful_bytes+1;
+    if(!index->values){
+        LRMprintf("Out of memory\n");
+        goto finish;
+    }
+    read_length =fread(index->values, 1, useful_bytes+1, fp);
+    if(read_length < useful_bytes){   // a read one byte short of the request is still accepted
+        LRMprintf("ERROR: the array index is incomplete : %d < %d.", read_length, useful_bytes+1 );
+        goto finish;
+    }
+    ret = 0;
+finish:
+    fclose(fp);
+    return ret;
+}
+// Same lookup as LRMgvindex_get, with the base position given as a plain unsigned int.
+int gvindex_get(LRMgene_value_index_t * index, unsigned int offset){
+    unsigned int byte_no, bit_no;
+    LRMgvindex_baseno2offset_m(offset, index , byte_no, bit_no);
+    if(index-> values_bytes -1 <= byte_no) return 'N';
+
+    unsigned int two_bits = (index->values [byte_no]) >> (bit_no);
+    return LRMint2base(two_bits & 3);
+}
+
+// Copies `len` reference bases starting at `pos` into `buf`; no terminating NUL is written. For the
+// negative strand the bases are written mirrored and complemented; anything but A, G or C becomes 'A'.
+void LRMgvindex_get_string(char *buf, LRMgene_value_index_t * index, unsigned int pos, int len, int is_negative_strand){
+    int k;
+    if (!is_negative_strand){
+        for (k=0;k<len;k++)
+            buf[k] = LRMgvindex_get (index, pos +k);
+        return;
+    }
+    for (k=0;k<len;k++)
+    {
+        char base = LRMgvindex_get (index, pos + k);
+        char flipped = 'A';
+        if(base == 'A') flipped = 'T';
+        else if(base == 'G') flipped = 'C';
+        else if(base == 'C') flipped = 'G';
+        buf[len - 1 - k] = flipped;
+    }
+}
+// Walks `cigar` from reference position `pos` and returns the number of read bases that LRMmatch_chro reports as matching over all M sections.
+// The length of every M section is added to *maplen. With `neg` set, LRMreverse_read is applied to `read` on entry and again before returning.
+int LRMvalidate_mapping(LRMcontext_t * context, char * read, char * cigar, LRMgene_value_index_t * index, unsigned int pos, int neg, int * maplen, int show_txt){
+ unsigned int chro_cursor = pos;
+ int read_chrsor = 0, all_matched = 0, all_mismatched = 0;
+ int tmpi = 0,cigar_i, nch, tmpi_sign = 1, x1;
+ // NOTE(review): all_mismatched is accumulated below but never read or returned.
+ if(neg) LRMreverse_read(read, strlen(read));
+ // show_txt turns on a printed trace of the walk; the escape sequences (character 27 + "[..m") look like ANSI colour codes.
+ if(show_txt){
+ char postxt[100];
+ LRMpos2txt(context, chro_cursor, postxt);
+ LRMprintf("Starting Pos : Read + %d ( %s )\n", read_chrsor, postxt);
+ }
+ // Digits build the section length in tmpi, a '-' makes it negative, and any other character is taken as the operation that closes the section.
+ for(cigar_i = 0; (nch = cigar[cigar_i])!=0; cigar_i++){
+ if(nch == '-') tmpi_sign = -1;
+ else if(nch >='0' && nch <='9'){
+ tmpi = tmpi * 10 + (nch - '0');
+ }else{
+ tmpi *= tmpi_sign;
+ if(nch == 'M'){
+ int this_matched = LRMmatch_chro(read + read_chrsor, index, chro_cursor, tmpi, 0);
+ if(show_txt){
+ unsigned txt_chro_cursor = chro_cursor;
+ int txt_read_chrsor = read_chrsor;
+ for(x1 = 0; x1 < tmpi ; x1++){
+ int knownval = LRMgvindex_get(index, txt_chro_cursor);
+ int readval = read[txt_read_chrsor];
+ txt_chro_cursor ++;
+ txt_read_chrsor ++;
+
+ LRMprintf("%c[3%dm%c", 27, knownval == readval ? 7:1, readval);
+ }
+ }
+ // The leading 0 keeps this warning switched off.
+ if(0 && abs(tmpi) > 22 && this_matched < tmpi * 0.6){
+ LRMprintf("Too many mismatched (%d%c) : %d / %d : read + %d\n", tmpi, nch, this_matched, tmpi, read_chrsor);
+ }
+
+ all_matched += this_matched;
+ all_mismatched += ( tmpi - this_matched );
+ (*maplen) += tmpi;
+ }
+ // M, I and S advance the read cursor.
+ if(nch == 'M' || nch == 'I' || nch == 'S'){
+ if(nch == 'I' && show_txt) for(x1 = 0; x1 < tmpi ; x1++) LRMprintf("%c[32m%c", 27, read[ read_chrsor + x1 ]);
+ if(nch == 'S' && show_txt) for(x1 = 0; x1 < tmpi ; x1++) LRMprintf("%c[4m%c%c[0m", 27, read[ read_chrsor + x1 ], 27);
+ read_chrsor += tmpi;
+ }
+ if(nch == 'M' || nch == 'D' || nch == 'N' || nch == 'S'){ // M, D and N advance the reference cursor; S is listed but excluded two lines below
+ if(( nch == 'N' || nch == 'D' ) && show_txt) LRMprintf("%c[36m//%c[37m", 27, 27);
+ if(nch != 'S')chro_cursor += tmpi;
+ }
+ // Reset the length accumulator for the next section.
+ tmpi = 0;
+ tmpi_sign = 1;
+ }
+ }
+ if(show_txt){
+ char postxt[100];
+ LRMpos2txt(context, chro_cursor, postxt);
+ LRMprintf("%c[37m\n", 27);
+ LRMprintf("Ending Pos : Read + %d ( %s )\n", read_chrsor, postxt);
+ }
+ if(neg) LRMreverse_read(read, strlen(read)); // second application; presumably restores the caller's read - TODO confirm
+ return all_matched;
+}
+// Counts how many of the `test_len` bases in `read` agree with the reference starting at `pos`; returns 0 when the window is rejected by the range checks below.
+int LRMmatch_chro(char * read, LRMgene_value_index_t * index, unsigned int pos, int test_len, int is_negative_strand){
+ int ret = 0;
+ int i;
+ // Reject windows that reach the end of the indexed range, and positions above 0xffff0000.
+ if ((unsigned int)(pos + test_len) >= index -> length + index -> start_point) return 0;
+ if (pos > 0xffff0000) return 0;
+ // Negative strand: read[i] is compared with the complement of the reference base at the mirrored position.
+ if (is_negative_strand)
+ {
+ for (i=test_len -1;i>=0;i--)
+ {
+ char tt = LRMgvindex_get (index, pos+test_len-1-i);
+ switch(tt)
+ {
+ case 'A': ret += read[i] == 'T'; break;
+ case 'T': ret += read[i] == 'A'; break;
+ case 'G': ret += read[i] == 'C'; break;
+ case 'C': ret += read[i] == 'G'; break;
+ }
+ }
+ }else{
+ unsigned int offset_byte, offset_bit;
+ // Positive strand: walk the packed array directly, two bits per base, without calling LRMgvindex_get.
+ LRMgvindex_baseno2offset_m(pos, index , offset_byte, offset_bit);
+
+ if(offset_byte >= index-> values_bytes)return 0;
+ char int_value = index->values [offset_byte];
+ // Codes tested below: 'A' is 0, 'G' is 1, 'C' is 2; a NUL never counts; any other read character counts when the code is 3.
+ for (i=0;i<test_len;i++)
+ {
+ char tt = (int_value >> offset_bit) & 3;
+ char tv = read[i];
+ switch(tv){
+ case 'A':
+ ret += tt==0;
+ break;
+ case 'G':
+ ret += tt==1;
+ break;
+ case 'C':
+ ret += tt==2;
+ break;
+ case 0:
+ break;
+ default:
+ ret += tt==3;
+
+ }
+ offset_bit+=2;
+ if(offset_bit==8)
+ {
+ offset_byte++;
+ if(offset_byte == index-> values_bytes)return 0; // running off the array discards the matches counted so far
+ int_value = index->values [offset_byte];
+ offset_bit = 0;
+ }
+ }
+ }
+ return ret;
+}
+// Frees the packed base array of `index` and clears the pointer, so a repeated call is harmless.
+void LRMgvindex_destory(LRMgene_value_index_t * index){
+    free(index -> values);
+    index -> values = NULL;
+}
diff --git a/src/longread-mapping/LRMbase-index.h b/src/longread-mapping/LRMbase-index.h
new file mode 100644
index 0000000..9aed2cf
--- /dev/null
+++ b/src/longread-mapping/LRMbase-index.h
@@ -0,0 +1,38 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+
+#ifndef __LRMBASE_INDEX_H_
+#define __LRMBASE_INDEX_H_
+
+#include "LRMconfig.h"
+
+int LRMgvindex_load(LRMgene_value_index_t * index, const char filename []);
+
+void LRMgvindex_destory(LRMgene_value_index_t * index);
+
+void LRMgvindex_baseno2offset(unsigned int base_number, LRMgene_value_index_t * index, unsigned int * offset_byte, unsigned int * offset_bit);
+
+int LRMgvindex_get(LRMgene_value_index_t * index, LRMgehash_data_t offset);
+
+int LRMmatch_chro(char * read, LRMgene_value_index_t * index, unsigned int pos, int test_len, int is_negative_strand);
+
+void LRMgvindex_get_string(char *buf, LRMgene_value_index_t * index, unsigned int pos, int len, int is_negative_strand);
+
+int LRMvalidate_mapping(LRMcontext_t * context, char * read, char * cigar, LRMgene_value_index_t * index, unsigned int pos, int rev, int * mapped_length, int show_txt);
+#endif
diff --git a/src/longread-mapping/LRMchro-event.c b/src/longread-mapping/LRMchro-event.c
new file mode 100644
index 0000000..2288222
--- /dev/null
+++ b/src/longread-mapping/LRMchro-event.c
@@ -0,0 +1,1241 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include "LRMconfig.h"
+#include "LRMchro-event.h"
+#include "LRMfile-io.h"
+#include "LRMbase-index.h"
+#include "LRMhelper.h"
+
+#define JUNCTION_CONFIRM_WINDOW 14
+
+#define ceq(c,t) ((c)[0]==(t)[0] && (c)[1]==(t)[1])
+#define c2eq(ch1, ch2, tg1, tg2) ((ceq(ch1, tg1) && ceq(ch2, tg2)) || (ceq(ch1, tg2) && ceq(ch2, tg1)) )
+
+#define LRMis_donor_chars(cc) (((cc)[0]=='G' && (cc)[1]=='T') || \
+ ((cc)[0]=='A' && (cc)[1]=='G') || \
+ ((cc)[0]=='A' && (cc)[1]=='C') || \
+ ((cc)[0]=='C' && (cc)[1]=='T') ||\
+ ((cc)[0]=='G' && (cc)[1]=='C'))
+
+
+// Returns 1 when ch1 and ch2 begin with, in this order, GC+AG, GT+AG or CT+AC; 0 otherwise.
+int LRMpaired_chars(char * ch1, char * ch2){
+    if(ceq(ch2, "AG")) return ceq(ch1, "GC") || ceq(ch1, "GT");
+    if(ceq(ch2, "AC")) return ceq(ch1, "CT");
+    return 0;
+}
+
+// Ordering used by the event sort. Events are ranked by ascending small_side, then large_side,
+// then event_type, then by descending indel_length. Never returns 0: a full tie yields -1.
+int LRMscanning_events_compare(void * arr, int l, int r){
+    void ** sort_args = (void **) arr;
+    LRMcontext_t * context = sort_args[0];
+    int * event_ids = sort_args[1];
+    LRMevent_t * lhs = context -> event_space+event_ids[l];
+    LRMevent_t * rhs = context -> event_space+event_ids[r];
+
+    if(lhs->small_side != rhs->small_side)
+        return lhs->small_side > rhs->small_side ? 1 : -1;
+    if(lhs->large_side != rhs->large_side)
+        return lhs->large_side > rhs->large_side ? 1 : -1;
+    if(lhs->event_type != rhs->event_type)
+        return lhs->event_type > rhs->event_type ? 1 : -1;
+
+    // Reversed on purpose: the larger indel_length sorts first (prefer del than ins).
+    if(lhs -> indel_length != rhs -> indel_length)
+        return lhs -> indel_length > rhs -> indel_length ? -1 : 1;
+
+    return -1;
+}
+// Merge step of the event sort: combines the runs records[start, start+items) and
+// records[start+items, start+items+items2) into one run, through a temporary buffer.
+void LRMscanning_events_merge(void * arr, int start, int items, int items2){
+    int * records = ((void **) arr)[1];
+    int total = items + items2, left_end = start + items, right_end = left_end + items2;
+    int left = start, right = left_end, out = 0;
+    int * merged = malloc(sizeof(int) * total);
+
+    while(out < total){
+        // The right run supplies the next id when the left run is used up, or when the left head compares greater.
+        int take_right = left >= left_end || (right < right_end && LRMscanning_events_compare(arr, left, right) > 0);
+        merged[out++] = take_right ? records[right++] : records[left++];
+    }
+
+    memcpy(records + start, merged, sizeof(int) * total);
+    free(merged);
+}
+
+// Swap callback for the event sort: exchanges the ids stored at positions l and r of the
+// id array passed as arr[1]. arr[0] is not used here.
+void LRMscanning_events_exchange(void * arr, int l, int r){
+    int * ids = ((void **) arr)[1];
+    int held = ids[r];
+
+    // Plain three-step swap through `held`.
+    ids[r] = ids[l];
+    ids[l] = held;
+}
+// Registers every event under both of its breakpoints (small_side and large_side) in
+// context->events_realignment. The value kept per site is an int array: [0] is the number of
+// slots, the slots hold (event index + 1), and a 0 ends the list early. Always returns 0.
+int LRMevents_build_entries(LRMcontext_t * context){
+    int x1,side_i;
+    for(x1=0; x1 < context->event_number; x1++){
+        LRMevent_t * te = context->event_space+ x1;
+        for(side_i = 0; side_i <2; side_i++){
+            unsigned int sidepos = side_i?te->large_side:te->small_side;
+            int * entries_list = HashTableGet(context -> events_realignment, NULL+sidepos);
+            if(NULL == entries_list){
+                entries_list = malloc(sizeof(int) * 3);
+                if(!entries_list){
+                    LRMprintf("ERROR: NO MEMORY CAN BE ALLOCATED!\n");
+                    assert(0);
+                }
+                entries_list[0]=2;
+                entries_list[1]=0;
+                HashTablePut(context -> events_realignment , NULL+sidepos, entries_list);
+            }
+            int x2 = 0, inserted = 0;
+            for(x2=1; x2< 1+ min( LRMMAX_EVENTS_PER_SITE , entries_list[0] ); x2++){
+                if(0 == entries_list[x2]){   // first free slot
+                    entries_list[x2] = x1+1;
+                    if( x2 < entries_list[0] )entries_list[x2+1]=0;
+                    inserted = 1;
+                    break;
+                }
+            }
+            if((!inserted) && entries_list[0] < LRMMAX_EVENTS_PER_SITE){
+                // Every slot is taken: grow to the maximum and append AFTER the last used slot, keeping its event.
+                int last_x1 = entries_list[0];
+                int * grown = realloc(entries_list, sizeof(int)*(1+LRMMAX_EVENTS_PER_SITE));
+                if(!grown){
+                    LRMprintf("ERROR: NO MEMORY CAN BE ALLOCATED!\n");
+                    assert(0);
+                }
+                entries_list = grown;
+                entries_list[0] = LRMMAX_EVENTS_PER_SITE;
+                entries_list[last_x1+1] = x1+1;
+                if(last_x1+1 < LRMMAX_EVENTS_PER_SITE) entries_list[last_x1+2] = 0;
+                HashTablePut(context -> events_realignment, NULL+sidepos, entries_list);
+            }
+        }
+    }
+    return 0;
+}
+// Collapses runs of identical events (same large_side, small_side, event_type and indel_length), taken in the
+// order given by `order_index`, into one event each; supporting_reads holds the run length. Replaces context->event_space.
+void LRMevents_reorder_merge_next(LRMcontext_t * context, int *order_index){
+    int capacity = 10000, kept = 0, x1;
+    LRMevent_t * compacted = malloc(sizeof(LRMevent_t) * capacity);
+    LRMevent_t * run_head = NULL;
+    // The extra round at x1 == event_number has no current event and only flushes the last run.
+    for(x1=0; x1 <=context->event_number; x1++){
+        LRMevent_t * current = NULL;
+        if(x1 < context->event_number) current = context->event_space+order_index[x1];
+        int same_as_run = x1 < context->event_number && run_head!=NULL &&
+            run_head->large_side == current->large_side &&
+            run_head->small_side == current->small_side &&
+            run_head->event_type == current->event_type &&
+            run_head->indel_length == current->indel_length;
+        if(same_as_run){
+            run_head -> supporting_reads ++;
+            continue;
+        }
+        if(capacity -1 < kept){
+            capacity*=1.7;
+            compacted = realloc(compacted, sizeof(LRMevent_t)*capacity);
+        }
+        if(run_head) memcpy(compacted+(kept++), run_head, sizeof(LRMevent_t));
+        if(current){
+            run_head = current;
+            run_head -> supporting_reads = 1;
+        }
+    }
+    free(context -> event_space);
+    context -> event_space = compacted;
+    context -> event_space_size = capacity;
+    context -> event_number = kept;
+}
+
+/*
+ * Sort the events held in context and merge duplicates.
+ *
+ * An index array 0..event_number-1 is sorted with merge_sort using the
+ * LRMscanning_events_* callbacks, then LRMevents_reorder_merge_next() rebuilds
+ * context->event_space following that order.
+ *
+ * Returns 0 on success, or 1 when the index array cannot be allocated (the
+ * context is left untouched in that case).
+ */
+int LRMevents_reorder(LRMcontext_t * context){
+    int * order_index = malloc(context -> event_number*sizeof(int));
+    int x1=0;
+
+    // malloc(0) may legitimately return NULL, so only a failed non-empty request is an error.
+    if(!order_index && context -> event_number > 0){
+        LRMprintf("ERROR: NO MEMORY CAN BE ALLOCATED!\n");
+        return 1;
+    }
+
+    while(x1<context -> event_number){
+        order_index[x1]=x1;
+        x1++;
+    }
+    // The sort callbacks receive this pair: [0] the context, [1] the index array.
+    void * sort_arr[2];
+    sort_arr [0] = context;
+    sort_arr [1] = order_index;
+
+    merge_sort(sort_arr, context -> event_number, LRMscanning_events_compare, LRMscanning_events_exchange, LRMscanning_events_merge);
+    LRMevents_reorder_merge_next(context, order_index);
+
+    // Disabled debugging dump of the merged table.
+    if(0){
+        LRMprintf("Total events : %d\n", context -> event_number);
+        for(x1=0; x1<context -> event_number; x1++){
+            LRMevent_t * te = context->event_space+ x1;
+            if(1 || te -> small_side == 457511654){
+
+                char pos1txt[100], pos2txt[100];
+                LRMpos2txt(context, te->small_side, pos1txt);
+                LRMpos2txt(context, te->large_side, pos2txt);
+
+                LRMprintf("SORTED EVENT: TYPE: %d - INS %d %s ~ %s, nsup=%d\n", te -> event_type, te -> indel_length, pos1txt, pos2txt, te->supporting_reads);
+            }
+        }
+    }
+
+    free(order_index);
+    return 0;
+}
+
+/*
+ * Append a copy of new_event to the shared event table in context.
+ *
+ * Events of type LRMEVENT_TYPE_INDEL are not stored; the function returns 0 for
+ * them without touching the table.  For every other type the table is grown when
+ * full and the event is copied in while context->event_space_lock is held.
+ *
+ * thread_context and iteration_context are not used here.
+ *
+ * Returns 0 on success, or 1 when the table cannot be grown.  On failure the
+ * existing table is kept intact and the lock is released before returning.
+ */
+int LRMchro_event_new(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMevent_t * new_event){
+    if(new_event -> event_type == LRMEVENT_TYPE_INDEL) return 0;
+
+    LRMthread_lock(&context -> event_space_lock);
+    if(context -> event_space_size < context -> event_number + 1 ){
+        size_t needed_size = (size_t)context -> event_number + 1;
+        size_t grown_size = (size_t)(context -> event_space_size * 1.7);
+        // A capacity of 0 or 1 does not grow when multiplied by 1.7 and truncated.
+        if(grown_size < needed_size) grown_size = needed_size;
+
+        // Use a temporary: on failure realloc leaves the old block valid.
+        LRMevent_t * grown_space = realloc(context -> event_space, sizeof(LRMevent_t) * grown_size);
+        if(!grown_space){
+            LRMthread_lockrelease(&context -> event_space_lock);
+            return 1;
+        }
+        context -> event_space = grown_space;
+        context -> event_space_size = grown_size;
+    }
+    memcpy(context -> event_space+context -> event_number, new_event, sizeof(LRMevent_t));
+    context -> event_number++;
+    LRMthread_lockrelease(&context -> event_space_lock);
+
+    return 0;
+}
+
+#define LRMDP_score(x,y) dynamic_score_buffer[ (x)*dynamic_row_width + (y) ] /* score cell at row x, column y; expands to names that must exist in the using scope: dynamic_score_buffer, dynamic_row_width */
+#define LRMDP_move(x,y) dynamic_movement_BEFORE_buffer[ (x)*dynamic_row_width + (y) ] /* move character stored for row x, column y; needs dynamic_movement_BEFORE_buffer and dynamic_row_width in scope */
+
+int tested = 0; /* NOTE(review): in this part of the file it is referenced only from commented-out "exit after N calls" debugging lines */
+
+/*
+ * Debug helper: print the dynamic-programming matrix through LRMprintf.
+ *
+ * The first line labels every column with (column - context->max_dynamic_indel_length).
+ * Each following line shows best_offset_history[row] (or -1 when that array is
+ * NULL), the row number, and then the score and move character of every cell.
+ */
+void LRMindel_dynamic_search_debug(LRMcontext_t* context, int * dynamic_score_buffer, char * dynamic_movement_BEFORE_buffer, int dynamic_row_width, int dynamic_rows, int *best_offset_history){
+    int row, col;
+
+    // Column header.
+    LRMprintf(" ");
+    col = 0;
+    while(col < dynamic_row_width){
+        LRMprintf(" % 4d ", col - context -> max_dynamic_indel_length);
+        col++;
+    }
+    LRMprintf("\n");
+
+    // One printed line per matrix row.
+    row = 0;
+    while(row < dynamic_rows){
+        int row_offset = -1;
+        if(best_offset_history) row_offset = best_offset_history[ row ];
+
+        LRMprintf("%4d | %4d ", row_offset, row);
+        for(col = 0; col < dynamic_row_width; col++)
+            LRMprintf("% 4d %c ", LRMDP_score(row,col), LRMDP_move(row,col));
+        LRMprintf("\n");
+        row++;
+    }
+}
+
+#define LRMSOFTCLIPPING_WINDOW 30
+#define LRMSOFTCLIPPING_MATCHED 25
+/*
+ * Rewrite the low-quality head of a move string as soft clipping.
+ *
+ * move_buff holds `moves` characters.  A window of LRMSOFTCLIPPING_WINDOW aligned
+ * bases ('M' or 'X') is slid from index moves-1 towards index 0 and stops at the
+ * first position where it holds fewer than LRMSOFTCLIPPING_MATCHED 'M' moves.
+ * Inside the final window the lowest-indexed 'M' that is reached becomes the
+ * clipping boundary.  Moves at and above the boundary are kept; below it, one 'S'
+ * is written per read base that the kept moves do not account for (bases_in_read
+ * minus the kept 'M'/'X'/'I' moves), and any remaining lower entries become '.'.
+ *
+ * Returns 0 on success, or -1 when more 'S' entries are required than there are
+ * positions below the boundary.  context and thread_context are not used.
+ */
+int LRMsoftclipping_moves(LRMcontext_t* context, LRMthread_context_t* thread_context, char * move_buff, int moves, int bases_in_read){
+    int pos;
+    int win_hi = moves - 1;    // highest index still inside the window
+    int win_lo;                // lowest index inside the window
+    int n_aligned = 0;         // 'M' + 'X' counted in the window
+    int n_matched = 0;         // 'M' counted in the window
+
+    // Seed the window: walk down from the top until it holds WINDOW aligned bases.
+    pos = moves - 1;
+    while(pos >= 0){
+        char mv = move_buff[pos];
+        if(mv == 'M' || mv == 'X'){
+            n_aligned++;
+            if(mv == 'M') n_matched++;
+        }
+        if(n_aligned == LRMSOFTCLIPPING_WINDOW) break;
+        pos--;
+    }
+    win_lo = pos;
+
+    // Slide the window downwards while it stays well matched.
+    // NOTE(review): the first iteration counts move_buff[win_lo] again although the
+    // seeding loop already counted it; kept as in the original - confirm intent.
+    if(n_aligned == LRMSOFTCLIPPING_WINDOW){
+        while(win_lo >= 0){
+            char entering = move_buff[win_lo];
+            if(entering == 'M' || entering == 'X'){
+                n_aligned++;
+                if(entering == 'M') n_matched++;
+            }
+
+            if(n_aligned > LRMSOFTCLIPPING_WINDOW){
+                // Drop entries from the top until one aligned base has left the window.
+                char leaving;
+                do{
+                    leaving = move_buff[win_hi--];
+                }while(leaving != 'M' && leaving != 'X');
+                n_aligned--;
+                if(leaving == 'M') n_matched--;
+            }
+
+            if(n_matched < LRMSOFTCLIPPING_MATCHED) break;
+            win_lo--;
+        }
+    }
+
+    // Find the clipping boundary inside the final window.  When the window is
+    // poorly matched the scan gives up at the second 'X'.
+    int clip_from = win_hi;
+    int mismatches_seen = 0;
+    int poor_window = n_matched < LRMSOFTCLIPPING_MATCHED;
+    for(pos = win_hi; pos >= 0 && pos >= win_lo; pos--){
+        if(move_buff[pos] == 'M'){
+            clip_from = pos;
+            continue;
+        }
+        if(poor_window && move_buff[pos] == 'X' && ++mismatches_seen > 1) break;
+    }
+
+    // Count the read bases consumed by the moves that are kept.
+    int kept_read_bases = 0;
+    for(pos = moves - 1; pos >= clip_from; pos--){
+        switch(move_buff[pos]){
+            case 'M':
+            case 'X':
+            case 'I':
+                kept_read_bases++;
+                break;
+            default:
+                break;
+        }
+    }
+
+    // Every read base not covered by a kept move becomes one 'S' below the boundary.
+    int stop_before = (clip_from - 1) - (bases_in_read - kept_read_bases);
+    pos = clip_from - 1;
+    while(pos > stop_before){
+        if(pos < 0) return -1;
+        move_buff[pos] = 'S';
+        pos--;
+    }
+
+    // Whatever is left below the clipped part is blanked out.
+    while(pos >= 0){
+        move_buff[pos] = '.';
+        pos--;
+    }
+    return 0;
+}
+
+
+
+#define LRMindel_dynamic_search_narrow LRMindel_dynamic_search /* the "narrow" implementation defined below is compiled under the name LRMindel_dynamic_search */
+//#define LRMindel_dynamic_search_full LRMindel_dynamic_search
+
+int my_debug = 0; /* when non-zero, the dynamic-search functions below print their inputs, the score matrix and the resulting move string */
+/*
+ * Banded dynamic-programming alignment of the read bases in
+ * [last_correct_base, first_correct_base) against the chromosome starting at
+ * last_correct_base_on_chro.
+ *
+ * Each read base is one matrix row of 2*LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE+1
+ * columns.  The band is shifted from row to row along a straight line whose
+ * total drift over all rows equals expected_indels (best_offset_history).
+ *
+ * Outputs:
+ *   - thread_context->dynamic_programming_indel_movement_buf receives the
+ *     NUL-terminated move string ('M' match, 'X' mismatch, 'I', 'D'), first row first.
+ *   - *total_mismatched_bases receives the number of 'X' moves.
+ * high_penalty_create_gap multiplies the gap-creation score by (1 + value).
+ * read_name is only used in the message printed before the '?' assertion.
+ *
+ * Returns the number of moves, or -1 when the traceback produces more moves than
+ * the limit checked below.
+ *
+ * Assumes first_correct_base > last_correct_base (not checked here).
+ */
+int LRMindel_dynamic_search_narrow(LRMcontext_t* context, LRMthread_context_t* thread_context, int expected_indels, unsigned int last_correct_base_on_chro, char * corrected_read, int last_correct_base, int first_correct_base,unsigned int * total_mismatched_bases, int high_penalty_create_gap, char * read_name){
+    int moves, xx1;
+
+    (*total_mismatched_bases)=0;
+    if(my_debug) {
+        char postxt[100];
+        LRMpos2txt(context, last_correct_base_on_chro, postxt);
+        LRMprintf("Dynamic: at %s : %d - %d ; expected = %d\n", postxt, last_correct_base, first_correct_base, expected_indels);
+        for(xx1 = 0 ; xx1 < first_correct_base - last_correct_base ; xx1++ ){
+            LRMprintf("%c", corrected_read[ xx1 + last_correct_base ]);
+        }
+        LRMprintf("\n");
+        for(xx1 = 0 ; xx1 < first_correct_base - last_correct_base - expected_indels; xx1++){
+            LRMprintf("%c", LRMgvindex_get(& context -> current_base_index, last_correct_base_on_chro + xx1));
+        }
+        LRMprintf("\n");
+    }
+
+    int dynamic_rows = first_correct_base - last_correct_base;
+    int trying_indel_length = LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE;
+
+    int best_offset_history [dynamic_rows];
+    int score_match = context -> dynamic_programming_score_match;
+    int score_mismatch = context -> dynamic_programming_score_mismatch;
+    int score_create_gap = context -> dynamic_programming_score_create_gap * (1+high_penalty_create_gap);
+    int score_extend_gap = context -> dynamic_programming_score_extend_gap;
+    int dynamic_row_width = 2* trying_indel_length + 1;
+
+//  assert(dynamic_rows <=LRMDYNAMIC_MAXIMUM_GAP_LENGTH);
+
+    // The per-thread buffers are reused for every call; clear the part this call uses.
+    int * dynamic_score_buffer = (int *)thread_context -> dynamic_programming_score_buffer;
+    char * dynamic_movement_BEFORE_buffer = thread_context -> dynamic_programming_movement_buffer;
+    memset(dynamic_score_buffer, 0, sizeof(int) * dynamic_row_width * dynamic_rows);
+    memset(dynamic_movement_BEFORE_buffer, 0, sizeof(char) * dynamic_row_width * dynamic_rows);
+    char * indel_movement_buff = (char *) thread_context -> dynamic_programming_indel_movement_buf;
+
+    LRMDP_score(0, trying_indel_length )=0;
+    int read_cursor = last_correct_base, row_i, indel_i;
+    unsigned int chro_cursor = last_correct_base_on_chro;
+
+    int last_slope_offset = 0;
+
+    // Band centre for every row: a straight line from 0 towards expected_indels.
+    if(1){
+        float slope = expected_indels *1. / dynamic_rows;
+        for(xx1 = 0; xx1 < dynamic_rows; xx1++)
+            //#warning "========= DELETE 0* ============="
+            best_offset_history[xx1] = 1* (int)(xx1 * slope);
+    }
+
+    // Fill the matrix row by row.  Columns run from high to low because the 'D'
+    // transition reads column indel_i+1 of the row being filled.
+    for(; read_cursor < first_correct_base; read_cursor++){
+        row_i = read_cursor - last_correct_base;
+        int slope_offset = row_i>0?best_offset_history[row_i-1]:0;
+        int last_slope_delta = slope_offset - last_slope_offset;
+
+        for(indel_i = dynamic_row_width-1 ; indel_i >=0; indel_i --){ // negative: deletion; positive: insertion
+            int testing_indel = indel_i - trying_indel_length;
+            if(1){
+                int score_from_del = -0x7fffffff, score_from_ins = -0x7fffffff, score_from_match = -0x7fffffff;
+                int is_matched_base = toupper(corrected_read[read_cursor]) == toupper(LRMgvindex_get(& context -> current_base_index, chro_cursor - slope_offset - testing_indel));
+
+                if(row_i>0 && (indel_i-1+ last_slope_delta)>=0 && (indel_i-1+ last_slope_delta)< dynamic_row_width && LRMDP_score(row_i-1, indel_i-1 + last_slope_delta) > -0x7ffffff0)
+                    score_from_ins = LRMDP_score(row_i-1, indel_i-1 + last_slope_delta) + ( (LRMDP_move(row_i-1, indel_i-1+ last_slope_delta) == 'M' || LRMDP_move(row_i-1, indel_i-1 +last_slope_delta) == 'X')?score_create_gap:score_extend_gap);
+
+                if(testing_indel < 0 || testing_indel < row_i)if(indel_i < dynamic_row_width-1 && LRMDP_score(row_i, indel_i+1) > -0x7ffffff0)
+                    score_from_del = LRMDP_score(row_i, indel_i+1) + ((LRMDP_move(row_i, indel_i+1) == 'M' || LRMDP_move(row_i, indel_i+1) == 'X')?score_create_gap:score_extend_gap);
+
+                if((indel_i+ last_slope_delta)>=0 && (indel_i+ last_slope_delta)< dynamic_row_width && (row_i ==0 || LRMDP_score(row_i-1, indel_i + last_slope_delta) > -0x7ffffff0)){
+                    score_from_match =(row_i > 0 ?LRMDP_score(row_i-1, indel_i + last_slope_delta): 0)+ (is_matched_base?score_match:score_mismatch);
+                    if(row_i == 0 && testing_indel > 0) score_from_match += score_create_gap + (testing_indel-1) * score_extend_gap;
+                }
+
+                int final_score = max(score_from_del, max(score_from_ins, score_from_match));
+                if(testing_indel + slope_offset > 0 && row_i < slope_offset + testing_indel){
+                    // Cells that need more inserted bases than rows seen so far are forced to 'I'.
+                    LRMDP_score(row_i, indel_i) = score_create_gap + (score_extend_gap-1) * (testing_indel+slope_offset) ;
+                    LRMDP_move(row_i, indel_i) = 'I';
+                }else{
+                    LRMDP_score(row_i, indel_i) = final_score;
+                    // '?' marks a cell that no transition could reach.
+                    if(final_score < -0x7ffffff0) LRMDP_move(row_i, indel_i) = '?';
+                    else LRMDP_move(row_i, indel_i) = score_from_del == final_score?'D':((score_from_ins == final_score)?'I': ( is_matched_base ?'M':'X'));
+                }
+            }
+        }
+        last_slope_offset = slope_offset;
+        chro_cursor ++;
+    }
+
+    if(my_debug) LRMindel_dynamic_search_debug(context, dynamic_score_buffer, dynamic_movement_BEFORE_buffer, dynamic_row_width, dynamic_rows, best_offset_history);
+
+    // Trace back from the last row, starting at the column that corresponds to expected_indels.
+    row_i = first_correct_base - last_correct_base - 1;
+    indel_i = trying_indel_length + expected_indels -(row_i >0?best_offset_history[row_i - 1]:0);
+
+    moves = 0;
+    while(row_i >= 0 && indel_i >=0 && indel_i < dynamic_row_width){
+        // Row 0 has no predecessor: its slope offset is 0, exactly as in the fill
+        // loop above.  Reading best_offset_history[-1] here would be out of bounds.
+        int slope_offset = row_i > 0?best_offset_history[row_i-1]:0;
+        int next_slope_offset = row_i > 1?best_offset_history[row_i-2]:0;
+        int last_slope_delta = slope_offset - next_slope_offset;
+        //#warning "========= DO NOT ASSERT ============="
+        indel_movement_buff[moves] = LRMDP_move(row_i, indel_i);
+        if(indel_movement_buff[moves]=='?')LRMprintf("Assertion_Error:%s\n", read_name);
+        assert(indel_movement_buff[moves]!='?');
+        (* total_mismatched_bases) += (indel_movement_buff[moves]=='X')?1:0;
+
+        if(indel_movement_buff[moves] == 'M' || indel_movement_buff[moves] == 'X'){
+            row_i--;
+            indel_i += last_slope_delta;
+        } else if(indel_movement_buff[moves] == 'D')indel_i++;
+        else {
+            indel_i --;
+            indel_i += last_slope_delta;
+            row_i--;
+        }
+        moves ++;
+
+        // Past the first row: columns left of the centre are closed with 'D' moves.
+        if(row_i < 0 && indel_i < trying_indel_length)
+            for(; indel_i < trying_indel_length; indel_i++) indel_movement_buff[moves++] ='D';
+
+        if(moves > max( LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15, 300 ) + context -> max_dynamic_indel_length ){
+            LRMprintf("ERROR: Dynamic programming moves more than %d\n", max( (int)(LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15), 300 ) + context -> max_dynamic_indel_length);
+            return -1;
+        }
+    }
+
+    // The traceback produced the moves last-row-first; reverse them in place.
+    indel_movement_buff[moves]=0;
+    for(row_i = 0; row_i < moves/2; row_i++){
+        char tmp = indel_movement_buff[row_i];
+        indel_movement_buff[row_i] = indel_movement_buff[ moves - row_i - 1];
+        indel_movement_buff[ moves - row_i - 1] = tmp;
+    }
+    if(my_debug)LRMprintf("MOVES = %s\n", indel_movement_buff);
+
+    return moves;
+}
+
+
+/*
+ * Banded dynamic-programming extension of an alignment into a read region whose
+ * chromosome position is only anchored at one side.
+ *
+ * The read bases in [last_correct_base, first_correct_base) are aligned starting
+ * from last_correct_base_on_chro.  With search_to_3end != 0 the read is walked
+ * forwards and the chromosome upwards; otherwise the read is walked backwards and
+ * the chromosome downwards.  Each row has 2*LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE+1
+ * columns, and the band of a row is centred on the best-scoring offset found for
+ * the previously processed read base (best_offset_history).
+ *
+ * Outputs:
+ *   - thread_context->dynamic_programming_indel_movement_buf receives the
+ *     NUL-terminated move string after LRMsoftclipping_moves() has been applied.
+ *   - *total_mismatched_bases receives the number of 'X' moves met in the traceback.
+ * high_penalty_create_gap multiplies the gap-creation score by (1 + value).
+ * read_name is only used for the "BADREAD" message.
+ *
+ * Returns the number of moves, or -1 when the traceback meets an unreachable
+ * cell, produces too many moves, or soft clipping fails.
+ *
+ * Assumes first_correct_base > last_correct_base (not checked here).
+ */
+int LRMindel_dynamic_search_unknownregion(LRMcontext_t* context, LRMthread_context_t* thread_context, int search_to_3end, unsigned int last_correct_base_on_chro, char * corrected_read, int last_correct_base, int first_correct_base,unsigned int * total_mismatched_bases, int high_penalty_create_gap, char * read_name){
+    int moves;
+    (*total_mismatched_bases)=0;
+
+    int bases_in_read = first_correct_base - last_correct_base;
+    int trying_indel_length = LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE;
+
+    int best_offset_history [bases_in_read];
+    int score_match = context -> dynamic_programming_score_match;
+    int score_mismatch = context -> dynamic_programming_score_mismatch;
+    int score_create_gap = context -> dynamic_programming_score_create_gap * (1+high_penalty_create_gap);
+    int score_extend_gap = context -> dynamic_programming_score_extend_gap;
+    int dynamic_row_width = 2* trying_indel_length + 1;
+
+    // The per-thread buffers are reused for every call; clear the part this call uses.
+    int * dynamic_score_buffer = (int *)thread_context -> dynamic_programming_score_buffer;
+    char * dynamic_movement_BEFORE_buffer = thread_context -> dynamic_programming_movement_buffer;
+    memset(dynamic_score_buffer, 0, sizeof(int) * dynamic_row_width * bases_in_read);
+    memset(dynamic_movement_BEFORE_buffer, 0, sizeof(char) * dynamic_row_width * bases_in_read);
+    char * indel_movement_buff = (char *) thread_context -> dynamic_programming_indel_movement_buf;
+
+    LRMDP_score(0, trying_indel_length )=0;
+    unsigned int chro_cursor ;
+
+    int last_slope_offset = 0, read_i, indel_i, previous_base_in_read = 0;
+
+    // Disabled debugging dump of the read and chromosome text.
+    if(0 && !search_to_3end){
+        char postxt[100];
+        LRMpos2txt(context, last_correct_base_on_chro, postxt);
+        LRMprintf("EXTEND_UNKNOWN: %s\n", postxt);
+        if(!search_to_3end){
+            int bb;
+            bb = corrected_read[first_correct_base];
+            corrected_read[first_correct_base] = 0;
+            LRMprintf("READ: %s\n", corrected_read);
+            corrected_read[first_correct_base] = bb;
+
+            LRMprintf("CHRO: ");
+            for(chro_cursor = last_correct_base_on_chro - bases_in_read - 10; chro_cursor < last_correct_base_on_chro; chro_cursor ++){
+                bb = LRMgvindex_get(& context -> current_base_index, chro_cursor);
+                LRMprintf("%c", bb);
+            }
+            LRMprintf("\n\n");
+        }else{
+            int bb;
+            bb = corrected_read[first_correct_base];
+            corrected_read[first_correct_base] = 0;
+            LRMprintf("READ: %s\n", corrected_read + last_correct_base);
+            corrected_read[first_correct_base] = bb;
+
+            LRMprintf("CHRO: ");
+            for(chro_cursor = last_correct_base_on_chro; chro_cursor < last_correct_base_on_chro + 10 + bases_in_read; chro_cursor ++){
+                bb = LRMgvindex_get(& context -> current_base_index, chro_cursor);
+                LRMprintf("%c", bb);
+            }
+            LRMprintf("\n\n");
+        }
+    }
+
+    // Fill the matrix; row read_i belongs to read base this_base_in_read.
+    for(read_i = 0; read_i < bases_in_read; read_i ++){
+        int this_base_in_read = search_to_3end? read_i :(bases_in_read - read_i -1);
+        int this_base_value_in_read = corrected_read [ last_correct_base + this_base_in_read ];
+
+        // The first row has no predecessor, so its band is centred on offset 0.
+        int slope_offset = read_i>0?best_offset_history[previous_base_in_read]:0;
+        int last_slope_delta = slope_offset - last_slope_offset;
+        if(0 && ! search_to_3end)LRMprintf("GET READ_BASE='%c' LAST_OFF=%d SLP_OFF=%d LAST_DELTA=%d\n", this_base_value_in_read, last_slope_offset, slope_offset, last_slope_delta);
+
+        int thisrow_max_score = -0x7fffffff, thisrow_max_indel_from_start = -0x7fffffff;
+
+        // The column order follows the direction of the same-row transition:
+        // 'D' reads column indel_i+1 when going to the 3' end, 'I' reads indel_i-1 otherwise.
+        for(indel_i = search_to_3end?dynamic_row_width-1 : 0; indel_i !=(search_to_3end?-1:dynamic_row_width); indel_i+=(search_to_3end?-1:1)){
+            int indel_from_start = slope_offset + ( indel_i - trying_indel_length); // if to_3end: +:ins, -:del; if !to_3end: +:del, -:ins
+            unsigned int chro_location_after_indel = last_correct_base_on_chro + ( search_to_3end ?( read_i):(- read_i -1)) - indel_from_start; // if !to_3end: "DEL" from right is "INS" from left
+            int this_base_value_in_chro = LRMgvindex_get(& context -> current_base_index, chro_location_after_indel);
+
+            int score_from_del = -0x7fffffff, score_from_ins = -0x7fffffff, score_from_match = -0x7fffffff;
+            int is_matched_base = toupper(this_base_value_in_read)== toupper(this_base_value_in_chro);
+
+            if(search_to_3end){
+                if(read_i > 0 && indel_i-1 + last_slope_delta >=0 && indel_i-1 + last_slope_delta < dynamic_row_width && LRMDP_score(read_i-1, indel_i-1 + last_slope_delta) > -0x7ffffff0)
+                    score_from_ins = LRMDP_score(read_i - 1, indel_i-1 + last_slope_delta) + ( (LRMDP_move(read_i - 1, indel_i-1+ last_slope_delta) == 'I')?score_extend_gap:score_create_gap);
+
+                if( indel_from_start < 0 || indel_from_start < read_i ) if(indel_i < dynamic_row_width-1 && read_i > 0 && LRMDP_score(read_i, indel_i+1) > -0x7ffffff0)
+                    score_from_del = LRMDP_score(read_i, indel_i+1) + ((LRMDP_move(read_i, indel_i+1) == 'D')?score_extend_gap:score_create_gap);
+
+                if((indel_i+ last_slope_delta)>=0 && (indel_i+ last_slope_delta)< dynamic_row_width && (read_i ==0 || LRMDP_score(read_i-1, indel_i + last_slope_delta) > -0x7ffffff0))
+                    score_from_match =(read_i > 0 ?LRMDP_score(read_i-1, indel_i + last_slope_delta): 0)+ (is_matched_base?score_match:score_mismatch);
+            }else{
+                if( indel_from_start > 0 || ( -indel_from_start < read_i ) ) if(indel_i>0 && LRMDP_score(read_i, indel_i-1) > -0x7ffffff0)
+                    score_from_ins = LRMDP_score(read_i, indel_i-1) + ( (LRMDP_move(read_i, indel_i-1) == 'I')?score_extend_gap:score_create_gap);
+
+                if(indel_i < dynamic_row_width-1 && indel_i+1 + last_slope_delta >= 0 && indel_i+1 + last_slope_delta < dynamic_row_width && read_i > 0 && LRMDP_score(read_i - 1, indel_i+1+last_slope_delta) > -0x7ffffff0)
+                    score_from_del = LRMDP_score(read_i - 1, indel_i+1 + last_slope_delta) + ((LRMDP_move(read_i - 1, indel_i+1+last_slope_delta) == 'D')?score_extend_gap:score_create_gap);
+
+                if((indel_i+ last_slope_delta)>=0 && (indel_i+ last_slope_delta)< dynamic_row_width && (read_i ==0 || LRMDP_score(read_i-1, indel_i + last_slope_delta) > -0x7ffffff0))
+                    score_from_match =(read_i > 0 ?LRMDP_score(read_i-1, indel_i + last_slope_delta): 0)+ (is_matched_base?score_match:score_mismatch);
+            }
+
+            // First row, positive offset: charge the opening gap in either direction.
+            if(read_i == 0 && indel_from_start > 0) score_from_match += score_create_gap + (indel_from_start-1) * score_extend_gap;
+
+            int final_score = max(score_from_del, max(score_from_ins, score_from_match));
+
+            if(indel_from_start < 0 && read_i < - indel_from_start && !search_to_3end){
+                LRMDP_score(read_i, indel_i) = score_create_gap - score_extend_gap * indel_from_start ;
+                LRMDP_move(read_i, indel_i) = 'D';
+            } else if(indel_from_start > 0 && read_i < indel_from_start && search_to_3end) {
+                LRMDP_score(read_i, indel_i) = score_create_gap + score_extend_gap * (indel_from_start -1) ;
+                LRMDP_move(read_i, indel_i) = 'I';
+            } else if(final_score < -0x7ffffff0){
+                // '?' marks a cell that no transition could reach.
+                LRMDP_move( read_i, indel_i) = '?';
+            }else{
+                if(0 && ! search_to_3end)LRMprintf("#%d %c%c%c %c %d |", indel_i, indel_from_start?' ':'>', this_base_value_in_chro, indel_from_start?' ':'<', score_from_del == final_score?'D':((score_from_ins == final_score)?'I': ( is_matched_base ?'M':'X')), final_score);
+                LRMDP_score( read_i, indel_i) = final_score;
+                // Only columns near the band centre may become the next centre.
+                if(final_score > thisrow_max_score && abs( indel_i - dynamic_row_width/2 ) < dynamic_row_width/3){
+                    thisrow_max_score = final_score;
+                    thisrow_max_indel_from_start = indel_from_start;
+                }
+                LRMDP_move(read_i, indel_i) =(score_from_del == final_score?'D':((score_from_ins == final_score)?'I': ( is_matched_base ?'M':'X')));
+            }
+        }
+        if(thisrow_max_indel_from_start < -0x7ffffff0 && read_name){
+            LRMprintf(" !! BADREAD:%s\n",read_name);
+        }
+        assert(thisrow_max_indel_from_start > -0x7ffffff0);
+        best_offset_history[this_base_in_read] = thisrow_max_indel_from_start;
+        if(0 && ! search_to_3end)LRMprintf("\nSET %d-th BEST_FROM_START=%d ; SLOPE_OFFSET=%d ; MAX_SCORE=%d\n\n", this_base_in_read, thisrow_max_indel_from_start, slope_offset, thisrow_max_score);
+
+        last_slope_offset = slope_offset;
+        previous_base_in_read = this_base_in_read;
+    }
+    read_i = bases_in_read - 1;
+
+    // Start the traceback at the best column of the last row.  That row's band was
+    // centred on the offset of the previously processed base; a one-row matrix has
+    // no such base and was centred on 0, so the history must not be indexed there.
+    {
+        int last_row_best = best_offset_history[ search_to_3end? bases_in_read - 1 : 0 ];
+        int last_row_centre = 0;
+        if(bases_in_read > 1) last_row_centre = best_offset_history[ search_to_3end? bases_in_read - 2 : 1 ];
+        indel_i = last_row_best - last_row_centre + trying_indel_length;
+    }
+
+    moves = 0;
+    int error_in = 0;
+    while(read_i >= 0 && indel_i >=0 && indel_i < dynamic_row_width){
+        int real_base_in_read = search_to_3end? read_i:(bases_in_read - (read_i+1));
+        int slope_offset;
+        int next_slope_offset;
+
+        if(search_to_3end){
+            slope_offset = (real_base_in_read>0)?best_offset_history[real_base_in_read - 1]:0;
+            next_slope_offset = (real_base_in_read>1)?best_offset_history[real_base_in_read - 2]:0;
+        }else{
+            slope_offset = (real_base_in_read<bases_in_read - 1)?best_offset_history[real_base_in_read + 1]:0;
+            next_slope_offset = (real_base_in_read<bases_in_read - 2)?best_offset_history[real_base_in_read + 2]:0;
+        }
+
+        int last_slope_delta = next_slope_offset - slope_offset;
+        //#warning "========= DO NOT ASSERT ============="
+        indel_movement_buff[moves] = LRMDP_move(read_i, indel_i);
+        if(indel_movement_buff[moves]=='?'){
+            error_in = 1;
+            break;
+        }
+        (* total_mismatched_bases) += (indel_movement_buff[moves]=='X')?1:0;
+
+        if(indel_movement_buff[moves] == 'M' || indel_movement_buff[moves] == 'X'){
+            read_i--;
+            if(read_i>=0)indel_i -= last_slope_delta;
+        }else {
+            if(search_to_3end){
+                if(indel_movement_buff[moves] == 'D')indel_i++;
+                else {
+                    indel_i --;
+                    if(read_i>=0)indel_i -= last_slope_delta;
+                    read_i--;
+                }
+            }else{
+                if(indel_movement_buff[moves] == 'I')indel_i--;
+                else {
+                    indel_i ++;
+                    if(read_i>=0)indel_i -= last_slope_delta;
+                    read_i--;
+                }
+            }
+        }
+        moves ++;
+
+        // Past the first row: close the remaining distance to the band centre.
+        if(search_to_3end){
+            if(read_i < 0 && indel_i < trying_indel_length)
+                for(; indel_i <trying_indel_length; indel_i++) indel_movement_buff[moves++] = 'D';
+        }else if(read_i < 0 && indel_i > trying_indel_length)
+            for(; indel_i > trying_indel_length; indel_i--) indel_movement_buff[moves++] = 'I';
+
+        if(moves > max( LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15, 300 ) + context -> max_dynamic_indel_length ){
+            LRMprintf("ERROR: Dynamic programming moves more than %d\n", max( (int)(LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15), 300 ) + context -> max_dynamic_indel_length);
+            return -1;
+        }
+    }
+
+    if(error_in)return -1;
+
+    indel_movement_buff[moves]=0;
+    // When searching towards the 5' end the matrix was filled with 'I' and 'D'
+    // exchanged (see the column comments above); swap them back.
+    if(!search_to_3end){
+        for(read_i = 0; read_i < moves; read_i++){
+            char tmp = indel_movement_buff[read_i];
+            if(tmp == 'I') indel_movement_buff[read_i]='D';
+            else if(tmp == 'D') indel_movement_buff[read_i]='I';
+        }
+    }
+
+    error_in = LRMsoftclipping_moves(context,thread_context,indel_movement_buff, moves, bases_in_read);
+    if(error_in) return -1;
+
+    // Only the 3'-end search needs its traceback order reversed.
+    if(search_to_3end){
+        for(read_i = 0; read_i < moves/2; read_i++){
+            char tmp = indel_movement_buff[read_i];
+            indel_movement_buff[read_i] = indel_movement_buff[ moves - read_i - 1];
+            indel_movement_buff[ moves - read_i - 1] = tmp;
+        }
+    }
+    return moves;
+}
+
+int LRMindel_dynamic_search_full(LRMcontext_t* context, LRMthread_context_t* thread_context, int expected_indels, unsigned int last_correct_base_on_chro, char * corrected_read, int last_correct_base, int first_correct_base,unsigned int * total_mismatched_bases){ /* Banded dynamic-programming alignment with a fixed band centre.
+ * Aligns the read bases in [last_correct_base, first_correct_base) against the
+ * chromosome starting at last_correct_base_on_chro.  One matrix row per read base;
+ * each row has 2*trying_indel_length+1 columns, column c standing for an indel
+ * offset of (c - trying_indel_length).
+ * Outputs: the NUL-terminated move string ('M', 'X', 'I', 'D'; first row first) in
+ * thread_context->dynamic_programming_indel_movement_buf, and the number of 'X'
+ * moves in *total_mismatched_bases.
+ * Returns the number of moves, or -1 when the traceback exceeds the move limit.
+ */
+ int moves, xx1;
+
+ (*total_mismatched_bases)=0;
+ if(my_debug) { // optional dump of the read and chromosome text being aligned
+ char postxt[100];
+ LRMpos2txt(context, last_correct_base_on_chro, postxt);
+ LRMprintf("Dynamic: at %s : %d - %d ; expected = %d\n", postxt, last_correct_base, first_correct_base, expected_indels);
+ for(xx1 = 0 ; xx1 < first_correct_base - last_correct_base ; xx1++ ){
+ LRMprintf("%c", corrected_read[ xx1 + last_correct_base ]);
+ }
+ LRMprintf("\n");
+ for(xx1 = 0 ; xx1 < first_correct_base - last_correct_base - expected_indels; xx1++){
+ LRMprintf("%c", LRMgvindex_get(& context -> current_base_index, last_correct_base_on_chro + xx1));
+ }
+ LRMprintf("\n");
+ }
+
+ int trying_indel_length = min(context -> max_dynamic_indel_length, max(16, abs(expected_indels) *5/3)); // half band width: at least 16, 5/3 of |expected_indels| when larger, capped by the configured maximum
+ //trying_indel_length = context -> max_dynamic_indel_length;
+
+ int score_match = context -> dynamic_programming_score_match;
+ int score_mismatch = context -> dynamic_programming_score_mismatch;
+ int score_create_gap = context -> dynamic_programming_score_create_gap;
+ int score_extend_gap = context -> dynamic_programming_score_extend_gap;
+ int dynamic_row_width = 2* trying_indel_length + 1;
+ int dynamic_rows = first_correct_base - last_correct_base;
+
+// assert(dynamic_rows <=LRMDYNAMIC_MAXIMUM_GAP_LENGTH);
+
+ //LRMprintf("DYNAMIC SCORING BUFF = %lld bytes (%d * %d)\n", sizeof(int) * dynamic_row_width * dynamic_rows, dynamic_row_width, dynamic_rows);
+ int * dynamic_score_buffer = (int *)thread_context -> dynamic_programming_score_buffer; // per-thread buffers; the LRMDP_score / LRMDP_move macros index them through these local names
+ char * dynamic_movement_BEFORE_buffer = thread_context -> dynamic_programming_movement_buffer;
+ //LRMprintf("SETTING 0 TO %p : %lu bytes\n", dynamic_score_buffer,sizeof(int) * dynamic_row_width * dynamic_rows);
+ memset(dynamic_score_buffer, 0, sizeof(int) * dynamic_row_width * dynamic_rows);
+ memset(dynamic_movement_BEFORE_buffer, 0, sizeof(char) * dynamic_row_width * dynamic_rows);
+ char * indel_movement_buff = (char *) thread_context -> dynamic_programming_indel_movement_buf;
+
+ LRMDP_score(0, trying_indel_length )=0;
+ int read_cursor = last_correct_base, row_i, indel_i;
+ unsigned int chro_cursor = last_correct_base_on_chro;
+
+ for(; read_cursor < first_correct_base; read_cursor++){ // fill the matrix, one row per read base
+ row_i = read_cursor - last_correct_base;
+ //LRMprintf("Filling Dynamic Matrix : row %d / %d\n", read_cursor, first_correct_base - min(0, expected_indels));
+ for(indel_i = dynamic_row_width-1 ; indel_i >=0; indel_i --){ // negative: deletion; positive: insertion
+ int testing_indel = indel_i - trying_indel_length;
+ if(1){
+ int score_from_del = -0x7fffffff, score_from_ins = -0x7fffffff, score_from_match = -0x7fffffff;
+ //int is_matched_base = toupper(corrected_read[read_cursor + max(0, testing_indel)]) == toupper(LRMgvindex_get(& context -> current_base_index, chro_cursor - min(0, testing_indel) ));
+ int is_matched_base = toupper(corrected_read[read_cursor]) == toupper(LRMgvindex_get(& context -> current_base_index, chro_cursor - testing_indel));
+
+ if(indel_i>0 && row_i>0) // 'I' comes from the previous row, one column to the left
+ score_from_ins = LRMDP_score(row_i-1, indel_i-1) + ( (LRMDP_move(row_i-1, indel_i-1) == 'M' || LRMDP_move(row_i-1, indel_i-1) == 'X')?score_create_gap:score_extend_gap);
+ if(indel_i < dynamic_row_width-1) // 'D' comes from the same row, one column to the right (already filled: columns run high to low)
+ score_from_del = LRMDP_score(row_i, indel_i+1) + ((LRMDP_move(row_i, indel_i+1) == 'M' || LRMDP_move(row_i, indel_i+1) == 'X')?score_create_gap:score_extend_gap);
+ score_from_match =(row_i > 0 ?LRMDP_score(row_i-1, indel_i): 0)+ (is_matched_base?score_match:score_mismatch);
+
+ int final_score = max(score_from_del, max(score_from_ins, score_from_match));
+ if(testing_indel > 0 && row_i < testing_indel){ // cells needing more inserted bases than rows seen so far are forced to 'I'
+ LRMDP_score(row_i, indel_i) = score_create_gap + score_extend_gap * testing_indel ;
+ LRMDP_move(row_i, indel_i) = 'I';
+ }else{
+ LRMDP_score(row_i, indel_i) = final_score;
+ LRMDP_move(row_i, indel_i) = score_from_del == final_score?'D':((score_from_ins == final_score)?'I': ( is_matched_base ?'M':'X'));
+ }
+ }
+ }
+ chro_cursor ++;
+ }
+
+ if(my_debug) LRMindel_dynamic_search_debug(context, dynamic_score_buffer, dynamic_movement_BEFORE_buffer, dynamic_row_width, dynamic_rows, NULL);
+
+ row_i = first_correct_base - last_correct_base - 1; // traceback starts in the last row ...
+ indel_i = trying_indel_length + expected_indels; // ... at the column whose offset equals expected_indels
+
+ moves = 0;
+ while(row_i >= 0 && indel_i >=0 && indel_i < dynamic_row_width){
+ //#warning "========= DO NOT ASSERT ============="
+ indel_movement_buff[moves] = LRMDP_move(row_i, indel_i);
+ (* total_mismatched_bases) += (indel_movement_buff[moves]=='X')?1:0;
+
+ if(indel_movement_buff[moves] == 'M' || indel_movement_buff[moves] == 'X') row_i--;
+ else if(indel_movement_buff[moves] == 'D') { indel_i++; }
+ else {indel_i--; row_i--;}
+ moves ++;
+
+ if(row_i < 0 && indel_i < trying_indel_length) // past the first row: columns left of the centre are closed with 'D' moves
+ for(; indel_i < trying_indel_length; indel_i++) indel_movement_buff[moves++] ='D';
+
+ if(moves > max( LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15, 300 ) + context -> max_dynamic_indel_length ){
+ LRMprintf("ERROR: Dynamic programming moves more than %d\n", max( (int)(LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15), 300 ) + context -> max_dynamic_indel_length);
+ return -1;
+ }
+ }
+
+ indel_movement_buff[moves]=0;
+ //if(my_debug)LRMprintf("MOVE0 = %s\n", indel_movement_buff);
+ for(row_i = 0; row_i < moves/2; row_i++){ // the traceback wrote the moves last-row-first; reverse them in place
+ char tmp = indel_movement_buff[row_i];
+ indel_movement_buff[row_i] = indel_movement_buff[ moves - row_i - 1];
+ indel_movement_buff[ moves - row_i - 1] = tmp;
+ }
+ if(my_debug)LRMprintf("MOVES = %s\n", indel_movement_buff);
+
+ //#warning "=============== DO NOT EXIT ==========="
+ //if(tested ++ > 26) exit(0);
+ return moves;
+}
+
+
+
+
+
+
+
+
+
+
+
+/*
+ * Searches the read gap between an anchor candidate and a secondary mapping
+ * location for the split point that is best supported as a junction.
+ *
+ * Whichever of the two locations has the smaller first-base position is
+ * treated as the left piece. The read interval between the two covered
+ * regions is widened by 4 bases on each side and its positions are tried
+ * outwards from the midpoint. A position is accepted when
+ *   - the two chromosome bases read after the left piece and the two read
+ *     before the right piece pass LRMis_donor_chars() and LRMpaired_chars();
+ *   - the JUNCTION_CONFIRM_WINDOW read bases on each side of the split match
+ *     the chromosome at their own location; and
+ *   - the same read bases match the opposite location poorly.
+ *
+ * On success *final_split_point_on_read, *is_GT_AG_strand and
+ * *final_left_indel_offset are filled and a positive score is returned.
+ * Returns 0, leaving the outputs untouched, when no position qualifies.
+ */
+int LRMdonor_score(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMsegment_mapping_candidate_t * mapping_result, int seg_id, unsigned int secondary_mapping_location, unsigned int secondary_coverage_start , unsigned int secondary_coverage_end, int indel_in_anchor, int indel_in_secondary, int * final_split_point_on_read, int *is_GT_AG_strand, int * final_left_indel_offset){
+	LRMgene_value_index_t * value_index = &context->current_base_index;
+	int anchor_is_right_piece = mapping_result -> first_base_position > secondary_mapping_location;
+
+	// may be later calculated from the indel records
+	int left_indel_offset = anchor_is_right_piece ? indel_in_secondary : indel_in_anchor;
+	unsigned int gap_on_read_start = anchor_is_right_piece ? secondary_coverage_end : mapping_result -> confident_coverage_end;
+	unsigned int gap_on_read_end = anchor_is_right_piece ? mapping_result -> confident_coverage_start : secondary_coverage_start;
+
+	gap_on_read_start -= 4;
+	gap_on_read_end += 4;
+
+	// The comparison is unsigned, exactly as the subtraction above.
+	if(gap_on_read_start > gap_on_read_end) return 0;
+
+	unsigned int left_virtualHead_abs_offset = min(mapping_result -> first_base_position, secondary_mapping_location);
+	unsigned int right_virtualHead_abs_offset = max(mapping_result -> first_base_position, secondary_mapping_location);
+
+	int most_likely_point = (gap_on_read_start+gap_on_read_end)/2;
+	int candidate_count = gap_on_read_end - gap_on_read_start;
+	char * read_text = iteration_context -> segment_texts[seg_id];
+	int read_len = iteration_context -> segment_lengths[seg_id];
+	const int mismatch_in_between_allowed = 1;
+
+	// A "split point" is the first base NOT IN piece 1; it is also the first base IN piece 2.
+	int selected_real_split_point = -1, selected_junction_strand = -1;
+	int best_score = -111111;
+	char donor_left[3], donor_right[3];
+	int candidate_i;
+
+	for(candidate_i = 0; candidate_i < candidate_count; candidate_i++){
+		// Offsets 0, -1, +1, -2, +2, ... around the midpoint of the gap.
+		int distance = (1 + candidate_i) / 2;
+		int real_split_point = most_likely_point + ((candidate_i % 2) ? -distance : distance);
+
+		// Both confirmation windows have to lie inside the read.
+		if(real_split_point > read_len-JUNCTION_CONFIRM_WINDOW) continue;
+		if(real_split_point < JUNCTION_CONFIRM_WINDOW) continue;
+
+		LRMgvindex_get_string(donor_left, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, 2, 0);
+		LRMgvindex_get_string(donor_right, value_index, right_virtualHead_abs_offset + real_split_point - 2, 2, 0);
+
+		int is_donor_test_ok = 0;
+		if(LRMis_donor_chars(donor_left) && LRMis_donor_chars(donor_right))
+			is_donor_test_ok = LRMpaired_chars(donor_left, donor_right);
+		if(!is_donor_test_ok) continue;
+
+		// Each flank of the read against its own side of the chromosome.
+		int left_should_match = LRMmatch_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, left_virtualHead_abs_offset + real_split_point - JUNCTION_CONFIRM_WINDOW + left_indel_offset , JUNCTION_CONFIRM_WINDOW , 0);
+		int right_should_match = LRMmatch_chro(read_text + real_split_point, value_index, right_virtualHead_abs_offset + real_split_point, JUNCTION_CONFIRM_WINDOW , 0);
+
+		if(left_should_match <= JUNCTION_CONFIRM_WINDOW - 2) continue;
+		if(right_should_match < 2*JUNCTION_CONFIRM_WINDOW - left_should_match - mismatch_in_between_allowed) continue;
+
+		// Each flank of the read against the opposite side: these should disagree.
+		int left_should_not_match = LRMmatch_chro(read_text + real_split_point, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, JUNCTION_CONFIRM_WINDOW , 0);
+		int right_should_not_match = LRMmatch_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, right_virtualHead_abs_offset + real_split_point - JUNCTION_CONFIRM_WINDOW, JUNCTION_CONFIRM_WINDOW , 0);
+
+		if(left_should_not_match > JUNCTION_CONFIRM_WINDOW - 5 || right_should_not_match > JUNCTION_CONFIRM_WINDOW - 5) continue;
+
+		int test_score = 100*(is_donor_test_ok*3000+left_should_match + right_should_match - left_should_not_match - right_should_not_match);
+		if(test_score > best_score){
+			selected_junction_strand = (donor_left[0]=='G' || donor_right[1]=='G');
+			selected_real_split_point = real_split_point;
+			best_score = test_score;
+		}
+	}
+
+	if(best_score <= 0) return 0;
+
+	*final_split_point_on_read = selected_real_split_point;
+	*is_GT_AG_strand = selected_junction_strand;
+	*final_left_indel_offset = left_indel_offset;
+	return (1+best_score)/100;
+}
+
+/*
+ * Collects the events that have one end exactly at testing_pos.
+ *
+ * context -> events_realignment is queried with the position itself stored
+ * in the key pointer. The value is an int array: element 0 is used as the
+ * number of entries and elements 1.. hold event indices plus one; an entry
+ * of zero or below ends the list early.
+ *
+ * search_for_smaller_coordinate
+ *     non-zero: keep events whose small_side equals testing_pos;
+ *     zero:     keep events whose large_side equals testing_pos.
+ * event_ids_buff
+ *     receives the indices (into context -> event_space) of the kept events;
+ *     at most LRMMAX_EVENTS_PER_SITE of them are written.
+ *
+ * Returns the number of indices written, 0 if nothing is registered there.
+ */
+int LRMevents_search(LRMcontext_t * context, unsigned int testing_pos, int search_for_smaller_coordinate, int * event_ids_buff){
+	// The key is produced by an integer-to-pointer conversion. The former
+	// "NULL + testing_pos" was arithmetic on a null void pointer, which ISO C
+	// does not define; the resulting key value is the same.
+	int * event_here = HashTableGet(context -> events_realignment, (void *)(size_t)testing_pos);
+	if(NULL == event_here) return 0;
+
+	int slot, found = 0;
+	for(slot = 1; slot < 1 + min(LRMMAX_EVENTS_PER_SITE, event_here[0]); slot++){
+		int event_i = event_here[slot] - 1;
+		if(0 > event_i) break;
+
+		LRMevent_t * thise = context -> event_space + event_i;
+		if((thise -> large_side == testing_pos && !search_for_smaller_coordinate)||
+		   (thise -> small_side == testing_pos && search_for_smaller_coordinate) )
+			event_ids_buff[found++] = event_i;
+	}
+	return found;
+}
+
+#define LRMJUMP_TESTING_LEN 16
+
+/*
+ * Decides whether the read plausibly continues on the far side of an event.
+ *
+ * Starting at read position next_seg_pos and chromosome position
+ * other_side_start, up to LRMJUMP_TESTING_LEN bases are compared while
+ * walking in the current realignment direction; the comparison stops early
+ * when the read position leaves the segment.
+ *
+ * Returns 1 when at least one base matched and no more than
+ * LRMJUMP_MISMATCH_TOLERANCE of the compared bases mismatched, 0 otherwise.
+ * Events of type LRMEVENT_TYPE_INDEL are never followed (always 0).
+ */
+int LRM_should_jump( LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMrealign_context_t * realign_context, unsigned int other_side_start, int next_seg_pos, LRMevent_t *new_env ){
+	if(new_env -> event_type == LRMEVENT_TYPE_INDEL) return 0;
+
+	int probe_i, compared = 0, agreeing = 0;
+
+	for(probe_i = 0; probe_i < LRMJUMP_TESTING_LEN; probe_i++){
+		int read_pos;
+		unsigned int chro_pos;
+
+		if(realign_context -> current_dir_to_3end){
+			read_pos = next_seg_pos + probe_i;
+			chro_pos = other_side_start + probe_i;
+		}else{
+			read_pos = next_seg_pos - probe_i;
+			chro_pos = other_side_start - probe_i;
+		}
+
+		// Stop as soon as the probe runs off either end of the segment.
+		if(read_pos >= iteration_context -> segment_lengths[realign_context -> current_segment_id] || read_pos < 0) break;
+
+		char read_ch = iteration_context -> segment_texts[ realign_context -> current_segment_id ][read_pos];
+		char chro_ch = LRMgvindex_get(&context -> current_base_index, chro_pos);
+
+		compared ++;
+		if(read_ch == chro_ch) agreeing ++;
+	}
+
+	return agreeing >= 1 && agreeing >= compared - LRMJUMP_MISMATCH_TOLERANCE;
+}
+
+/*
+ * Offers the current stack as a candidate for the best result of the
+ * current direction.
+ *
+ * The score of a stack is the sum of current_stack_matched_bases over all of
+ * its items. The stack replaces the stored best one for
+ * realign_context -> current_dir_to_3end when no best result exists yet
+ * (best depth below 1) or when it scores strictly higher.
+ */
+void LRMrealign_one_segment_bestresult( LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMrealign_context_t * realign_context ){
+	int dir = realign_context -> current_dir_to_3end;
+	int depth = realign_context -> current_stack_depth;
+	int total_matched = 0;
+	int level;
+
+	for(level = 0; level < depth; level++)
+		total_matched += realign_context -> current_stack_matched_bases[level];
+
+	// Keep the stored result unless it is absent or strictly worse.
+	if(realign_context -> best_stack_depth[dir] >= 1 && total_matched <= realign_context -> best_stack_score[dir])
+		return;
+
+	size_t stack_bytes = sizeof(int) * depth;
+
+	realign_context -> best_stack_score[dir] = total_matched;
+	realign_context -> best_stack_depth[dir] = depth;
+	memcpy(realign_context -> best_stack_segment_length[dir], realign_context -> current_stack_segment_length, stack_bytes);
+	memcpy(realign_context -> best_stack_segment_first_base_position[dir], realign_context -> current_stack_segment_first_base_position, stack_bytes);
+	memcpy(realign_context -> best_stack_event_after[dir], realign_context -> current_stack_event_after, stack_bytes);
+	memcpy(realign_context -> best_stack_chro_first_base_position[dir], realign_context -> current_stack_chro_first_base_position, stack_bytes);
+	memcpy(realign_context -> best_stack_softcliping_length[dir], realign_context -> current_stack_softcliping_length, stack_bytes);
+}
+
+/*
+ * Walks the current segment base by base in the direction given by
+ * realign_context -> current_dir_to_3end, starting at
+ * current_testing_segment_location / current_next_iteration_chro_start, and
+ * explores every event that can be followed on the way.
+ *
+ * At each position (as long as the current run of mismatches is shorter
+ * than LRMSOFTCLIPPING_MAX_MISMATCH) the events ending at the chromosome
+ * position are looked up. For every event accepted by LRM_should_jump() the
+ * stretch walked so far is pushed on the current stack and the function
+ * calls itself to continue from the other side of the event. When the walk
+ * reaches the segment end, the last stretch is pushed and the stack is
+ * offered to LRMrealign_one_segment_bestresult().
+ *
+ * Always returns 0; results are left in realign_context.
+ *
+ * NOTE(review): the stack depth is never compared with the capacity of the
+ * current_stack_* arrays in this function -- confirm that the capacity
+ * cannot be exceeded by nested events.
+ * NOTE(review): after the recursive call only current_stack_depth is
+ * restored; current_testing_segment_location keeps whatever value the
+ * nested walk left in it -- confirm that this is intended.
+ */
+int LRMrealign_one_segment_iterating( LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMrealign_context_t * realign_context){
+ unsigned int testing_pos; // chromosome coordinate compared with the current read base
+
+ int seg_id = realign_context -> current_segment_id;
+ testing_pos = realign_context -> current_next_iteration_chro_start;
+ int having_mismatches = 0; // length of the current run of mismatches; any match resets it, and it is no longer updated once it reaches LRMSOFTCLIPPING_MAX_MISMATCH
+
+ while(1){ // one iteration per read base, until the walk leaves the segment
+ if(realign_context -> current_dir_to_3end && realign_context -> current_testing_segment_location >= iteration_context -> segment_lengths[seg_id]) break;
+ if(realign_context -> current_dir_to_3end == 0 && realign_context -> current_testing_segment_location < 0) break;
+
+ char chro_base = LRMgvindex_get(&context -> current_base_index , testing_pos);
+ char read_base = iteration_context -> segment_texts[ realign_context -> current_segment_id ][ realign_context -> current_testing_segment_location ];
+ if(having_mismatches < LRMSOFTCLIPPING_MAX_MISMATCH){
+ if(chro_base == read_base){
+ having_mismatches = 0;
+ realign_context -> current_stack_softcliping_length[ realign_context -> current_stack_depth ] = 0; // a match cancels the soft clip noted for this stack level
+ }else{
+ if(LRMSOFTCLIPPING_MAX_MISMATCH < 1000 && having_mismatches == 0)realign_context -> current_stack_softcliping_length[ realign_context -> current_stack_depth ] = realign_context -> current_dir_to_3end ? ( iteration_context -> segment_lengths[seg_id] - realign_context -> current_testing_segment_location ): (1+realign_context -> current_testing_segment_location); // first mismatch of a run: note a soft clip reaching from this base to the segment end being walked towards
+ having_mismatches ++;
+ }
+ }
+
+ if(having_mismatches < LRMSOFTCLIPPING_MAX_MISMATCH){ // events are only looked up while the mismatch run is below the limit
+ int found_events [LRMMAX_EVENTS_PER_SITE];
+ int event_i, events_at_here = LRMevents_search(context, testing_pos, realign_context -> current_dir_to_3end, found_events);
+
+ if(0 && events_at_here > 0) {
+ char postxt[100];
+ LRMpos2txt(context, testing_pos , postxt);
+ LRMprintf("Mismatch = %d ; Found %d events at %s ; to 3' = %d ; read pos : %d , read len : %d\n", having_mismatches, events_at_here, postxt, realign_context -> current_dir_to_3end, realign_context -> current_testing_segment_location , iteration_context -> segment_lengths[seg_id]);
+ }
+
+ for(event_i = 0; event_i < events_at_here; event_i++){
+ LRMevent_t * tested_event = context -> event_space + found_events[event_i];
+ unsigned int other_side_start = realign_context -> current_dir_to_3end? tested_event -> large_side: tested_event -> small_side; // chromosome coordinate where the walk would resume after the event
+ int next_seg_pos = realign_context -> current_dir_to_3end? realign_context -> current_testing_segment_location +1:( realign_context -> current_testing_segment_location -1); // read position where the walk would resume
+ if(tested_event -> indel_length < 0)next_seg_pos += ( realign_context -> current_dir_to_3end?-tested_event -> indel_length:tested_event -> indel_length ); // negative indel_length: skip |indel_length| further read bases in the walking direction
+ //LRMprintf(" events #%d type: %d; len: %d ; coor: %u ~ %u\n", found_events[event_i] , tested_event-> event_type, tested_event-> indel_length, tested_event->small_side, tested_event -> large_side);
+ if(LRM_should_jump( context, thread_context, iteration_context, realign_context, other_side_start, next_seg_pos, tested_event)){
+ int item_length = abs(realign_context -> current_testing_segment_location - realign_context -> current_stack_segment_first_base_position[ realign_context -> current_stack_depth ]) + 1; // +1: the current base belongs to the stretch being closed
+ unsigned int item_start_chro_pos = realign_context -> current_dir_to_3end? ( testing_pos - item_length ) : testing_pos ;
+ int item_start_seg_pos = realign_context -> current_dir_to_3end? ( realign_context -> current_testing_segment_location - item_length ): realign_context -> current_testing_segment_location ;
+
+ realign_context -> current_stack_segment_length[ realign_context -> current_stack_depth ] = item_length;
+ realign_context -> current_stack_matched_bases[ realign_context -> current_stack_depth ] = LRMmatch_chro( iteration_context -> segment_texts[ realign_context -> current_segment_id ] + item_start_seg_pos , &context -> current_base_index, item_start_chro_pos, realign_context -> current_stack_segment_length[ realign_context -> current_stack_depth ], 0);
+ realign_context -> current_stack_event_after[ realign_context -> current_stack_depth ] = found_events[event_i];
+ realign_context -> current_stack_depth ++; // push: the lines below initialise the new top of the stack
+ realign_context -> current_stack_softcliping_length[ realign_context -> current_stack_depth ] = 0;
+ realign_context -> current_stack_chro_first_base_position[ realign_context -> current_stack_depth ] = other_side_start;
+ realign_context -> current_stack_segment_first_base_position[ realign_context -> current_stack_depth ] = next_seg_pos;
+ realign_context -> current_next_iteration_chro_start = other_side_start;
+ realign_context -> current_testing_segment_location = next_seg_pos;
+ LRMrealign_one_segment_iterating( context, thread_context, iteration_context, realign_context ); // explore the rest of the segment beyond the event
+
+ realign_context -> current_stack_depth --; // pop; the positions changed by the nested walk are not restored here
+ }
+ }
+ }
+
+ if(realign_context -> current_dir_to_3end){
+ testing_pos ++;
+ realign_context -> current_testing_segment_location ++;
+ }else{
+ testing_pos --;
+ realign_context -> current_testing_segment_location --;
+ }
+ }
+
+ // testing no-event alignment
+ int item_length = abs(realign_context -> current_testing_segment_location - realign_context -> current_stack_segment_first_base_position[ realign_context -> current_stack_depth ]); // no +1 here, unlike the event case above
+ if( item_length >0) {
+ int item_start_seg_pos = realign_context -> current_dir_to_3end? ( realign_context -> current_testing_segment_location - item_length ): realign_context -> current_testing_segment_location ;
+ unsigned int item_start_chro_pos = realign_context -> current_dir_to_3end? ( testing_pos - item_length ) : testing_pos;
+
+ realign_context -> current_stack_segment_length[ realign_context -> current_stack_depth ] = item_length;
+ realign_context -> current_stack_matched_bases[ realign_context -> current_stack_depth ] = LRMmatch_chro( iteration_context -> segment_texts[ realign_context -> current_segment_id ] + item_start_seg_pos , &context -> current_base_index, item_start_chro_pos, realign_context -> current_stack_segment_length[ realign_context -> current_stack_depth ], 0);
+
+ //LRMprintf(" LAST TEST %s : DEP=%d, LEN=%d, MATCH=%d\n", realign_context -> current_dir_to_3end?"AFT":"FWD", realign_context -> current_stack_depth, item_length, realign_context -> current_stack_matched_bases[ realign_context -> current_stack_depth ]);
+
+ realign_context -> current_stack_depth ++; // temporarily count the final stretch as an item
+ realign_context -> current_stack_event_after[ realign_context -> current_stack_depth ] = 0; // NOTE(review): this writes the slot after the final item, not the final item's own slot -- confirm
+ LRMrealign_one_segment_bestresult(context, thread_context, iteration_context, realign_context);
+ realign_context -> current_stack_depth --;
+ }
+ return 0;
+}
+/*
+ * Builds the CIGAR text of a segment from the best stacks of both
+ * directions and stores it in realign_context -> best_cigar.
+ *
+ * Items are emitted in this order: the items of stack 0 from its highest
+ * index down to index 1, then one middle item that joins item 0 of stack 0
+ * with item 0 of stack 1, then the remaining items of stack 1 in ascending
+ * order. Each item contributes an optional leading soft clip, a match
+ * block, an optional trailing soft clip and, unless it is the last item,
+ * the operation of the event that follows it.
+ *
+ * The chromosome start of the first emitted item is stored in
+ * realign_context -> best_chro_pos. Always returns 1.
+ */
+int LRMrealign_one_segment_build_cigar(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMrealign_context_t * realign_context){
+ int item_i, all_items = realign_context -> best_stack_depth[0]+realign_context -> best_stack_depth[1] - (realign_context -> best_stack_depth[0] >0? 1:0); // item 0 of both stacks is emitted as one item, hence one less when stack 0 is not empty
+ realign_context -> best_cigar[0] = 0;
+ int cigar_len_ptr = 0; // write offset into best_cigar; the guards below skip further writes once it reaches LRMSEGMENT_CIGAR_SIZE
+ unsigned int entire_segment_chro_pos = 0;
+ for(item_i = 0; item_i < all_items; item_i++){
+ int start_on_seg = -1, length_of_seg = -1;
+ unsigned int start_on_chro = 0;
+ LRMevent_t * event_3end_seg = NULL;
+ int softcliping_len_before = 0, softcliping_len_after = 0;
+
+ if(item_i < realign_context -> best_stack_depth[0]-1){ // items of stack 0 other than its item 0
+ int itemidx = realign_context -> best_stack_depth[0] - item_i - 1; // highest index first
+ length_of_seg = realign_context -> best_stack_segment_length[0][itemidx];
+ start_on_seg = realign_context -> best_stack_segment_first_base_position[0][itemidx] - length_of_seg;
+ start_on_chro = realign_context -> best_stack_chro_first_base_position[0][itemidx] - length_of_seg + 1;
+
+ if( item_i == 0 )softcliping_len_before = realign_context -> best_stack_softcliping_length[0][itemidx];
+ if( item_i == all_items - 1 )softcliping_len_after = realign_context -> best_stack_softcliping_length[1][itemidx]; // NOTE(review): reads stack 1 with an index computed for stack 0 -- confirm
+ event_3end_seg = NULL;
+ if(item_i < all_items - 1) event_3end_seg = context -> event_space + realign_context -> best_stack_event_after[0][itemidx - 1]; // the event towards the next emitted item is recorded on the item below in stack 0
+ }else if (item_i == realign_context -> best_stack_depth[0] -1){ // the middle item: item 0 of stack 0 joined with item 0 of stack 1
+ length_of_seg = realign_context -> best_stack_segment_length[0][0] + realign_context -> best_stack_segment_length[1][0];
+ start_on_seg = realign_context -> best_stack_segment_first_base_position[0][0] - realign_context -> best_stack_segment_length[0][0];
+ start_on_chro = realign_context -> best_stack_chro_first_base_position[0][0] - realign_context -> best_stack_segment_length[0][0] + 1;
+
+ if(0 && realign_context -> current_segment_id == 161){
+ char postxt[100];
+ LRMpos2txt(context, start_on_chro, postxt);
+ LRMprintf("MID_SECTION LEN=%d, START=%s, %d ; B0_0 LEN=%d, B0 DEPTH=%d B0_1 LEN=%d B1_0 LEN=%d \n", length_of_seg, postxt, start_on_seg, realign_context -> best_stack_segment_length[0][0], realign_context -> best_stack_depth[0], realign_context -> best_stack_segment_length[0][1], realign_context -> best_stack_segment_length[1][0]);
+ }
+
+ if(item_i == 0)softcliping_len_before = realign_context -> best_stack_softcliping_length[0][ realign_context -> best_stack_depth[0]-1 ];
+ if(item_i == all_items - 1)softcliping_len_after = realign_context -> best_stack_softcliping_length[1] [0];
+ event_3end_seg = NULL;
+ if(item_i < all_items - 1) event_3end_seg = context -> event_space + realign_context -> best_stack_event_after[1][0];
+ }else{ // remaining items of stack 1
+ int itemidx = item_i - realign_context -> best_stack_depth[0] + (realign_context -> best_stack_depth[0] >0? 1:0);
+ length_of_seg = realign_context -> best_stack_segment_length[1][itemidx];
+ start_on_seg = realign_context -> best_stack_segment_first_base_position[1][itemidx];
+ start_on_chro = realign_context -> best_stack_chro_first_base_position[1][itemidx];
+
+ if( item_i == 0 )softcliping_len_before = realign_context -> best_stack_softcliping_length[0][itemidx]; // NOTE(review): reads stack 0 with an index computed for stack 1 -- confirm
+ if( item_i == all_items - 1 )softcliping_len_after = realign_context -> best_stack_softcliping_length[1][itemidx];
+ event_3end_seg = NULL;
+ if(item_i < all_items - 1) event_3end_seg = context -> event_space + realign_context -> best_stack_event_after[1][itemidx];
+ }
+
+ if(softcliping_len_before)if(cigar_len_ptr < LRMSEGMENT_CIGAR_SIZE) cigar_len_ptr += snprintf(realign_context -> best_cigar + cigar_len_ptr, LRMSEGMENT_CIGAR_SIZE - cigar_len_ptr, "%dS",softcliping_len_before);
+ if(cigar_len_ptr < LRMSEGMENT_CIGAR_SIZE)cigar_len_ptr += snprintf(realign_context -> best_cigar + cigar_len_ptr, LRMSEGMENT_CIGAR_SIZE - cigar_len_ptr, "%dM", length_of_seg - softcliping_len_before - softcliping_len_after); // the match block excludes the soft-clipped bases
+ if(softcliping_len_after)
+ if(cigar_len_ptr < LRMSEGMENT_CIGAR_SIZE)cigar_len_ptr += snprintf(realign_context -> best_cigar + cigar_len_ptr, LRMSEGMENT_CIGAR_SIZE - cigar_len_ptr, "%dS",softcliping_len_after);
+ if(event_3end_seg){
+ char cigar_op = event_3end_seg -> event_type == LRMEVENT_TYPE_JUNCTION?'N':(event_3end_seg -> indel_length > 0?'D':'I' ); // junction: N; otherwise D for a positive indel_length and I for the rest
+ int cigar_oplen = event_3end_seg -> event_type == LRMEVENT_TYPE_JUNCTION?(event_3end_seg -> large_side - event_3end_seg -> small_side - 1 ):abs(event_3end_seg -> indel_length);
+ if(cigar_len_ptr < LRMSEGMENT_CIGAR_SIZE)cigar_len_ptr += snprintf(realign_context -> best_cigar + cigar_len_ptr, LRMSEGMENT_CIGAR_SIZE - cigar_len_ptr, "%d%c", cigar_oplen, cigar_op);
+ }
+ if(0 == item_i) entire_segment_chro_pos = start_on_chro; // the first emitted item gives the reported position
+ if(softcliping_len_after) break; // nothing is emitted after a trailing soft clip
+ }
+
+ if(0 && entire_segment_chro_pos > 0){
+ char postxt[100];
+ LRMpos2txt(context, entire_segment_chro_pos, postxt);
+ LRMprintf("\nREAD %s [%d]\n=== pos: %s TXT %.*s ===", iteration_context -> read_name, realign_context -> current_segment_id, postxt, iteration_context -> segment_lengths[realign_context -> current_segment_id], iteration_context -> segment_texts[realign_context -> current_segment_id]);
+ LRMprintf("=== CIGAR BUILT : %s ; ITEMS = %d + %d = %d ===\n", realign_context -> best_cigar, realign_context -> best_stack_depth[0], realign_context -> best_stack_depth[1], all_items);
+ }
+
+ realign_context -> best_chro_pos = entire_segment_chro_pos;
+ return 1;
+}
+
+/*
+ * Resets realign_context for a new walk over the segment in one direction.
+ *
+ * dir_to_3end == 1 starts at the candidate's confident_coverage_start;
+ * dir_to_3end == 0 starts one base before it, so that the two walks do not
+ * share a base. The best result stored for that direction is cleared.
+ */
+static void LRMrealign_start_direction(LRMrealign_context_t * realign_context, LRMsegment_mapping_candidate_t * cand_res, int dir_to_3end){
+	int step_back = dir_to_3end ? 0 : 1;
+
+	realign_context -> current_dir_to_3end = dir_to_3end;
+	realign_context -> best_stack_score[dir_to_3end] = 0;
+	realign_context -> current_stack_depth = 0;
+	realign_context -> best_stack_depth[dir_to_3end] = 0;
+	realign_context -> best_stack_segment_length[dir_to_3end][0] = 0;
+
+	realign_context -> current_stack_segment_first_base_position[0] = cand_res -> confident_coverage_start - step_back;
+	realign_context -> current_next_iteration_chro_start = cand_res -> first_base_position + cand_res -> confident_coverage_start - step_back;
+	realign_context -> current_stack_chro_first_base_position[0] = realign_context -> current_next_iteration_chro_start;
+	realign_context -> current_stack_softcliping_length[0] = 0;
+	realign_context -> current_testing_segment_location = cand_res -> confident_coverage_start - step_back;
+}
+
+/*
+ * Realigns one candidate location of one segment and leaves the outcome in
+ * realign_context (best_cigar, best_chro_pos, best_stack_score).
+ *
+ * With context -> do_junction_detection set, the segment is walked from the
+ * candidate's confident_coverage_start towards both ends with
+ * LRMrealign_one_segment_iterating() and the two best stacks are turned into
+ * a CIGAR by LRMrealign_one_segment_build_cigar(). For a candidate flagged
+ * LRMIS_NEGATIVE_STRAND the segment text is passed through LRMreverse_read()
+ * before the walks and once more afterwards.
+ *
+ * Without junction detection a placeholder CIGAR of the form "<h>S1M<t>S"
+ * is written, the position is the candidate's first_base_position and both
+ * scores are the candidate's votes.
+ *
+ * Always returns 0.
+ */
+int LRMrealign_one_segment(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMrealign_context_t * realign_context){
+	LRMread_mapping_result_t * read_res = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+	LRMsegment_mapping_result_t * seg_res = read_res -> segment_results + realign_context -> current_segment_id;
+	LRMsegment_mapping_candidate_t * cand_res = seg_res -> candidates + realign_context -> current_candidate_id;
+
+	if(!context -> do_junction_detection){
+		int skip_head = (cand_res -> confident_coverage_start + cand_res -> confident_coverage_end)/2;
+		int skip_tail = iteration_context -> segment_lengths[realign_context -> current_segment_id] - 1 - skip_head;
+
+		// Bounded like every other write to best_cigar (see the CIGAR builder).
+		snprintf(realign_context -> best_cigar, LRMSEGMENT_CIGAR_SIZE, "%dS1M%dS", skip_head, skip_tail);
+		realign_context -> best_chro_pos = cand_res -> first_base_position;
+		realign_context -> best_stack_score[0] = cand_res -> votes;
+		realign_context -> best_stack_score[1] = cand_res -> votes;
+		return 0;
+	}
+
+	int read_reversed = (cand_res -> masks & LRMIS_NEGATIVE_STRAND) != 0;
+	if(read_reversed)
+		LRMreverse_read(iteration_context -> segment_texts[realign_context -> current_segment_id], iteration_context -> segment_lengths[realign_context -> current_segment_id]);
+
+	// Walk towards the 5' end first, then towards the 3' end.
+	LRMrealign_start_direction(realign_context, cand_res, 0);
+	LRMrealign_one_segment_iterating(context, thread_context, iteration_context, realign_context);
+
+	LRMrealign_start_direction(realign_context, cand_res, 1);
+	LRMrealign_one_segment_iterating(context, thread_context, iteration_context, realign_context);
+
+	LRMrealign_one_segment_build_cigar(context, thread_context, iteration_context, realign_context);
+
+	// Second pass through LRMreverse_read(), as the segment text is shared with the caller.
+	if(read_reversed)
+		LRMreverse_read(iteration_context -> segment_texts[realign_context -> current_segment_id], iteration_context -> segment_lengths[realign_context -> current_segment_id]);
+
+	return 0;
+}
diff --git a/src/longread-mapping/LRMchro-event.h b/src/longread-mapping/LRMchro-event.h
new file mode 100644
index 0000000..709d837
--- /dev/null
+++ b/src/longread-mapping/LRMchro-event.h
@@ -0,0 +1,40 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#ifndef __LRMCHRO_EVENT_H_
+#define __LRMCHRO_EVENT_H_
+
+#include "LRMconfig.h"
+
+int LRMevents_load_annot(LRMcontext_t * context);
+int LRMevents_filtering(LRMcontext_t * context);
+int LRMevents_reorder(LRMcontext_t * context);
+int LRMevents_build_entries(LRMcontext_t * context);
+int LRMevents_lookup(LRMcontext_t * context, unsigned int abs_pos, int event_type_masks, int search_large_side, LRMevent_t ** res);
+
+int LRMchro_event_new(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMevent_t * new_event);
+/*
+ * Scores the best split point in the read gap between the candidate
+ * mapping_result and a secondary mapping location. Returns a positive score
+ * and fills the three output pointers when a split point is accepted;
+ * returns 0 and leaves them untouched otherwise.
+ * NOTE(review): the definition in LRMchro-event.c names the outputs
+ * final_split_point_on_read and final_left_indel_offset -- consider using
+ * the same names here.
+ */
+int LRMdonor_score(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMsegment_mapping_candidate_t * mapping_result, int seg_id, unsigned int secondary_mapping_location, unsigned int secondary_coverage_start , unsigned int secondary_coverage_end, int indel_in_anchor, int indel_in_secondary, int * final_split_point, int *is_GT_AG_strand, int * left_offset_indels);
+
+
+// high_penalty_for_creating_gap is for creating fewer CIGAR options (<65535)
+int LRMindel_dynamic_search(LRMcontext_t* context, LRMthread_context_t* thread_context, int expected_indels, unsigned int last_correct_base_on_chro, char * corrected_read, int last_correct_base, int first_correct_base, unsigned int * total_mismatched, int high_penalty_for_creating_gap, char * read_name);
+int LRMrealign_one_segment(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMrealign_context_t * realign_context); // always returns 0; the result is left in realign_context (best_cigar, best_chro_pos, best_stack_score)
+void LRMreverse_read_and_qual(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context);
+int LRMindel_dynamic_search_unknownregion(LRMcontext_t* context, LRMthread_context_t* thread_context, int search_to_3end, unsigned int last_correct_base_on_chro, char * corrected_read, int last_correct_base, int first_correct_base,unsigned int * total_mismatched_bases, int high_penalty_create_gap, char * read_name);
+#endif
diff --git a/src/longread-mapping/LRMconfig.h b/src/longread-mapping/LRMconfig.h
new file mode 100644
index 0000000..7baf7a0
--- /dev/null
+++ b/src/longread-mapping/LRMconfig.h
@@ -0,0 +1,389 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#ifndef __LRMconfig_h_
+#define __LRMconfig_h_
+
+#include <pthread.h>
+#include <zlib.h>
+#include "seek-zlib.h"
+#include "hashtable.h"
+
+#define LRMMAX_FILENAME_LENGTH 500
+#define LRMMAX_READ_NAME_LEN 256
+#define LRMMAX_CHROMOSOME_NAME_LEN 256
+#define LRMMAX_EVENTS_PER_SITE 10
+
+#define LRMMAX_READ_LENGTH 1300000
+#define LRMLONGREAD_DENOMINATOR 100000
+#define LRMMAX_THREADS 64
+#define LRMSEGMENT_MIN_LENGTH 100
+#define LRMSEGMENT_MIN_DIST ( LRMSEGMENT_MIN_LENGTH - 33 )
+
+#define LRMREADS_PER_CHUNK 300000
+#define LRMSEGMENT_MAX_CANDIDATES 5
+#define LRMMERGING_MAX_CANDIDATES 5
+#define LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT 85
+#define LRMMAX_REALIGNMENT_ITEMS 10
+#define LRMMAX_MULTI_BEST 16
+
+#define LRMMAX_INDEL_TOLERANCE 16
+#define LRMMAX_INDEL_SECTIONS 7
+#define LRMMAX_CIGAR_OPTS_IN_BAM_READ 65500
+#define LRMMAX_CIGAR_OPTS_IN_SAM_READ (65500*2)
+#define LRMSEGMENT_CIGAR_SIZE (500)
+#define LRMMERGE_CIGAR_SIZE (LRMMAX_CIGAR_OPTS_IN_SAM_READ * 12)
+#define LRMSOFTCLIPPING_MAX_MISMATCH 2
+#define LRMJUMP_MISMATCH_TOLERANCE 4
+#define LRMDYNAMIC_MAXIMUM_GAP_LENGTH (3000 + 48000)
+#define LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE (161)
+
+
+#define LRMSUBREAD_INDEX_OPTION_INDEX_GAP 0x0101
+#define LRMSUBREAD_INDEX_OPTION_INDEX_PADDING 0x0102
+#define LRMMULTI_THREAD_OUTPUT_ITEMS (4096 * 3/5 *3)
+
+
+#define LRMIS_REVERSED_HALVES 1
+#define LRMIS_PAIRED_MATCH 2
+#define LRMIS_NEGATIVE_STRAND 4
+#define LRMIS_BREAKEVEN_READ 8
+#define LRMIS_IN_KNOWN_EXONS 16
+
+#define LRM_EVENT_IS_GT_AT_DONOR 1
+#define LRM_EVENT_IS_ANNOTATED 2
+
+#define LRMEVENT_TYPE_JUNCTION 30
+#define LRMEVENT_TYPE_INDEL 20
+
+#define LRMBAM_COMPRESS_LEVEL Z_BEST_SPEED
+#define LRMGZIP_WINDOW_BITS -15
+
+#define LRMSUBINDEX_VER2 201
+
+#define LRMRUNNING_STEP_VOTING 10
+#define LRMRUNNING_STEP_REALIGN 30
+
+#define LRMthread_lock_t pthread_mutex_t
+#define LRMthread_lock pthread_mutex_lock
+#define LRMthread_lockrelease pthread_mutex_unlock
+#define LRMthread_lockinit(a) pthread_mutex_init(a, NULL)
+
+#define LRMGENE_VOTE_SPACE 169
+#define LRMGENE_VOTE_TABLE_SIZE 128
+
+typedef char LRMgene_quality_score_t;
+typedef unsigned int LRMgehash_key_t;
+typedef unsigned int LRMgehash_data_t;
+
+typedef struct { /* Vote table: LRMGENE_VOTE_TABLE_SIZE rows, each with room for
+ * LRMGENE_VOTE_SPACE candidate slots.  All per-slot arrays share the
+ * same [row][slot] indexing. */
+ unsigned short items[LRMGENE_VOTE_TABLE_SIZE]; // one counter per row; presumably the number of used slots -- confirm against the voting code
+ unsigned int pos [LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE];
+ unsigned short votes [LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE];
+ short masks [LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE];
+ short indel_recorder [LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE][LRMMAX_INDEL_SECTIONS*3]; // LRMMAX_INDEL_SECTIONS*3 shorts per slot (presumably three values per indel section -- confirm)
+ char current_indel_cursor[LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE];
+ unsigned short toli[LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE];
+
+ unsigned int coverage_start [LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE];
+ unsigned int coverage_end [LRMGENE_VOTE_TABLE_SIZE][LRMGENE_VOTE_SPACE];
+} LRMgene_vote_t ;
+
+typedef struct { /* Working state for one read while it is being processed.
+ * NOTE(review): every buffer is a fixed-size array, so the structure is very
+ * large: merged_cigar alone is LRMMAX_MULTI_BEST * (LRMMERGE_CIGAR_SIZE + 1)
+ * bytes (about 25 MB with the constants in this header) and segment_cigars
+ * is larger still.  It presumably has to be heap-allocated -- confirm at
+ * the allocation site. */
+ unsigned int read_no_in_chunk;
+ unsigned int read_length;
+ char read_name[LRMMAX_READ_NAME_LEN];
+ char read_text[LRMMAX_READ_LENGTH];
+ char qual_text[LRMMAX_READ_LENGTH];
+
+ int total_segments;
+ int mapped_segments;
+ char * segment_texts[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1]; // the per-segment arrays below all have LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST + 1 entries
+ char * segment_quals[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1];
+ unsigned int subread_offsets[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1][LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT];
+ unsigned int segment_lengths[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1];
+
+ char segment_cigars[ LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1 ] [LRMMERGING_MAX_CANDIDATES] [ LRMSEGMENT_CIGAR_SIZE + 1]; // indexed [segment][candidate]; the +1 presumably leaves room for the terminating NUL
+ unsigned int segment_best_pos[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1 ] [LRMMERGING_MAX_CANDIDATES];
+ int segment_best_masks[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1 ] [LRMMERGING_MAX_CANDIDATES];
+ int segment_best_candidate_score[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST +1 ] [LRMMERGING_MAX_CANDIDATES];
+ int segment_best_votes[LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1 ] [LRMMERGING_MAX_CANDIDATES];
+
+ int is_reversed;
+ int added_mismatch_allowed_in_subread;
+ unsigned short extracted_subreads;
+
+ LRMgene_vote_t vote_table;
+
+ int merged_masks[LRMMAX_MULTI_BEST]; // the merged_* arrays hold up to LRMMAX_MULTI_BEST entries
+ unsigned int merged_position[LRMMAX_MULTI_BEST];
+ char merged_cigar [LRMMAX_MULTI_BEST][ LRMMERGE_CIGAR_SIZE + 1];
+ int merged_matched_bases[LRMMAX_MULTI_BEST];
+} LRMread_iteration_context_t;
+
+typedef struct { /* State used while realigning one segment.  The current_stack_*
+ * arrays hold up to LRMMAX_REALIGNMENT_ITEMS entries; the best_stack_*
+ * arrays keep two copies of the same fields (index 0 and 1; presumably one
+ * per search direction, cf. current_dir_to_3end -- confirm in the
+ * realignment code). */
+ int current_dir_to_3end;
+ int current_segment_id;
+ int current_candidate_id;
+ int current_testing_segment_location;
+ unsigned int current_next_iteration_chro_start;
+
+ int current_stack_depth;
+ int current_stack_matched_bases[LRMMAX_REALIGNMENT_ITEMS];
+ int current_stack_segment_length[LRMMAX_REALIGNMENT_ITEMS];
+ int current_stack_segment_first_base_position[LRMMAX_REALIGNMENT_ITEMS];
+ int current_stack_event_after[LRMMAX_REALIGNMENT_ITEMS];
+ int current_stack_softcliping_length[LRMMAX_REALIGNMENT_ITEMS];
+ unsigned int current_stack_chro_first_base_position[LRMMAX_REALIGNMENT_ITEMS];
+
+ int best_stack_depth[2];
+ int best_stack_segment_length[2][LRMMAX_REALIGNMENT_ITEMS];
+ int best_stack_segment_first_base_position[2][LRMMAX_REALIGNMENT_ITEMS];
+ int best_stack_event_after[2][LRMMAX_REALIGNMENT_ITEMS];
+ int best_stack_softcliping_length[2][LRMMAX_REALIGNMENT_ITEMS];
+ unsigned int best_stack_chro_first_base_position[2][LRMMAX_REALIGNMENT_ITEMS];
+ int best_stack_score[2];
+
+ char best_cigar[ LRMSEGMENT_CIGAR_SIZE + 1 ]; // same capacity as one entry of segment_cigars in LRMread_iteration_context_t
+ unsigned int best_chro_pos;
+
+} LRMrealign_context_t;
+
+
+typedef struct{ /* One candidate mapping location for a read segment. */
+ unsigned int first_base_position;
+ unsigned int confident_coverage_start, confident_coverage_end;
+ unsigned short votes;
+ unsigned short masks;
+ int indel_length_inside;
+ short indel_records[LRMMAX_INDEL_SECTIONS*3]; // same per-slot size as indel_recorder in LRMgene_vote_t
+
+ // for junctions
+ unsigned int secondary_position;
+ unsigned short secondary_votes;
+ unsigned char junction_split_point; // NOTE(review): only 8 bits wide, so a split point above 255 cannot be stored -- confirm the range used by callers
+ unsigned char junction_is_GT_AG;
+ int junction_left_offset_indels;
+} LRMsegment_mapping_candidate_t;
+
+
+typedef struct{ /* Mapping result of one segment: up to LRMSEGMENT_MAX_CANDIDATES candidate locations. */
+ char extracted_subreads; // NOTE(review): plain char, while LRMread_iteration_context_t stores its extracted_subreads as unsigned short
+ LRMsegment_mapping_candidate_t candidates [LRMSEGMENT_MAX_CANDIDATES];
+} LRMsegment_mapping_result_t;
+
+typedef struct{ /* Mapping result of one read; LRMcontext_t keeps LRMREADS_PER_CHUNK of these. */
+ LRMsegment_mapping_result_t * segment_results; // pointer to per-segment results; ownership is not visible in this header
+ unsigned int final_pos;
+ unsigned int votes[LRMMAX_MULTI_BEST];
+ int masks;
+ void * best_candidate; // untyped; the pointed-to type is not established by this header
+} LRMread_mapping_result_t;
+
+typedef struct { /* An opened read file; filled in by LRMgeinput_open(). */
+ char filename [LRMMAX_FILENAME_LENGTH];
+ int file_type ; // LRMGENE_INPUT_FASTQ or LRMGENE_INPUT_GZIP_FASTQ; selects how input_fp is interpreted
+ void * input_fp; // FILE * for plain input, heap-allocated seekable_zfile_t * for gzip input
+ unsigned long long read_chunk_start;
+} LRMgene_input_t;
+
+
+
+typedef struct{ /* Base-value index (see LRMcontext_t.current_base_index). */
+ unsigned int memory_block_size;
+ unsigned int start_base_offset;
+ unsigned int start_point;
+ unsigned int length;
+ unsigned char * values; // raw byte buffer; presumably values_bytes bytes long -- confirm in the index loader
+ unsigned int values_bytes;
+} LRMgene_value_index_t;
+
+
+
+struct LRMgehash_bucket { /* One bucket of LRMgehash_t: parallel arrays of keys and values. */
+ int current_items; // presumably the number of entries in use -- confirm
+ int space_size; // presumably the allocated capacity of the two arrays -- confirm
+ short * new_item_keys;
+ LRMgehash_data_t * item_values;
+};
+
+
+typedef struct { /* Hash-table index made of buckets_number LRMgehash_bucket entries. */
+ int version_number;
+ unsigned long long int current_items;
+ int buckets_number;
+ char is_small_table;
+ struct LRMgehash_bucket * buckets;
+ int index_gap; // presumably the value of index option LRMSUBREAD_INDEX_OPTION_INDEX_GAP -- confirm
+ int padding; // presumably the value of index option LRMSUBREAD_INDEX_OPTION_INDEX_PADDING -- confirm
+} LRMgehash_t;
+
+
+
+typedef struct{ /* Per-thread state; LRMcontext_t holds LRMMAX_THREADS of these. */
+ int thread_id;
+
+ // SAM BAM output
+ int output_buffer_item, output_buffer_pointer;
+ LRMthread_lock_t output_lock;
+ char * out_SAMBAM_buffer; // growable output buffer (resized with realloc by the writers in LRMfile-io.c)
+ int out_buff_used; // bytes currently held in out_SAMBAM_buffer
+ int out_buff_capacity; // allocated size of out_SAMBAM_buffer
+
+ z_stream bam_file_output_stream; // re-initialised for every block by LRMwrite_chunk_compress_bam_block()
+
+ // Chro events
+ HashTable * events_fresh; // only used in voting step ; event_name "I:chrX:99881122:-5" -> supporting_reads
+ unsigned int * events_realignment_counts; // event_ids -> n_supp
+
+ unsigned int * dynamic_programming_score_buffer;
+ char * dynamic_programming_movement_buffer;
+ char * dynamic_programming_indel_movement_buf;
+ unsigned int mapped_reads;
+
+} LRMthread_context_t;
+
+typedef struct{ /* One chromosomal event (see the LRMEVENT_TYPE_* constants). */
+ unsigned int small_side, large_side;
+ unsigned int supporting_reads;
+ short masks;
+ char event_type; // presumably LRMEVENT_TYPE_JUNCTION or LRMEVENT_TYPE_INDEL -- confirm
+ char indel_length; // >0: del; <0:ins (a char, so the representable length is limited to the range of char)
+} LRMevent_t;
+
+
+typedef struct { /* Global context of one run: options, shared input/output state,
+ * the loaded index and per-thread contexts.  It embeds
+ * read_mapping_results[LRMREADS_PER_CHUNK] and thread_contexts[LRMMAX_THREADS]
+ * by value, so the structure itself is large. */
+ char user_command_line[10000]; // written verbatim into the @PG header line by LRMsambam_write_header()
+
+ char input_file_name [LRMMAX_FILENAME_LENGTH];
+ char output_file_name [LRMMAX_FILENAME_LENGTH];
+ char index_prefix [LRMMAX_FILENAME_LENGTH];
+
+ int do_junction_detection;
+ int threads;
+ int is_SAM_output; // non-zero: text SAM records; zero: BGZF-compressed BAM
+ int max_dynamic_indel_length;
+ int max_read_indel_length;
+ int max_cigar_opts_in_read;
+ int multi_best_read_alignments;
+ int is_Phred_64; // non-zero: qualities are shifted by (64-33) before output
+ int max_junction_distance;
+ int max_mismatched_bases_in_subread;
+ int max_subreads_per_segment;
+ int min_voting_number;
+ int min_matched_bases;
+ int segment_overlapping;
+ int result_merge_tolerance;
+ int show_read_validation;
+ int unique_only;
+ int max_best_alignments;
+
+ // RUNNING STATUS
+
+ double start_running_time;
+
+ pthread_t running_threads[LRMMAX_THREADS];
+ LRMthread_context_t thread_contexts[LRMMAX_THREADS];
+ LRMthread_lock_t input_lock; // held by LRMfetch_next_read() while a read is taken from input_file
+ LRMgene_input_t input_file;
+ LRMgehash_t current_index;
+ int current_index_padding; // set by LRMload_offsets() from the index padding option
+ LRMgene_value_index_t current_base_index;
+ seekable_position_t last_saved_zlib_pos;
+ unsigned long long int last_saved_raw_pos;
+
+ int input_exhausted; // set to 1 by LRMfetch_next_read() when no further read could be obtained
+ int processed_reads_in_chunk; // incremented under input_lock; bounded by LRMREADS_PER_CHUNK
+ int all_processed_reads;
+
+ // Mapping results
+ LRMread_mapping_result_t read_mapping_results[LRMREADS_PER_CHUNK];
+ unsigned int mapped_reads;
+
+ // Output
+ char bam_file_tail_binary[200]; // filled once by LRMbam_generate_tail_binary() with a compressed empty block
+ int bam_file_tail_length;
+ int sam_bam_file_header_written;
+ FILE * sam_bam_file;
+
+ ArrayList * chromosome_size_list; // entry i: linear end offset of chromosome i, stored as (NULL + offset)
+ HashTable * chromosome_size_table; // chromosome name -> size, stored as (NULL + size)
+
+ LRMthread_lock_t sam_bam_file_lock; // serialises fwrite() calls on sam_bam_file
+ HashTable * sam_bam_chromosome_table; // chromosome name -> (index + 1), stored as (NULL + index + 1)
+ ArrayList * sam_bam_chromosome_list; // chromosome names in index order
+
+ // Chro Events
+ LRMthread_lock_t event_space_lock;
+ HashTable * events_realignment; // only used in realignment step ; entry_position -> [max_N_size, N, event_id1, event_id2, ..., event_idN]
+ LRMevent_t * event_space;
+ unsigned int event_space_size;
+ unsigned int event_number;
+
+ // Dynamic Programming
+ int dynamic_programming_score_match, dynamic_programming_score_mismatch, dynamic_programming_score_create_gap, dynamic_programming_score_extend_gap;
+} LRMcontext_t;
+
+
+#define LRMprintf printf
+#define LRMputs puts
+
+#define abs(a) ((a)>=0?(a):-(a)) // NOTE(review): shadows the standard abs(); like max/min it evaluates its argument more than once, so avoid side effects in arguments
+#define max(a,b) ((a)<(b)?(b):(a))
+#define min(a,b) ((a)>(b)?(b):(a))
+#define LRMint2base(c) (1413695297 >> (8*(c))&0xff) // 1413695297 == 0x54434741, i.e. the bytes 'A','G','C','T' from the least significant end: 0->'A', 1->'G', 2->'C', 3->'T'
+#define LRMbase2int(c) ((c)<'G'?((c)=='A'?0:2):((c)=='G'?1:3)) // inverse of LRMint2base: 'A'->0, 'G'->1, any other character below 'G' (e.g. 'C')->2, any other character (e.g. 'T', 'N')->3
+
+#define LRMpthread_create pthread_create
+#define LRMpthread_join pthread_join
+
+
+
+int LRMshow_conf(LRMcontext_t* context);
+int LRMrun_task(LRMcontext_t* context);
+int LRMfinalise(LRMcontext_t* context);
+int LRMdestroy_context(LRMcontext_t* context);
+int LRMprint_mapping_summary(LRMcontext_t* context);
+
+int LRMstart_thread(LRMcontext_t * context, int task );
+int LRMwait_threads( LRMcontext_t * context );
+int LRMmerge_threads( LRMcontext_t * context , int step);
+
+void LRMset_default_values_context(LRMcontext_t * context);
+int LRMinput_has_finished( LRMcontext_t * context );
+int LRMload_index(LRMcontext_t* context);
+int LRMsave_input_pos(LRMcontext_t* context);
+int LRMrewind_input_pos(LRMcontext_t* context);
+int LRMmap_chunk_reads(LRMcontext_t* context);
+int LRMrealign_write_chunk_reads(LRMcontext_t* context);
+int LRMfinalise_chunk_reads(LRMcontext_t* context);
+int LRMiterate_reads( LRMcontext_t * context, int task );
+double LRMmiltime();
+
+void * LRMmap_chunk_reads_run(void * thread_args);
+void * LRMrealign_write_chunk_reads_run(void * thread_args);
+int LRMchunk_read_iteration(LRMcontext_t * context, int thread_id, int task);
+int LRMvalidate_and_init_context(LRMcontext_t ** context, int argc, char ** argv);
+//void LRMwrite_chunk_destroy_write_buffer(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iter_context, LRMwritting_buffer_t * buf);
+
+
+
+
+int FIXLENstrcmp(char * fixed_len, char * rname);
+
+#endif
diff --git a/src/longread-mapping/LRMfile-io.c b/src/longread-mapping/LRMfile-io.c
new file mode 100644
index 0000000..17b9cc7
--- /dev/null
+++ b/src/longread-mapping/LRMfile-io.c
@@ -0,0 +1,673 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#include <stdio.h>
+#include <signal.h>
+#include <dirent.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <zlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "LRMfile-io.h"
+#include "LRMsorted-hashtable.h"
+
+/*
+ * Pack the first 16 characters of `key` into one 32-bit value, two bits per
+ * base, with key[0] in the most significant bit pair.  Each base is encoded
+ * with LRMbase2int().
+ *
+ * `key` must hold at least 16 characters.
+ *
+ * The value is accumulated in an unsigned integer: key[0] is shifted left by
+ * 30 bits, which overflows a signed int (undefined behaviour) whenever its
+ * code is 2 or 3.  The bit pattern returned is unchanged.
+ */
+int LRMgenekey2int(char key []){
+	unsigned int packed = 0;
+	int i;
+
+	for (i = 0; i < 16; i++)
+		packed |= ((unsigned int)LRMbase2int(key[i])) << (2 * (15 - i));
+
+	return (int)packed;
+}
+
+/*
+ * Open a read file for input.
+ *
+ * The first two bytes decide the reader: the gzip magic number (31, 139)
+ * selects the seekable gzip reader, anything else is read as a plain file
+ * from its beginning.
+ *
+ * Returns 0 on success; 1 when the name is too long, the file cannot be
+ * opened or memory for the gzip reader cannot be allocated; for gzip input
+ * the return value of seekgz_open() is passed through.
+ */
+int LRMgeinput_open(const char * filename, LRMgene_input_t * input){
+	if(strlen(filename) > LRMMAX_FILENAME_LENGTH-2)
+		return 1;
+	strcpy(input->filename, filename);
+
+	FILE * probe_fp = fopen(filename, "rb");
+	if(probe_fp == NULL)
+		return 1;
+
+	int id1 = fgetc(probe_fp);
+	int id2 = fgetc(probe_fp);
+
+	if(id1 == 31 && id2 == 139) {
+		// gzip input: the probing handle is no longer needed
+		fclose(probe_fp);
+		input->input_fp = malloc(sizeof(seekable_zfile_t));
+		input->file_type = LRMGENE_INPUT_GZIP_FASTQ;
+		if(input->input_fp == NULL)
+			return 1;	// previously a NULL reader was handed to seekgz_open()
+		return seekgz_open(filename, input->input_fp);
+	}
+
+	// plain input: keep the handle and go back to the first byte
+	input->file_type = LRMGENE_INPUT_FASTQ;
+	input->input_fp = probe_fp;
+	fseek(probe_fp, 0, SEEK_SET);
+	return 0;
+}
+
+/* Close an input opened by LRMgeinput_open(), using the closer that matches its file type. */
+void LRMgeinput_close(LRMgene_input_t * input){
+	if(input -> file_type != LRMGENE_INPUT_GZIP_FASTQ){
+		fclose((FILE*)input->input_fp);
+		return;
+	}
+	seekgz_close((seekable_zfile_t * ) input->input_fp);
+}
+
+char * LRM__converting_char_table = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ";
+
+/*
+ * Reverse the first `read_len` characters of InBuff in place, translating
+ * every character through LRM__converting_char_table on the way.
+ *
+ * Characters are converted to unsigned char before they are used as table
+ * indices; with a plain (possibly signed) char, a byte >= 0x80 produced a
+ * negative index and read memory in front of the table.
+ */
+void LRMreverse_read(char * InBuff, int read_len){
+	const char * table = LRM__converting_char_table;
+	int left, right;
+
+	for (left = 0, right = read_len - 1; left < right; left++, right--)
+	{
+		unsigned char left_ch = (unsigned char)InBuff[left];
+		unsigned char right_ch = (unsigned char)InBuff[right];
+
+		InBuff[right] = table[left_ch];
+		InBuff[left] = table[right_ch];
+	}
+
+	// odd length: the middle character stays in place but is still translated
+	if(left == right)
+		InBuff[left] = table[(unsigned char)InBuff[left]];
+}
+
+
+
+/* Read the next character from the input through the reader that matches its file type. */
+int LRMgeinput_getc(LRMgene_input_t * input){
+	if(input -> file_type != LRMGENE_INPUT_GZIP_FASTQ)
+		return fgetc((FILE*)input -> input_fp);
+
+	return seekgz_next_char((seekable_zfile_t*)input -> input_fp);
+}
+
+/*
+ * Read one line from the input into linebuffer (NUL-terminated, without the
+ * newline).  At most buf_len-1 characters are stored; the remainder of an
+ * over-long line is consumed and dropped.
+ *
+ * Returns the number of characters stored.
+ *
+ * The character is kept in an int: storing it in a char made the
+ * end-of-input test unreliable (never true where char is unsigned, and true
+ * for the data byte 0xFF where char is signed).  EOF is negative, so any
+ * negative return of LRMgeinput_getc() ends the line.
+ */
+int LRMgeinput_readline(LRMgene_input_t * input, int buf_len, char * linebuffer){
+	int stored = 0;
+
+	for(;;){
+		int ch = LRMgeinput_getc(input);
+		if (ch == '\n' || ch < 0) break;
+		if(stored < buf_len-1)
+			linebuffer[stored++] = (char)ch;
+	}
+	linebuffer[stored] = 0;
+	return stored;
+}
+
+
+#define SKIP_LINE { nch=' '; while(nch != EOF && nch != '\n') nch = LRMgeinput_getc(input); }
+
+/*
+ * Read the next four-line record (name, sequence, separator, quality).
+ *
+ * read_name      receives the name without the leading '@', cut at the first
+ *                space or tab; pass NULL to skip the name line.
+ * read_string    receives the sequence line.
+ * quality_string receives the quality line; pass NULL to skip it.
+ *
+ * Returns the number of sequence characters stored, or -1 when the input
+ * ends or the name line is empty.
+ *
+ * nch is an int because SKIP_LINE compares it with EOF; a char cannot hold
+ * EOF reliably.
+ */
+int LRMgeinput_next_read(LRMgene_input_t * input, char * read_name, char * read_string, char * quality_string){
+	int nch;
+	int ret;
+
+	// READ NAME
+	if (read_name == NULL){
+		SKIP_LINE;
+		if(nch == EOF) return -1;
+	} else {
+		int ATnch = LRMgeinput_getc(input);
+		assert(ATnch == '@' || ATnch <0);
+		if(ATnch < 0) return -1;
+
+		int retv = LRMgeinput_readline(input, LRMMAX_READ_NAME_LEN, read_name);
+		if(retv<=0) return -1;
+
+		// keep only the first word of the name line
+		int cursor = 1;
+		while(read_name[cursor])
+		{
+			if(read_name[cursor] == ' ' || read_name[cursor] == '\t')
+			{
+				read_name [cursor] = 0;
+				break;
+			}
+			cursor++;
+		}
+	}
+
+	// READ LINE
+	ret = LRMgeinput_readline(input, LRMMAX_READ_LENGTH, read_string);
+
+	// SKIP "+": pass any empty lines, then drop the rest of the separator line
+	do{
+		nch = LRMgeinput_getc(input);
+	} while( nch == '\n' );
+	SKIP_LINE;
+
+	// QUAL LINE
+	if (quality_string)
+		LRMgeinput_readline(input, LRMMAX_READ_LENGTH, quality_string);
+	else
+		SKIP_LINE;
+
+	return ret;
+}
+
+/*
+ * Take the next read from the shared input under context->input_lock.
+ *
+ * Returns 0 and fills *read_len and *read_no_in_chunk when a read was
+ * obtained.  Returns 1 and sets *read_no_in_chunk to -1 when the chunk
+ * already holds LRMREADS_PER_CHUNK reads or when no read could be fetched;
+ * in the latter case context->input_exhausted is set.
+ */
+int LRMfetch_next_read(LRMcontext_t * context, LRMthread_context_t * thread_context, unsigned int *read_len, char * read_name, char * read_text, char * qual_text, unsigned int * read_no_in_chunk){
+	int read_no = -1;
+	int rlen = 0;
+
+	LRMthread_lock(&context -> input_lock);
+	if(context -> processed_reads_in_chunk < LRMREADS_PER_CHUNK)
+	{
+		rlen = LRMgeinput_next_read(&context -> input_file, read_name, read_text, qual_text);
+		if(rlen <= 0)
+			context -> input_exhausted = 1;
+		else
+			read_no = context -> processed_reads_in_chunk ++;
+	}
+	LRMthread_lockrelease(&context -> input_lock);
+
+	// read_no is only non-negative when a read of positive length was fetched
+	if(read_no < 0)
+	{
+		*read_no_in_chunk = -1;
+		return 1;
+	}
+
+	*read_no_in_chunk = read_no;
+	*read_len = rlen;
+	return 0;
+}
+
+/* Reverse the first read_len characters of InBuff in place; NULL or empty strings are left alone. */
+void LRMreverse_quality(char * InBuff, int read_len){
+	int lo, hi;
+
+	if(InBuff == NULL || InBuff[0] == 0) return;
+
+	lo = 0;
+	hi = read_len - 1;
+	while(lo < hi)
+	{
+		char swapped = InBuff[hi];
+		InBuff[hi] = InBuff[lo];
+		InBuff[lo] = swapped;
+		lo++;
+		hi--;
+	}
+}
+
+/* Shift every character of the NUL-terminated quality string down by (64-33), in place. */
+void LRMquality_64_to_33(char *qs){
+	char * cursor;
+
+	for(cursor = qs; *cursor; cursor++)
+		*cursor -= (64-33);
+}
+
+/*
+ * Convert a textual CIGAR string into binary operations, each stored as
+ * (length << 4) | op_code with op_code being the position of the operation
+ * letter in "MIDNSHP=X".  Only the first eight letters are searched, so 'X'
+ * and any unrecognised letter both yield code 8.
+ *
+ * cigar_int      receives the encoded operations.
+ * mapped_length  receives the summed length of the M, N and D operations.
+ *
+ * Returns the number of operations written; 0 for the "*" CIGAR.  Encoding
+ * stops after LRMMAX_CIGAR_OPTS_IN_BAM_READ operations.
+ *
+ * isdigit() is given an unsigned char value (a negative char is undefined
+ * behaviour) and the length is shifted as an unsigned value to avoid signed
+ * overflow on very long operations.
+ */
+int LRMgenerate_bam_record_encode_cigar(int * cigar_int, char * cigar, int * mapped_length){
+	static const char op_chars[] = "MIDNSHP=X";
+	int op_length = 0;
+	int cigar_cursor = 0, num_opt = 0;
+	(*mapped_length) = 0;
+
+	if(cigar[0]=='*') return 0;
+
+	while(1)
+	{
+		char nch = cigar[cigar_cursor++];
+		if(!nch) break;
+
+		if(isdigit((unsigned char)nch))
+		{
+			op_length = op_length*10+(nch-'0');
+			continue;
+		}
+
+		int op_code = 0;
+		if(nch == 'M' || nch == 'N' || nch == 'D') (*mapped_length) += op_length;
+		while(op_code < 8 && op_chars[op_code] != nch) op_code++;
+
+		cigar_int[num_opt ++] = (int)(((unsigned int)op_length << 4) | (unsigned int)op_code);
+		op_length = 0;
+		if(num_opt>=LRMMAX_CIGAR_OPTS_IN_BAM_READ) break;
+	}
+
+	return num_opt;
+}
+
+/*
+ * Compute the bin number of the half-open region [beg, end): the smallest
+ * bin, among levels of 2^14, 2^17, 2^20, 2^23 and 2^26 bases, that contains
+ * the whole region; 0 when no such bin exists.
+ */
+int LRMreg2bin(int beg, int end){
+	int last = end - 1;
+	int shift, level_bits;
+
+	// level_bits gives the offset of the level: ((1 << level_bits) - 1) / 7
+	for(shift = 14, level_bits = 15; shift <= 26; shift += 3, level_bits -= 3)
+	{
+		if((beg >> shift) == (last >> shift))
+			return ((1 << level_bits) - 1) / 7 + (beg >> shift);
+	}
+	return 0;
+}
+
+/*
+ * Write the packed sequence followed by the qualities of a read into bin.
+ *
+ * Bases are stored two per byte (first base in the high four bits) using the
+ * code table "=ACMGRSVTWYHKDBN"; a character that is not in the table gets
+ * code 15.  Each quality character is stored as its value minus 33.
+ *
+ * Returns the number of bytes written.
+ */
+int LRMgenerate_bam_record_encode_read_qual(char * bin, char * read, char * qual, int rlen){
+	static const char nibble_codes[] = "=ACMGRSVTWYHKDBN";
+	int out = 0;
+	int idx;
+
+	for(idx = 0; idx < rlen; idx++){
+		int code = 0;
+		while(code < 15 && nibble_codes[code] != read[idx]) code++;
+
+		if(idx % 2 == 0){
+			// first base of a pair: start a fresh byte with the high nibble
+			bin[out] = 0;
+			bin[out] |= code << 4;
+		}else{
+			// second base: fill the low nibble and move on
+			bin[out] |= code;
+			out++;
+		}
+	}
+	// an odd number of bases leaves the last byte half filled
+	if(rlen %2) out++;
+
+	for(idx = 0; idx < rlen; idx++)
+		bin[out+idx] = qual[idx]-33;
+
+	return out+rlen;
+}
+
+int LRMgenerate_bam_record(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, char * target_ptr, int flags, unsigned int chro_pos,char * chro_name, int map_quality, char * cigar, int mis_matched, int NHval, int HIval){ /* Serialise one alignment of the current read as a binary BAM record at
+ * target_ptr and return the number of bytes written, including the
+ * leading 4-byte block length.  Fixed fields are written at these offsets:
+ *   +0 block length   +4 reference id   +8 position
+ *   +12 bin<<16 | mapq<<8 | name length (including NUL)
+ *   +16 flags<<16 | number of CIGAR operations
+ *   +20 read length   +24 mate reference id (-1)   +28 mate position (-1)
+ *   +32 template length (0)   +36 read name
+ * followed by the CIGAR operations, the packed sequence and qualities,
+ * and the three integer tags NM, NH and HI.
+ * NOTE(review): all multi-byte fields are memcpy'd from host integers, so
+ * the output is only little-endian on a little-endian host. */
+ int mapped_length = 0;
+
+ int chro_number = HashTableGet(context->sam_bam_chromosome_table, chro_name) - NULL; // the table stores (index + 1) disguised as a pointer
+ chro_number -=1; // back to the 0-based index; a lookup that returns NULL therefore gives -1
+ memcpy(target_ptr+4, &chro_number, 4);
+
+ memcpy(target_ptr+8, &chro_pos, 4);
+
+ int name_len = strlen(iteration_context -> read_name)+1; // includes the terminating NUL; NOTE(review): OR'ed into the low 8 bits at offset 12, so a value of 256 would spill into the mapq bits
+
+ memcpy(target_ptr+20, &iteration_context->read_length, 4);
+ int next_non = -1;
+ memcpy(target_ptr+24, &next_non, 4);
+ memcpy(target_ptr+28, &next_non, 4);
+ memset(target_ptr+32, 0, 4); // TLEN
+ memcpy(target_ptr+36, iteration_context->read_name, name_len);
+ int bin_ptr = 36+name_len; // write position inside the record from here on
+
+ int cigar_opts = LRMgenerate_bam_record_encode_cigar((int *)(target_ptr + bin_ptr), cigar, &mapped_length); // NOTE(review): writes ints through a possibly unaligned pointer
+ int flag_nc = flags << 16 | cigar_opts;
+ memcpy(target_ptr+16, &flag_nc, 4);
+ bin_ptr += cigar_opts*4;
+
+
+ int bin_mq_nl = (LRMreg2bin(chro_pos, chro_pos+mapped_length)<<16) | map_quality << 8 | name_len;
+ memcpy(target_ptr+12, &bin_mq_nl, 4);
+
+ bin_ptr += LRMgenerate_bam_record_encode_read_qual(target_ptr + bin_ptr, iteration_context->read_text, iteration_context -> qual_text, iteration_context->read_length);
+
+ memcpy(target_ptr+bin_ptr, "NM",2); // each tag: 2-byte name, type 'i', 4-byte value = 7 bytes
+ target_ptr[bin_ptr+2]='i';
+ memcpy(target_ptr+bin_ptr+3, &mis_matched, 4);
+ bin_ptr += 7;
+
+ memcpy(target_ptr+bin_ptr, "NH",2);
+ target_ptr[bin_ptr+2]='i';
+ memcpy(target_ptr+bin_ptr+3, &NHval, 4);
+ bin_ptr += 7;
+
+ memcpy(target_ptr+bin_ptr, "HI",2);
+ target_ptr[bin_ptr+2]='i';
+ memcpy(target_ptr+bin_ptr+3, &HIval, 4);
+ bin_ptr += 7;
+
+ bin_ptr -= 4;
+ memcpy(target_ptr, &bin_ptr, 4); // the block length does not include its own 4 bytes
+ bin_ptr += 4;
+ return bin_ptr;
+}
+
+/* Return the zlib CRC-32 of the len bytes starting at dat. */
+unsigned int LRM_CRC32(char * dat, int len){
+	unsigned int seed = crc32(0, NULL, 0);
+
+	return crc32(seed, (unsigned char *)dat, len);
+}
+
+int LRMwrite_chunk_compress_bam_block(LRMcontext_t * context, LRMthread_context_t * thread_context, char * bin_buf, char * bam_buf, int bin_len){ /* Compress bin_len bytes from bin_buf into one gzip block with a
+ * "BC" extra field, written to bam_buf: 18 header bytes, the raw-deflate
+ * data, then the CRC-32 and the length of the uncompressed data.
+ * Returns the total number of bytes placed in bam_buf.
+ * bam_buf must have room for 18 + 66600 + 8 bytes in the worst case.
+ * NOTE(review): the return values of deflateInit2() and deflate() are not
+ * checked, and the 2- and 4-byte fields are memcpy'd from host integers
+ * (correct on little-endian hosts only). */
+ char * compressed_buff = bam_buf + 18; // compressed data starts right after the 18-byte header
+
+ int compressed_size ;
+ unsigned int CRC32;
+ thread_context -> bam_file_output_stream.avail_out = 66600;
+ thread_context -> bam_file_output_stream.avail_in = bin_len;
+ CRC32 = LRM_CRC32(bin_buf , bin_len); // checksum of the uncompressed data
+
+ int Z_DEFAULT_MEM_LEVEL = 8;
+ thread_context -> bam_file_output_stream.zalloc = Z_NULL;
+ thread_context -> bam_file_output_stream.zfree = Z_NULL;
+ thread_context -> bam_file_output_stream.opaque = Z_NULL;
+
+ deflateInit2(&thread_context -> bam_file_output_stream, LRMBAM_COMPRESS_LEVEL, Z_DEFLATED,
+ LRMGZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+
+ thread_context -> bam_file_output_stream.next_in = (unsigned char*) bin_buf;
+ thread_context -> bam_file_output_stream.next_out = (unsigned char*) compressed_buff;
+
+ deflate(&thread_context -> bam_file_output_stream, Z_FINISH); // the whole input is compressed in a single call
+ deflateEnd(&thread_context -> bam_file_output_stream);
+
+ compressed_size = 66600 -thread_context -> bam_file_output_stream.avail_out;
+
+ //LRMprintf("COMPRESS: %d -> %d\n", bin_len, compressed_size);
+
+ bam_buf[0]=31; // gzip magic, byte 1
+ bam_buf[1]=(char)139; // gzip magic, byte 2
+ bam_buf[2]=8; // compression method: deflate
+ bam_buf[3]=4; // flags: extra field present
+ memset(bam_buf+4, 0, 5); // modification time (4 bytes) and extra flags
+ bam_buf[9] = 0xff; // OS
+
+ int tmpi = 6;
+ memcpy(bam_buf+10, &tmpi, 2); // XLEN: length of the extra field
+ bam_buf[12]=66; // SI1
+ bam_buf[13]=67; // SI2
+ tmpi = 2;
+ memcpy(bam_buf+14, &tmpi, 2); // SLEN: length of the subfield payload
+ tmpi = compressed_size + 19 + 6;
+ memcpy(bam_buf+16, &tmpi, 2); // BSIZE: total block size (compressed_size + 26) minus 1
+
+ memcpy(bam_buf+18+compressed_size, &CRC32, 4);
+ memcpy(bam_buf+18+compressed_size+4, &bin_len, 4);
+ return compressed_size + 26;
+}
+
+/* Key comparison callback for string-keyed hash tables: strcmp() on the two keys. */
+int LRMhash_strcmp(const void * s1, const void * s2){
+	const char * left = s1;
+	const char * right = s2;
+
+	return strcmp(left, right);
+}
+/*
+ * Hash callback for string-keyed hash tables.  For every byte of the
+ * NUL-terminated key the running value is XOR-ed with itself shifted left
+ * by six bits and then with the byte.
+ */
+unsigned long LRMhash_strhash(const void * sv){
+	const unsigned char * cursor = (const unsigned char *)sv;
+	unsigned long hash = 0;
+
+	for(; *cursor; cursor++)
+		hash = (hash ^ (hash << 6)) ^ *cursor;
+
+	return hash;
+}
+
+/*
+ * Read one line from fp into buf (NUL-terminated).  Carriage returns are
+ * dropped, the newline is not stored, and at most buflen-1 characters are
+ * kept; the rest of an over-long line is consumed and discarded.
+ *
+ * Returns the number of characters stored (0 at end of file).
+ *
+ * The loop is driven by the return value of fgetc() rather than feof():
+ * feof() only becomes true after a read has failed, so the old loop stored
+ * the EOF marker as an extra character when the last line had no newline.
+ */
+int LRMreadline(FILE * fp, char * buf, int buflen){
+	int stored = 0;
+
+	for(;;){
+		int c = fgetc(fp);
+		if(c == EOF || c == '\n')
+			break;
+		if(c == '\r')
+			continue;
+		if(stored < buflen-1)
+			buf[stored++] = (char)c;
+	}
+	buf[stored] = 0;
+	return stored;
+}
+
+/*
+ * Load the chromosome table of the index from "<index_prefix>.reads".
+ *
+ * Every usable line has the form "<end offset>\t<chromosome name>".  For each
+ * chromosome the function records
+ *   - name -> (index + 1)        in sam_bam_chromosome_table,
+ *   - name                       in sam_bam_chromosome_list,
+ *   - name -> chromosome length  in chromosome_size_table, computed as
+ *     offset - previous offset + 16 - 2 * padding,
+ *   - the end offset             in chromosome_size_list.
+ * Integers are stored in the containers as (NULL + value).
+ *
+ * Returns 0 on success, 1 when the file name does not fit, the file cannot
+ * be opened or memory is exhausted.
+ *
+ * Changes from the previous version: the file name is built with snprintf()
+ * (a long prefix overflowed the buffer), lines without a tab are skipped
+ * (their name buffer used to be registered uninitialised), and the name
+ * allocation is checked.
+ */
+int LRMload_offsets(LRMcontext_t * context){
+	char fn[LRMMAX_FILENAME_LENGTH];
+	FILE * fp;
+	int padding = 0;
+	unsigned int last_end = 0;
+
+	LRMgehash_load_option(context -> index_prefix, LRMSUBREAD_INDEX_OPTION_INDEX_PADDING , &padding);
+	assert(padding>16);
+
+	int fn_len = snprintf(fn, sizeof fn, "%s.reads", context->index_prefix);
+	if(fn_len < 0 || fn_len >= (int)sizeof fn){
+		LRMprintf("index prefix is too long :%s\n", context->index_prefix);
+		return 1;
+	}
+	fp = fopen(fn, "r");
+
+	if(!fp){
+		LRMprintf("file not found :%s\n", fn);
+		return 1;
+	}
+
+	context -> current_index_padding = padding;
+	int chromosome_no = 0;
+	while (!feof(fp))
+	{
+		int i=0, step = 0, j=0;
+		unsigned int this_offset =0;
+
+		// fn is reused as the line buffer from here on
+		LRMreadline(fp, fn, LRMMAX_FILENAME_LENGTH-1);
+		if (strlen(fn)<2)continue;
+
+		char * chro_name = malloc(LRMMAX_CHROMOSOME_NAME_LEN);
+		if(!chro_name){
+			fclose(fp);
+			return 1;
+		}
+		chro_name[0] = 0;
+
+		while (fn[i])
+		{
+			if (fn[i] == '\t') {
+				// the text in front of the tab is the offset
+				fn[i]=0;
+				this_offset = (unsigned int)atoll(fn);
+				step = 1;
+			}else if (step) {
+				// everything after the tab is the name (truncated to fit)
+				if(j<LRMMAX_CHROMOSOME_NAME_LEN-1){
+					chro_name[j++]=fn[i];
+					chro_name[j]=0;
+				}
+			}
+			i++;
+		}
+
+		if(!step){
+			// no tab on this line: nothing usable to register
+			free(chro_name);
+			continue;
+		}
+
+		HashTablePut(context->sam_bam_chromosome_table, chro_name, NULL + chromosome_no + 1);
+		ArrayListPush(context->sam_bam_chromosome_list, chro_name);
+
+		HashTablePut(context->chromosome_size_table, chro_name, NULL + (this_offset - last_end + 16 - context->current_index_padding * 2));
+		ArrayListPush(context->chromosome_size_list, NULL + this_offset);
+
+		chromosome_no++;
+		last_end = this_offset;
+	}
+
+	fclose(fp);
+	return 0;
+}
+
+#define LRMcheck_resize_buff if(new_need >= thread_context -> out_buff_capacity){\
+ thread_context -> out_buff_capacity = max(thread_context -> out_buff_capacity*2, new_need );\
+ thread_context -> out_SAMBAM_buffer=realloc(thread_context -> out_SAMBAM_buffer, thread_context -> out_buff_capacity);\
+ }\
+/* LRMcheck_resize_buff: grows thread_context->out_SAMBAM_buffer to at least new_need bytes (doubling when that is larger). It expects `new_need` and `thread_context` to be in scope at the point of use. NOTE(review): the realloc() result is not checked. */
+#define LRMBAM_COMPRESS_BLOCK 63000 // number of uncompressed bytes put into one compressed block
+#define LRMBAM_COMPRESS_TRIGGER 53000 // buffered bytes above which LRMwrite_chunk_check_buffer_write() flushes without being forced
+
+/*
+ * Write the SAM or BAM header once per run; later calls return immediately.
+ *
+ * The text header consists of the @HD line, one @SQ line per chromosome and
+ * the @PG line carrying the user's command line.  For SAM output the lines
+ * go straight to the output file.  For BAM output they are collected in a
+ * temporary buffer behind the "BAM\1" magic and the text length, followed by
+ * the binary reference list (count, then name length, name and sequence
+ * length for each chromosome), and the buffer is compressed and written by
+ * LRMwrite_chunk_check_buffer_write().
+ *
+ * Changes from the previous version:
+ *  - the line buffer is sized from user_command_line, so a long command line
+ *    can no longer overflow it, and lines are formatted with snprintf();
+ *  - the BAM-only binary section is no longer executed for SAM output;
+ *  - allocations are checked and the released buffer pointer is cleared.
+ */
+void LRMsambam_write_header(LRMcontext_t * context, LRMthread_context_t * thread_context){
+	if(context -> sam_bam_file_header_written) return;
+
+	int n_chros = context -> sam_bam_chromosome_list->numOfElements;
+	int chro_no;
+
+	// large enough for the fixed @PG text plus the whole command line, or an @SQ line
+	size_t header_line_size = sizeof(context -> user_command_line) + LRMMAX_CHROMOSOME_NAME_LEN + 128;
+	char * header_line = malloc(header_line_size);
+
+	thread_context -> out_SAMBAM_buffer = malloc(3000000);
+	thread_context -> out_buff_capacity = 3000000;
+	thread_context -> out_buff_used = 0;
+
+	if(header_line == NULL || thread_context -> out_SAMBAM_buffer == NULL){
+		LRMprintf("ERROR: out of memory when writing the SAM/BAM header\n");
+		free(header_line);
+		free(thread_context -> out_SAMBAM_buffer);
+		thread_context -> out_SAMBAM_buffer = NULL;
+		return;
+	}
+
+	if(!context -> is_SAM_output){
+		// magic now, text length (bytes 4..7) once the text is complete
+		memcpy(thread_context -> out_SAMBAM_buffer,"BAM\1",4);
+		thread_context -> out_buff_used = 8;
+	}
+
+	// chro_no == -1: @HD line; 0 .. n_chros-1: @SQ lines; n_chros: @PG line
+	for(chro_no = -1; chro_no <= n_chros; chro_no++){
+		int wrlen;
+		if(chro_no == -1){
+			wrlen = snprintf(header_line, header_line_size, "@HD\tVN:1.0\tSO:unsorted\n");
+		}else if(chro_no < n_chros){
+			char * chro_name = ArrayListGet(context -> sam_bam_chromosome_list, chro_no);
+			int chro_length = HashTableGet(context -> chromosome_size_table, chro_name) - NULL;
+
+			wrlen = snprintf(header_line, header_line_size, "@SQ\tSN:%s\tLN:%d\n",chro_name,chro_length);
+		}else{
+			wrlen = snprintf(header_line, header_line_size, "@PG\tID:subread-long-read-mapping\tPN:subread-long-read-mapping\tCL:%s\n", context -> user_command_line);
+		}
+
+		// snprintf reports the untruncated length; never emit more than was stored
+		if(wrlen < 0) wrlen = 0;
+		if((size_t)wrlen >= header_line_size) wrlen = (int)header_line_size - 1;
+
+		if(context -> is_SAM_output){
+			fwrite( header_line, 1, wrlen, context -> sam_bam_file);
+		}else{
+			int new_need = thread_context -> out_buff_used + wrlen +1;
+			LRMcheck_resize_buff;
+			memcpy(thread_context -> out_SAMBAM_buffer + thread_context -> out_buff_used,header_line,wrlen);
+			thread_context -> out_buff_used += wrlen;
+		}
+	}
+	free(header_line);
+
+	if(!context -> is_SAM_output){
+		int new_need = thread_context -> out_buff_used + 10;
+		LRMcheck_resize_buff;
+
+		int BAM_text_len = thread_context -> out_buff_used-8;
+		memcpy(thread_context -> out_SAMBAM_buffer+4, &BAM_text_len, 4);
+		memcpy(thread_context -> out_SAMBAM_buffer + thread_context -> out_buff_used, &n_chros , 4);
+		thread_context -> out_buff_used +=4;
+
+		for(chro_no = 0; chro_no < n_chros ; chro_no++){
+			char * chro_name = ArrayListGet(context -> sam_bam_chromosome_list, chro_no);
+			int chro_namelen = strlen(chro_name)+1;
+			int new_need = thread_context -> out_buff_used + chro_namelen + 9;
+			LRMcheck_resize_buff;
+
+			memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, &chro_namelen, 4);
+			thread_context -> out_buff_used +=4;
+			memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, chro_name, chro_namelen);
+			thread_context -> out_buff_used += chro_namelen;
+			int chro_length = HashTableGet(context -> chromosome_size_table, chro_name) - NULL;
+			memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, &chro_length, 4);
+			thread_context -> out_buff_used +=4;
+		}
+
+		LRMwrite_chunk_check_buffer_write(context, thread_context, 1);
+	}
+
+	context -> sam_bam_file_header_written = 1;
+	free(thread_context -> out_SAMBAM_buffer);
+	thread_context -> out_SAMBAM_buffer = NULL;	// do not leave a dangling pointer behind
+}
+
+/*
+ * Build the file tail once: an empty input compressed into
+ * context->bam_file_tail_binary, with its size kept in
+ * context->bam_file_tail_length.  Does nothing when the tail already exists.
+ */
+void LRMbam_generate_tail_binary(LRMcontext_t * context, LRMthread_context_t * thread_context){
+	if(context -> bam_file_tail_length <= 0)
+		context -> bam_file_tail_length = LRMwrite_chunk_compress_bam_block(context, thread_context, "", context -> bam_file_tail_binary , 0);
+}
+
+/*
+ * Append one alignment of the current read to the thread's output buffer, as
+ * a SAM text line or a binary BAM record depending on
+ * context->is_SAM_output, then let LRMwrite_chunk_check_buffer_write()
+ * decide whether the buffer must be flushed.
+ *
+ * Returns 0 on success, -1 when the buffer cannot be enlarged (nothing is
+ * appended in that case).
+ *
+ * Changes from the previous version: the new capacity accounts for the bytes
+ * already buffered (it used to be derived from the size of this record
+ * alone, so the record could be written past the end of the allocation),
+ * the size estimate reserves two bytes per CIGAR character because a binary
+ * operation takes four bytes and can come from two characters, and the
+ * realloc() result is checked.
+ *
+ * NOTE(review): when is_Phred_64 is set the quality string is shifted in
+ * place on every call, so emitting the same read more than once would shift
+ * it repeatedly -- confirm how callers handle multiple alignments.
+ */
+int LRMwrite_chunk_add_buffered_output(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int flags, char * chro_name, unsigned int chro_pos, int map_quality, char * cigar, int mis_matched, int multi_mapping_locations, int this_mapping_locations){
+	char * target_ptr;
+	int cigar_len = strlen(cigar);
+	int rname_len = strlen(iteration_context->read_name);
+	int read_binlen, actural_target_len;
+
+	// generous upper bound for one record in either output format
+	read_binlen = 2*cigar_len+rname_len+2.5*iteration_context->read_length+500;
+
+	int needed = thread_context->out_buff_used + read_binlen;
+	if(needed >= thread_context -> out_buff_capacity){
+		int grown = thread_context -> out_buff_capacity*1.3;
+		int new_capacity = (grown > needed) ? grown : needed + 1;
+		char * new_buffer = realloc(thread_context -> out_SAMBAM_buffer, new_capacity);
+		if(new_buffer == NULL)
+			return -1;
+		thread_context -> out_SAMBAM_buffer = new_buffer;
+		thread_context -> out_buff_capacity = new_capacity;
+	}
+
+	target_ptr = thread_context -> out_SAMBAM_buffer + thread_context->out_buff_used;
+
+	if(context -> is_Phred_64) LRMquality_64_to_33(iteration_context->qual_text) ;
+
+	if(context->is_SAM_output){
+		actural_target_len = sprintf(target_ptr,"%s\t%d\t%s\t%u\t%d\t%s\t*\t0\t0\t%s\t%s\tNM:i:%d\tNH:i:%d\tHI:i:%d\n", iteration_context -> read_name, flags, chro_name, chro_pos + 1, map_quality, cigar, iteration_context->read_text, iteration_context->qual_text, mis_matched, multi_mapping_locations,this_mapping_locations);
+	}else{
+		actural_target_len = LRMgenerate_bam_record(context, thread_context, iteration_context, target_ptr, flags, chro_pos, chro_name, map_quality, cigar, mis_matched, multi_mapping_locations,this_mapping_locations);
+	}
+	thread_context -> out_buff_used+=actural_target_len;
+	LRMwrite_chunk_check_buffer_write(context, thread_context, 0);
+	return 0;
+}
+/*
+ * Flush the thread's output buffer to the output file when force_write is
+ * set or more than LRMBAM_COMPRESS_TRIGGER bytes are buffered.
+ *
+ * For BAM output the buffer is first compressed in place, one block of up to
+ * LRMBAM_COMPRESS_BLOCK bytes at a time: each block is compressed into a
+ * local array and copied back to the front of the buffer.  The compressed
+ * data never overtakes the unread input because a block that grows makes the
+ * unread tail move up by the same amount first.
+ *
+ * Returns 0 on success, -1 when the buffer cannot be enlarged for a block
+ * that grew during compression.
+ *
+ * Change from the previous version: after the tail has been moved for a
+ * block that grew, the read cursor and the used length are advanced by the
+ * same amount.  They used to be left untouched, so the following block was
+ * read from the wrong offset and the end of the data was lost.
+ */
+int LRMwrite_chunk_check_buffer_write(LRMcontext_t * context, LRMthread_context_t * thread_context, int force_write){
+	if(!force_write && thread_context -> out_buff_used <= LRMBAM_COMPRESS_TRIGGER)
+		return 0;
+
+	if(!context->is_SAM_output){
+		int write_cursor = 0;		// start of the next uncompressed block
+		int compressed_cursor = 0;	// end of the compressed data written so far
+
+		while(write_cursor < thread_context -> out_buff_used){
+			char compressed_data [66666];
+			int bin_len = min(LRMBAM_COMPRESS_BLOCK,thread_context -> out_buff_used - write_cursor);
+			int compressed_len = LRMwrite_chunk_compress_bam_block(context, thread_context, thread_context -> out_SAMBAM_buffer + write_cursor, compressed_data, bin_len);
+			int next_cursor = write_cursor + bin_len;
+
+			if(compressed_len > bin_len){ // very unlikely, except for a short final block
+				int growth = compressed_len - bin_len;
+				int new_needed_size = thread_context -> out_buff_used + growth;
+				if(new_needed_size > thread_context -> out_buff_capacity){
+					char * new_buffer = realloc(thread_context -> out_SAMBAM_buffer, new_needed_size);
+					if(new_buffer == NULL)
+						return -1;
+					thread_context -> out_SAMBAM_buffer = new_buffer;
+					thread_context -> out_buff_capacity = new_needed_size;
+				}
+				// make room by moving the still-uncompressed tail up
+				memmove(thread_context -> out_SAMBAM_buffer + next_cursor + growth,
+					thread_context -> out_SAMBAM_buffer + next_cursor,
+					thread_context -> out_buff_used - next_cursor);
+				thread_context -> out_buff_used += growth;
+				next_cursor += growth;
+			}
+
+			memcpy(thread_context -> out_SAMBAM_buffer + compressed_cursor, compressed_data, compressed_len);
+			compressed_cursor += compressed_len;
+			write_cursor = next_cursor;
+		}
+		thread_context -> out_buff_used = compressed_cursor;
+	}
+
+	// the output file is shared by all threads
+	LRMthread_lock(&context->sam_bam_file_lock);
+	fwrite(thread_context -> out_SAMBAM_buffer,1, thread_context -> out_buff_used, context -> sam_bam_file);
+	LRMthread_lockrelease(&context->sam_bam_file_lock);
+	thread_context -> out_buff_used = 0;
+
+	return 0;
+}
+
+
+
+/*
+ * Format a linear position as "<chromosome>:<1-based position>" into txt;
+ * "*" is written when the position cannot be located.
+ */
+void LRMpos2txt(LRMcontext_t * context, unsigned int linear, char * txt){
+	char * chro_name = NULL;
+	int pos;
+
+	if(LRMlocate_gene_position(context, linear, &chro_name, &pos) != 0 || chro_name == NULL){
+		strcpy(txt, "*");
+		return;
+	}
+	sprintf(txt, "%s:%d", chro_name, pos+1);
+}
+
+int LRMlocate_gene_position(LRMcontext_t * context, unsigned int linear, char ** chro_name, int * pos) { /* Translate a linear position into a chromosome
+ * name and a position inside that chromosome.
+ * Returns 0 on success; 1 when the position lies within the first
+ * current_index_padding bases of the chromosome's range (*pos has been
+ * written, *chro_name has not); -1 when the position is not below any
+ * stored end offset. */
+ int n = 0;
+ int total_offsets = context -> chromosome_size_list -> numOfElements;
+ int jump_ns = total_offsets/4;
+
+ //LRMprintf("LINEAR = %u\n", linear);
+
+ while (jump_ns > 5) // coarse search: advance n in strides that shrink by a factor of four
+ {
+ while(n+jump_ns < total_offsets && ArrayListGet(context->chromosome_size_list, n + jump_ns) - NULL <= linear) // list entries are offsets stored as (NULL + offset)
+ n+=jump_ns;
+ jump_ns /=4;
+ }
+
+ for (; n < total_offsets; n++) // linear scan for the first end offset above `linear`
+ {
+ //LRMprintf("TESTING %u\n",(ArrayListGet(context->chromosome_size_list, n) - NULL) );
+ if ( (ArrayListGet(context->chromosome_size_list, n) - NULL) > linear)
+ {
+ *pos = linear;
+ if(n>0)(*pos) -= (ArrayListGet(context->chromosome_size_list, n-1)-NULL); // make it relative to the end of the previous chromosome
+ if( (*pos) < context -> current_index_padding ) return 1;
+ else (*pos) -= context -> current_index_padding;
+ (*chro_name) = ArrayListGet(context -> sam_bam_chromosome_list, n);
+
+ return 0;
+ }
+ }
+ return -1;
+}
+
+int LRMlocate_chro_length(LRMcontext_t * context, unsigned int linear, char ** chro_name, long long * end_pos){ /* Find the chromosome whose range contains
+ * the linear position and report its name and end.
+ * Returns 0 on success, -1 when the position is not below any stored
+ * end offset.
+ * NOTE(review): *end_pos receives the stored end offset of the
+ * chromosome (minus the padding), which is a linear coordinate rather
+ * than a length -- confirm what callers expect. */
+ int n = 0;
+ int total_offsets = context -> chromosome_size_list -> numOfElements;
+ int jump_ns = total_offsets/4;
+
+ while (jump_ns > 5) // coarse search: advance n in strides that shrink by a factor of four
+ {
+ while(n+jump_ns < total_offsets && ArrayListGet(context->chromosome_size_list, n + jump_ns) - NULL <= linear)
+ n+=jump_ns;
+ jump_ns /=4;
+ }
+
+ for (; n < total_offsets; n++) // linear scan for the first end offset above `linear`
+ {
+ //LRMprintf("TESTING %u\n",(ArrayListGet(context->chromosome_size_list, n) - NULL) );
+ if ( (ArrayListGet(context->chromosome_size_list, n) - NULL) > linear) {
+ (* chro_name) = ArrayListGet(context -> sam_bam_chromosome_list, n);
+ (* end_pos) = ArrayListGet(context->chromosome_size_list, n)-NULL;
+ if((* end_pos) > context -> current_index_padding) (* end_pos) -= context -> current_index_padding ; // remove the padding only when the offset is larger than it
+ return 0;
+ }
+ }
+ return -1;
+}
diff --git a/src/longread-mapping/LRMfile-io.h b/src/longread-mapping/LRMfile-io.h
new file mode 100644
index 0000000..71cfcaa
--- /dev/null
+++ b/src/longread-mapping/LRMfile-io.h
@@ -0,0 +1,55 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#ifndef __LRMFILEIO_H_
+#define __LRMFILEIO_H_
+
+// Input type codes; judging by the names: plain FASTQ and gzip-compressed FASTQ.
+#define LRMGENE_INPUT_FASTQ 1
+#define LRMGENE_INPUT_GZIP_FASTQ 51
+// NOTE(review): LRMMAX_READ_LENGTH is not defined in this header; presumably it
+// comes from LRMconfig.h, which is only included below (harmless for a macro).
+#define LRMMAX_LINE_LENGTH (LRMMAX_READ_LENGTH + 10)
+
+#include "LRMconfig.h"
+
+int LRMhash_strcmp(const void * s1, const void * s2);
+unsigned long LRMhash_strhash(const void * sv);
+
+int LRMgenekey2int(char key []);
+
+int LRMgeinput_open(const char * filename, LRMgene_input_t * input);
+
+void LRMsambam_write_header(LRMcontext_t * context, LRMthread_context_t * thread_context);
+void LRMbam_generate_tail_binary(LRMcontext_t * context, LRMthread_context_t * thread_context);
+
+int LRMwrite_chunk_check_buffer_write(LRMcontext_t * context, LRMthread_context_t * thread_context, int force_write);
+// return 0 if successful
+int LRMfetch_next_read(LRMcontext_t * context, LRMthread_context_t * thread_context, unsigned int *read_len, char * read_name, char * read_text, char * qual_text, unsigned int * read_no_in_chunk);
+// Return the length of this read or -1 if EOF.
+int LRMgeinput_next_read(LRMgene_input_t * input, char * read_name, char * read_string, char * quality_string);
+void LRMgeinput_close(LRMgene_input_t * input);
+
+// returns read length
+int LRMgeinput_readline(LRMgene_input_t * input, int buf_len, char * linebuffer) ;
+void LRMreverse_read(char * ReadString, int Length);
+void LRMreverse_quality(char * QualtyString, int Length);
+int LRMload_offsets(LRMcontext_t * context);
+// Maps a linear index offset to a chromosome name and a position.
+// Returns 0 on success, 1 when the offset lies inside the index padding of the
+// entry, and -1 when `linear` is not below any entry of the chromosome list.
+int LRMlocate_gene_position(LRMcontext_t * context, unsigned int linear, char ** chro_name, int * pos);
+
+void LRMpos2txt(LRMcontext_t * context, unsigned int linear, char * txt);
+// Finds the chromosome entry containing `linear`; returns 0 on success, -1 when
+// none is found. The last argument is called `end_pos` in the definition.
+int LRMlocate_chro_length(LRMcontext_t * context, unsigned int linear, char ** chro_name, long long * chro_len);
+int LRMwrite_chunk_add_buffered_output(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int flags, char * chro_name, unsigned int chro_pos, int map_quality, char * cigar, int mis_matched, int multi_mapping_locations, int this_mapping_loca);
+#endif
diff --git a/src/longread-mapping/LRMhelper.c b/src/longread-mapping/LRMhelper.c
new file mode 100644
index 0000000..9735c8c
--- /dev/null
+++ b/src/longread-mapping/LRMhelper.c
@@ -0,0 +1,77 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#include<stdio.h>
+#include<string.h>
+#include"LRMconfig.h"
+#include"LRMhelper.h"
+
+/*
+ * Selection sort over the index range [start, start + items) of an opaque array.
+ *
+ *  arr       opaque container, only ever passed through to the callbacks
+ *  compare   returns > 0 when element l must be placed after element r
+ *  exchange  swaps elements l and r in place
+ */
+void basic_sort_run(void * arr, int start, int items, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r)){
+    int stop = start + items;
+    int slot;
+
+    for (slot = start; slot < stop - 1; slot++) {
+        int smallest = slot;
+        int probe = slot + 1;
+
+        /* Locate the minimum of the still-unsorted tail. */
+        while (probe < stop) {
+            if (compare(arr, smallest, probe) > 0)
+                smallest = probe;
+            probe++;
+        }
+
+        if (smallest != slot)
+            exchange(arr, slot, smallest);
+    }
+}
+
+/* Selection-sorts the first `items` elements of `arr` (see basic_sort_run). */
+void basic_sort(void * arr, int items, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r)){
+    const int first_index = 0;
+
+    basic_sort_run(arr, first_index, items, compare, exchange);
+}
+
+
+/*
+ * Recursive merge sort over [start, start + items).
+ * Runs of 11 elements or fewer are handed to basic_sort_run; longer runs are
+ * split in two, sorted recursively and combined by the caller-supplied
+ * merge(arr, start, left_length, right_length).
+ */
+void merge_sort_run(void * arr, int start, int items, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge(void * arr, int start, int items, int items2))
+{
+    int left_len, right_len;
+
+    if (items <= 11) {
+        basic_sort_run(arr, start, items, compare, exchange);
+        return;
+    }
+
+    left_len = items / 2;
+    right_len = items - left_len;
+
+    merge_sort_run(arr, start, left_len, compare, exchange, merge);
+    merge_sort_run(arr, start + left_len, right_len, compare, exchange, merge);
+    merge(arr, start, left_len, right_len);
+}
+/* Merge-sorts the first `arr_size` elements of `arr` (see merge_sort_run). */
+void merge_sort(void * arr, int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge(void * arr, int start, int items, int items2))
+{
+    const int first_index = 0;
+
+    merge_sort_run(arr, first_index, arr_size, compare, exchange, merge);
+}
+
+/*
+ * Binary search in an ascending array.
+ *
+ * Returns the index of an element equal to `x` if one exists, otherwise the
+ * index of the largest element smaller than `x`, or -1 when every element is
+ * greater than `x` (or the array is empty).
+ *
+ * Fix: when the search narrowed down to a single element equal to `x`, the
+ * previous strict `<` test returned the index before it (e.g. {1,3,5}, x = 5
+ * gave 1), although an equal element found mid-search was returned directly.
+ */
+int binary_search_less_equal(unsigned int * arr, int size, unsigned int x){
+    int L = 0;
+    int R = size - 1;
+
+    while (L < R) {
+        int m = L + (R - L) / 2;    /* avoids overflow of L + R */
+        if (arr[m] < x) L = m + 1;
+        else if (arr[m] > x) R = m - 1;
+        else return m;
+    }
+
+    if (R < 0) return R;
+    /* If L > R here, arr[R] was already proven < x; if L == R it is untested. */
+    return (arr[R] <= x) ? R : R - 1;
+}
diff --git a/src/longread-mapping/LRMhelper.h b/src/longread-mapping/LRMhelper.h
new file mode 100644
index 0000000..6a9e955
--- /dev/null
+++ b/src/longread-mapping/LRMhelper.h
@@ -0,0 +1,28 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#ifndef __LRM_HELPER_H_
+#define __LRM_HELPER_H_
+
+// Selection sort of the first `items` elements of an opaque array.
+// compare(arr, l, r) > 0 means element l must come after element r;
+// exchange(arr, l, r) swaps the two elements.
+void basic_sort(void * arr, int items, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r));
+
+// Merge sort of the first `arr_size` elements. Runs of 11 elements or fewer are
+// selection-sorted; merge(arr, start, items, items2) must combine the two
+// adjacent sorted runs [start, start+items) and [start+items, start+items+items2).
+void merge_sort(void * arr, int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge(void * arr, int start, int items, int items2));
+
+// Binary search in an ascending array of `size` unsigned ints; returns an index
+// into `arr`, or -1 when no suitable element exists.
+int binary_search_less_equal(unsigned int * arr, int size, unsigned int x);
+
+#endif
diff --git a/src/longread-mapping/LRMsorted-hashtable.c b/src/longread-mapping/LRMsorted-hashtable.c
new file mode 100644
index 0000000..adccfec
--- /dev/null
+++ b/src/longread-mapping/LRMsorted-hashtable.c
@@ -0,0 +1,534 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "LRMsorted-hashtable.h"
+
+#ifndef MACOS
+#ifndef FREEBSD
+#include <malloc.h>
+#endif
+#endif
+
+#include<math.h>
+#include "LRMfile-io.h"
+
+#define _gehash_hash(k) ((unsigned int)(k))
+#define WITHOUT_CLUSTER_ORDERING 0
+
+/*
+ * Returns the bucket of `the_table` that `key` hashes to: the key, taken as an
+ * unsigned int, modulo the number of buckets.
+ */
+struct LRMgehash_bucket * LRM_gehash_get_bucket(LRMgehash_t * the_table, LRMgehash_key_t key)
+{
+    const int slot = _gehash_hash(key) % the_table -> buckets_number;
+
+    return the_table -> buckets + slot;
+}
+
+
+
+// Vote-table slots are addressed by (position / INDEL_SEGMENT_SIZE), so that
+// positions differing by a few bases fall into the same or a neighbouring slot.
+#define INDEL_SEGMENT_SIZE 5
+
+#define _index_vote(key) (((unsigned int)(key))%LRMGENE_VOTE_TABLE_SIZE)
+#define _index_vote_tol(key) (((unsigned int)(key)/INDEL_SEGMENT_SIZE)%LRMGENE_VOTE_TABLE_SIZE)
+
+
+#define is_quality_subread(scr) ((scr)>15?1:0)
+
+
+// Looks `raw_key` up in the index and casts one vote per stored location.
+//
+// For every index entry whose in-bucket key matches, the candidate read start
+// is kv = (stored value - offset). If the vote table already holds a record on
+// the same strand whose position is within +/- indel_tolerance of kv, that
+// record gets one more vote and its coverage end / indel recorder are updated;
+// otherwise a new record is appended to the slot of kv (if the slot has room).
+//
+// Returns 0 when the bucket has no items or the binary-search bounds cross;
+// returns 1 in every other case. `read_len` is not used by this function.
+size_t LRMgehash_go_q(LRMgehash_t * the_table, LRMgehash_key_t raw_key, int offset, int read_len, int is_reversed, LRMgene_vote_t * vote, int indel_tolerance, int subread_number){
+ //LRMprintf("Q=%u, OFFSET=%d, B=%u ~ %u\n", raw_key, offset, low_border, high_border);
+
+ // VER_1
+ // VER_2
+
+ struct LRMgehash_bucket * current_bucket;
+ int i = 0, items;
+
+ short *current_keys;//, *endp12;
+ // The bucket number is raw_key % buckets_number (see LRM_gehash_get_bucket);
+ // the quotient, truncated to short, is the key stored inside the bucket.
+ short key = raw_key / the_table->buckets_number;
+
+ current_bucket = LRM_gehash_get_bucket (the_table, raw_key);
+ items = current_bucket -> current_items;
+ current_keys = current_bucket -> new_item_keys;
+
+ if(!items) return 0;
+
+ int imin=0, imax=items;
+ int last_accepted_index = 0;
+
+ // Binary search for any entry equal to `key` (keys are expected to be sorted).
+ while( imin < items )
+ {
+ last_accepted_index=(imin+imax)/2;
+ short current_key = current_keys[last_accepted_index];
+ if(current_key>key)
+ {
+ imax = last_accepted_index - 1;
+ }
+ else if(current_key<key)
+ {
+ imin = last_accepted_index + 1;
+ }
+ else
+ break;
+
+ if(imax<imin)
+ return 0;
+
+ }
+
+ // Rewind to the first entry of the run of equal keys.
+ while(last_accepted_index){
+ if(current_keys[last_accepted_index-1] == key) last_accepted_index-=1;
+ else break;
+ }
+
+ // Largest slot displacement to probe: indel_tolerance rounded up to a
+ // multiple of INDEL_SEGMENT_SIZE (at least one segment).
+ int ii_end = INDEL_SEGMENT_SIZE;
+ if(indel_tolerance>5) ii_end=(indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
+
+ for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++) {
+
+ if(0){
+ LRMprintf("KV=%u, Offset=%d\n", current_bucket->item_values[last_accepted_index], offset);
+ }
+
+ unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
+ int iix, offsetX2, offsetX, datalen, datalen2;
+ // offsetX2 / datalen2 / dat2 always describe the home slot of kv;
+ // offsetX / datalen / dat describe the slot currently being probed.
+ offsetX2 = offsetX = _index_vote_tol(kv);
+ datalen = datalen2 = vote -> items[offsetX2];
+ unsigned int * dat2, *dat;
+ dat = dat2 = vote -> pos[offsetX2];
+
+ //LRMprintf("You can find KV at %u\n", kv);
+
+ // Probe displacements 0, +5, -5, +10, -10, ... up to ii_end.
+ for(iix = 0; iix<=ii_end; iix = iix>0?-iix:(-iix+INDEL_SEGMENT_SIZE))
+ {
+ if(iix)
+ {
+ offsetX = _index_vote_tol(kv+iix);
+ datalen = vote -> items[offsetX];
+ dat = vote -> pos[offsetX];
+ }
+
+
+ if(!datalen)continue;
+
+ for (i=0;i<datalen;i++)
+ {
+ int di = dat[i];
+ int dist0 = kv-di;
+ if( dist0 >= -indel_tolerance && dist0 <= indel_tolerance )
+ {
+ // Only records on the same strand as this subread can be extended.
+ if(is_reversed == (0!=(vote -> masks[offsetX][i]&LRMIS_NEGATIVE_STRAND)))
+ {
+
+ unsigned char test_max = (vote->votes[offsetX][i]);
+ test_max += 1;
+ vote -> votes[offsetX][i] = test_max;
+
+ if (offset +16 > vote->coverage_end [offsetX][i])
+ vote->coverage_end [offsetX][i] = offset+16;
+
+ int toli = vote -> toli[offsetX][i];
+
+ // A displacement different from the last one seen opens a new
+ // 3-item section in the indel recorder; otherwise the end
+ // subread of the current section is advanced.
+ if (dist0 != vote->current_indel_cursor[offsetX][i])
+ {
+ toli +=3;
+ if (toli < LRMMAX_INDEL_SECTIONS*3)
+ {
+ vote -> toli[offsetX][i] = toli;
+ vote -> indel_recorder[offsetX][i][toli] = subread_number+1;
+ vote -> indel_recorder[offsetX][i][toli+1] = subread_number+1;
+ vote -> indel_recorder[offsetX][i][toli+2] = dist0;
+ //LRMprintf("subread=#%d ,TOLI=%d, DIST0=%d, POS=%u \n", subread_number, toli, dist0, kv);
+
+ if(toli < LRMMAX_INDEL_SECTIONS*3-3) vote -> indel_recorder[offsetX][i][toli+3]=0;
+ }
+ vote->current_indel_cursor [offsetX][i] = (char)dist0;
+ } else vote -> indel_recorder[offsetX][i][toli+1] = subread_number+1;
+
+ // Sentinel: marks "an existing record was extended" and ends the scan.
+ i = 9999999;
+ }
+ }
+ }
+ if (i>=9999999){
+ break;
+ }
+
+ }
+
+ // No existing record matched: start a new one in the home slot of kv,
+ // provided that slot still has room.
+ if (i < 9999999)
+ {
+ if (datalen2<LRMGENE_VOTE_SPACE)
+ {
+ vote -> items[offsetX2] ++;
+ dat2[datalen2] = kv;
+ vote -> masks[offsetX2][datalen2]=(is_reversed?LRMIS_NEGATIVE_STRAND:0);
+ vote -> votes[offsetX2][datalen2]=1;
+ vote -> toli[offsetX2][datalen2]=0;
+
+ // data structure of recorder:
+ // {unsigned char subread_start; unsigned char subread_end, char indel_offset_from_start}
+ // All subread numbers are added with 1 for not being 0.
+
+ vote -> indel_recorder[offsetX2][datalen2][0] = vote -> indel_recorder[offsetX2][datalen2][1] = subread_number+1;
+ vote -> indel_recorder[offsetX2][datalen2][2] = 0;
+ vote -> indel_recorder[offsetX2][datalen2][3] = 0;
+ vote -> current_indel_cursor [offsetX2][datalen2] = 0;
+ vote -> coverage_start [offsetX2][datalen2] = offset;
+ vote -> coverage_end [offsetX2][datalen2] = offset+16;
+ //LRMprintf("subread=#%d ,NEW RECORD =%u\n", subread_number, kv);
+ }
+ }
+ // Clear the sentinel before handling the next index entry.
+ else i=0;
+ }
+ return 1;
+}
+
+
+/*
+ * Copies an indel recorder (a zero-terminated sequence of 3-item sections)
+ * from `src` to `dst`, stopping at the first section whose leading item is 0
+ * or once the index reaches 3*LRMMAX_INDEL_TOLERANCE-2, then writes the
+ * terminating 0.
+ *
+ * Returns the third item of the last section copied, or 0 if none was copied.
+ */
+short LRMindel_recorder_copy(unsigned short *dst, unsigned short * src)
+{
+    short last_offset = 0;
+    int cursor;
+
+    for (cursor = 0; src[cursor] && (cursor < 3*LRMMAX_INDEL_TOLERANCE-2); cursor += 3) {
+        dst[cursor] = src[cursor];
+        dst[cursor + 1] = src[cursor + 1];
+        dst[cursor + 2] = src[cursor + 2];
+        last_offset = dst[cursor + 2];
+    }
+
+    dst[cursor] = 0;
+    return last_offset;
+}
+
+// Data structure of the dumped index file:
+// {
+// size_t current_items;
+// size_t buckets_number;
+// struct
+// {
+// size_t current_items;
+// size_t space_size;
+// gehash_key_t item_keys [current_items];
+// gehash_data_t item_values [current_items]
+// } [buckets_number];
+// }
+//
+
+/*
+ * Reads one native-endian `int` from `fp` and returns it as unsigned int.
+ * A short read trips the assertion; when assertions are compiled out (NDEBUG)
+ * the function returns 0 instead of an uninitialised value.
+ */
+unsigned int load_int32(FILE * fp)
+{
+    int ret = 0;
+    size_t items_read = fread(&ret, sizeof(int), 1, fp);
+
+    assert(items_read > 0);
+    (void) items_read;    /* silences "unused" warnings under NDEBUG */
+    return ret;
+}
+
+/*
+ * Reads one native-endian `long long` from `fp`.
+ * A short read trips the assertion; when assertions are compiled out (NDEBUG)
+ * the function returns 0 instead of an uninitialised value.
+ */
+long long int load_int64(FILE * fp)
+{
+    long long int ret = 0;
+    size_t items_read = fread(&ret, sizeof(long long int), 1, fp);
+
+    assert(items_read > 0);
+    (void) items_read;    /* silences "unused" warnings under NDEBUG */
+    return ret;
+}
+
+
+/*
+ * Reads one option value from the header of the first index block file
+ * ("<fname>.00.b.tab", falling back to "<fname>.00.c.tab").
+ *
+ * The header is the 8-byte magic "2subindx" followed by records of
+ * {short key; short length; payload}, terminated by a zero key.
+ *
+ *  option_no  key of the option to look for
+ *  result     receives the payload (zero-extended to an int) when found
+ *
+ * Returns -1 when neither file can be opened, 1 when the option was found and
+ * 0 otherwise (wrong magic, option absent, or truncated header).
+ *
+ * Compared with the earlier version every read is checked, so a truncated file
+ * no longer leaves option_key/option_length uninitialised, a payload longer
+ * than an int can no longer overflow *result, the file name is formatted with
+ * a bounded snprintf, and all eight magic bytes are compared (as
+ * LRMgehash_load does).
+ */
+int LRMgehash_load_option(const char fname [], int option_no, int * result){
+    char tabname[LRMMAX_FILENAME_LENGTH];
+    char magic_chars[8];
+    int found = 0;
+    FILE * fp;
+
+    snprintf(tabname, sizeof tabname, "%s.00.b.tab", fname);
+    fp = fopen(tabname, "rb");
+    if(fp == NULL){
+        snprintf(tabname, sizeof tabname, "%s.00.c.tab", fname);
+        fp = fopen(tabname, "rb");
+    }
+    if(fp == NULL) return -1;
+
+    if(fread(magic_chars, 1, 8, fp) == 8 && memcmp(magic_chars, "2subindx", 8) == 0) {
+        while(1) {
+            short option_key, option_length;
+
+            if(fread(&option_key, 2, 1, fp) < 1) break;
+            if(!option_key) break;
+
+            if(fread(&option_length, 2, 1, fp) < 1) break;
+            if(option_length < 0) break;    /* corrupt record */
+
+            if(option_key == option_no){
+                size_t payload = (size_t) option_length;
+                size_t usable = payload < sizeof *result ? payload : sizeof *result;
+
+                *result = 0;
+                if(usable > 0 && fread(result, usable, 1, fp) < 1) break;
+                /* Skip whatever part of the payload does not fit into an int. */
+                if(payload > usable && fseek(fp, (long)(payload - usable), SEEK_CUR) != 0) break;
+                found = 1;
+            }
+            else if(fseek(fp, option_length, SEEK_CUR) != 0) break;
+        }
+    }
+
+    fclose(fp);
+    return found;
+}
+
+/*
+ * Loads one index block file into `the_table`.
+ *
+ * File layout as read here: 8-byte magic "2subindx"; option records
+ * {short key; short length; payload} terminated by a zero key; a 64-bit item
+ * count; a 32-bit bucket count; then for every bucket two 32-bit counters
+ * (item count, stored capacity) followed by the key array (shorts) and the
+ * value array; finally one byte for `is_small_table`.
+ *
+ * Returns 0 on success and 1 on any failure (file missing, unrecognised or
+ * truncated content, out of memory).
+ *
+ * Changes against the earlier version:
+ *  - the file is closed on every exit path (it used to leak on errors);
+ *  - magic, option and trailer reads are checked, and malformed input is
+ *    reported through the return value instead of assert();
+ *  - empty buckets get NULL arrays instead of malloc(0), whose NULL result
+ *    used to be reported as "out of memory" on some platforms.
+ *
+ * NOTE(review): on failure, buckets allocated so far are left in place exactly
+ * as before; confirm what callers do with a half-loaded table.
+ * NOTE(review): index_gap and padding are filled by 2-byte reads regardless of
+ * the declared option length; confirm their field widths.
+ */
+int LRMgehash_load(LRMgehash_t * the_table, const char fname [])
+{
+    int i, read_length;
+    int status = 1;
+    char magic_chars[8];
+    FILE * fp;
+
+    the_table -> index_gap = 0;
+
+    fp = fopen(fname, "rb");
+    if (!fp)
+    {
+        LRMprintf ("Table file '%s' is not found.\n", fname);
+        return 1;
+    }
+
+    if(fread(magic_chars,1,8,fp) < 8 || memcmp(magic_chars, "2subindx", 8) != 0)
+    {
+        LRMputs("ERROR: the index format is unrecognizable.");
+        goto finish;
+    }
+    the_table -> version_number = LRMSUBINDEX_VER2;
+
+    while(1)
+    {
+        short option_key, option_length;
+
+        if(fread(&option_key, 2, 1, fp) < 1) goto truncated;
+        if(!option_key) break;
+
+        if(fread(&option_length, 2, 1, fp) < 1) goto truncated;
+
+        if(option_key == LRMSUBREAD_INDEX_OPTION_INDEX_GAP)
+        {
+            if(fread(&(the_table -> index_gap),2,1,fp) < 1) goto truncated;
+        }
+        else if (option_key == LRMSUBREAD_INDEX_OPTION_INDEX_PADDING)
+        {
+            if(fread(&(the_table -> padding),2,1,fp) < 1) goto truncated;
+        }
+        else if(fseek(fp, option_length, SEEK_CUR) != 0) goto truncated;
+    }
+
+    /* An index without a gap option cannot be used. */
+    if(!the_table -> index_gap)
+    {
+        LRMputs("ERROR: the index format is unrecognizable.");
+        goto finish;
+    }
+
+    the_table -> current_items = load_int64(fp);
+    if(the_table -> current_items < 1 || the_table -> current_items > 0xffffffffllu){
+        LRMputs("ERROR: the index format is unrecognizable.");
+        goto finish;
+    }
+
+    the_table -> buckets_number = load_int32(fp);
+    if(the_table -> buckets_number < 1){
+        LRMputs("ERROR: the index format is unrecognizable.");
+        goto finish;
+    }
+    the_table -> buckets = (struct LRMgehash_bucket * )malloc(sizeof(struct LRMgehash_bucket) * the_table -> buckets_number);
+    if(!the_table -> buckets)
+    {
+        LRMputs("Error: out of memory");
+        goto finish;
+    }
+
+    for (i=0; i<the_table -> buckets_number; i++)
+    {
+        struct LRMgehash_bucket * current_bucket = &(the_table -> buckets[i]);
+
+        current_bucket -> current_items = load_int32(fp);
+        /* The capacity stored in the file is read and discarded; the bucket is sized to fit exactly. */
+        current_bucket -> space_size = load_int32(fp);
+        current_bucket -> space_size = current_bucket -> current_items;
+        current_bucket -> new_item_keys = NULL;
+        current_bucket -> item_values = NULL;
+
+        if(current_bucket -> current_items == 0) continue;
+
+        current_bucket -> new_item_keys = (short *) malloc ( sizeof(short) * current_bucket -> space_size);
+        current_bucket -> item_values = (LRMgehash_data_t *) malloc ( sizeof(LRMgehash_data_t) * current_bucket -> space_size);
+
+        if(!(current_bucket -> new_item_keys && current_bucket -> item_values))
+        {
+            LRMputs("Error: out of memory");
+            goto finish;
+        }
+
+        read_length = fread(current_bucket -> new_item_keys, sizeof(short), current_bucket -> current_items, fp);
+        if(read_length < current_bucket -> current_items){
+            LRMprintf("ERROR: the index is incomplete : %d < %u.\n",read_length, current_bucket -> current_items);
+            goto finish;
+        }
+        read_length = fread(current_bucket -> item_values, sizeof(LRMgehash_data_t), current_bucket -> current_items, fp);
+        if(read_length < current_bucket -> current_items){
+            LRMprintf("ERROR: the index value is incomplete : %d < %u.\n",read_length, current_bucket -> current_items);
+            goto finish;
+        }
+    }
+
+    if(fread(&(the_table -> is_small_table), sizeof(char), 1, fp) < 1) goto truncated;
+
+    status = 0;
+    goto finish;
+
+truncated:
+    LRMputs("ERROR: the index is incomplete.");
+finish:
+    fclose(fp);
+    return status;
+}
+
+/*
+ * Debug helper: renders the 32 bits of `kk`, least significant bit first, as
+ * '0'/'1' characters with a space after every pair of bits.
+ * `obuf` must have room for 48 bytes (32 digits, 15 spaces, terminating NUL).
+ *
+ * The bit mask is built from an unsigned constant: `1 << 31` on a signed int
+ * is undefined behaviour.
+ */
+void LRMtest2key(unsigned int kk, char * obuf){
+    int bit, out = 0;
+
+    for(bit = 0; bit < 32; bit++){
+        obuf[out++] = (kk & (1u << bit)) ? '1' : '0';
+        if(bit % 2 == 1 && bit < 31) obuf[out++] = ' ';
+    }
+    obuf[out] = 0;
+}
+
+/*
+ * Counts how many of the sixteen 2-bit fields differ between `k1` and `k2`.
+ */
+int LRMtest2key_dist(unsigned int k1, unsigned int k2){
+    unsigned int diff = k1 ^ k2;
+    int mismatches = 0;
+    int field;
+
+    /* A 2-bit field differs exactly when its XOR is non-zero. */
+    for(field = 0; field < 16; field++, diff >>= 2)
+        if(diff & 3) mismatches++;
+
+    return mismatches;
+}
+
+// Votes for `key` and for every key that differs from it in 1..max_MM of its
+// sixteen 2-bit fields, by calling LRMgehash_go_q once per variant.
+//
+// Returns the sum of the LRMgehash_go_q results. max_MM must not exceed 3
+// (asserted). `context` and `thread_context` are not used here;
+// `iteration_context` is only referenced by the disabled debug block.
+size_t LRMgehash_go_tolerance(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMgehash_t * the_table, LRMgehash_key_t key, int offset, int read_len, int is_reversed, LRMgene_vote_t * vote, int indel_tolerance, int subread_number, int max_MM){
+
+ int ret = 0, error_bases ;
+
+ // Exact key first.
+ ret+=LRMgehash_go_q(the_table, key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number);
+ assert(max_MM <=3);
+ for (error_bases=1; error_bases <= max_MM; error_bases++)
+ {
+ int i, j;
+ // error_pos_stack[k] is the field index (0..15) of the k-th substituted base;
+ // the entries are kept strictly increasing.
+ char error_pos_stack[10]; // max error bases = 10;
+ LRMgehash_key_t mutation_key;
+
+ for(i=0; i<error_bases; i++)
+ error_pos_stack [i] = i;
+ while (1)
+ {
+
+ // mutation_stack[k] in 0..2 selects which of the three alternative values
+ // replaces the k-th chosen base; it is advanced like a base-3 counter.
+ char mutation_stack[10];
+ memset(mutation_stack, 0 , error_bases);
+ while(1)
+ {
+ int base_to_change=-1;
+ mutation_key = key;
+
+ for (j = 0; j < error_bases; j++)
+ {
+ base_to_change = error_pos_stack[j];
+ int old_value = (key >> 2*base_to_change) & 3;
+ int new_index = mutation_stack[j];
+
+ // Map 0..2 onto the three values different from old_value.
+ int new_value;
+ if(old_value <= new_index) new_value = 1+ new_index;
+ else new_value = new_index;
+
+ // NOTE(review): for base_to_change == 15 these shifts move a signed int
+ // constant into the sign bit; an unsigned constant would be safer.
+ mutation_key = mutation_key & ~(0x3 << (2*base_to_change));
+ mutation_key = mutation_key | (new_value << (2*base_to_change));
+ }
+
+ if(key != mutation_key ){
+ int dret=LRMgehash_go_q(the_table, mutation_key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number);
+ ret += dret;
+ // Debug trace for one hard-coded read name; permanently disabled by `0 &&`.
+ if(0 && FIXLENstrcmp("@39076b1f-df29-4487-be51-4c30bf6c1cc4_Basecall_Alignment_template", iteration_context->read_name)==0){
+ char bin_mutation_key[53], bin_key[53];
+ LRMtest2key(mutation_key, bin_mutation_key);
+ LRMtest2key(key, bin_key);
+ LRMprintf("GO_TOLE_TEST: %s + %d DIST = %d HITS = %d :\nNEWKY: %s\nORGKY: %s\n", iteration_context->read_name , offset, LRMtest2key_dist(mutation_key,key), dret, bin_mutation_key, bin_key);
+ }
+ }
+ // increase one in the mutation_stack
+ mutation_stack[error_bases-1]++;
+ // Propagate carries towards index 0; a carry out of index 0 means all
+ // substitutions for the current positions have been tried.
+ for(i = error_bases-1; i>=0; i--){
+ if( mutation_stack[i]>2 ) {
+ if(i == 0)break;
+ else{
+ mutation_stack[i] = 0;
+ mutation_stack[i-1]++;
+ }
+ }
+ }
+ if(mutation_stack[0]>2)break;
+ }
+
+
+ // Advance to the next combination of positions: bump the right-most position
+ // that can still move and pack the following ones directly after it.
+ int is_end = 1;
+ for (i = error_bases-1; i>=0; i--)
+ if (error_pos_stack[i] < 16 - (error_bases - i)){
+ error_pos_stack[i] ++;
+ for (j = i+1; j<error_bases; j++)
+ error_pos_stack[j] = error_pos_stack[i] + (j-i);
+ is_end = 0;
+ break;
+ }
+
+ if(is_end) break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Releases everything owned by `the_table`: the key and value arrays of every
+ * bucket with a positive space_size, then the bucket array itself. The item
+ * and bucket counters are reset to 0; the table struct itself is not freed.
+ */
+void LRMgehash_destory(LRMgehash_t * the_table)
+{
+    int slot;
+
+    for (slot = 0; slot < the_table -> buckets_number; slot++) {
+        struct LRMgehash_bucket * bucket = the_table -> buckets + slot;
+
+        if (bucket -> space_size <= 0)
+            continue;
+
+        free (bucket -> new_item_keys);
+        free (bucket -> item_values);
+    }
+
+    free (the_table -> buckets);
+
+    the_table -> current_items = 0;
+    the_table -> buckets_number = 0;
+}
+
+/*
+ * Debug dump of the vote table of `iteration_context`: prints one line for
+ * every record with at least `min_votes` votes, followed by the sections of
+ * its indel recorder (start subread ~ end subread : offset).
+ */
+void LRMprint_v(LRMcontext_t * context, LRMread_iteration_context_t * iteration_context, int min_votes){
+    LRMgene_vote_t * table = &iteration_context-> vote_table;
+    int slot, entry, rec;
+
+    LRMprintf(" ==== VOTING TABLE ========================= \n");
+    for(slot = 0; slot < LRMGENE_VOTE_TABLE_SIZE; slot++){
+        for(entry = 0; entry < table -> items[slot]; entry++){
+            char postxt[100];
+            unsigned int this_pos = table -> pos[slot][entry];
+            int votes = table -> votes[slot][entry];
+            int conf_start, conf_end;
+
+            if(votes < min_votes) continue;
+
+            conf_start = table -> coverage_start[slot][entry];
+            conf_end = table -> coverage_end [slot][entry];
+            LRMpos2txt(context, this_pos, postxt);
+            LRMprintf(" %d (%s) %3d - %3d : %16s (U %10u) ", votes,(table -> masks[slot][entry]&LRMIS_NEGATIVE_STRAND)?"NEG":"POS", conf_start, conf_end, postxt, this_pos);
+
+            /* The recorder is a zero-terminated list of 3-item sections. */
+            for(rec = 0; table->indel_recorder[slot][entry][rec]; rec += 3){
+                LRMprintf(" %d ~ %d : %d ", table->indel_recorder[slot][entry][rec] , table->indel_recorder[slot][entry][rec+1], table->indel_recorder[slot][entry][rec+2]);
+            }
+            LRMprintf("\n");
+        }
+    }
+    LRMprintf(" =========================================== \n\n");
+}
diff --git a/src/longread-mapping/LRMsorted-hashtable.h b/src/longread-mapping/LRMsorted-hashtable.h
new file mode 100644
index 0000000..cb4b631
--- /dev/null
+++ b/src/longread-mapping/LRMsorted-hashtable.h
@@ -0,0 +1,44 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+
+#ifndef _SORTED_HASHTABLE_H_
+#define _SORTED_HASHTABLE_H_
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>	/* memset(), used by LRMinit_gene_vote below */
+#include "LRMconfig.h"
+
+#define LRMGEHASH_DEFAULT_SIZE 2000000000
+#define LRMGEHASH_BUCKET_LENGTH 2291
+
+// Empties a vote table by zeroing the per-slot item counters.
+#define LRMinit_gene_vote(a) {memset((a)->items, 0, LRMGENE_VOTE_TABLE_SIZE*sizeof( *((a)->items))); }
+
+// Casts votes for every index entry stored under `key`.
+size_t LRMgehash_go_q(LRMgehash_t * the_table, LRMgehash_key_t key, int offset, int read_len, int is_reversed, LRMgene_vote_t * vote, int indel_tolerance, int subread_number);
+// As LRMgehash_go_q, additionally trying every key with up to max_MM (<= 3) substituted bases.
+size_t LRMgehash_go_tolerance(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMgehash_t * the_table, LRMgehash_key_t key, int offset, int read_len, int is_reversed, LRMgene_vote_t * vote, int indel_tolerance, int subread_number, int max_MM);
+
+// Frees the bucket arrays of the table (the struct itself is not freed).
+void LRMgehash_destory(LRMgehash_t * the_table);
+void LRMfinalise_vote(LRMgene_vote_t * vote);
+// Loads one index block file; returns 0 on success, 1 on failure.
+int LRMgehash_load(LRMgehash_t * the_table, const char fname []);
+void LRMassign_best_vote(LRMgene_vote_t * vote, int i, int j);
+
+void LRMselect_best_vote(LRMgene_vote_t * vote);
+void LRMgehash_sort(LRMgehash_t * the_table);
+// Reads one header option from "<fname>.00.b.tab" (or ".00.c.tab");
+// returns -1 if no file can be opened, 1 if the option was found, 0 otherwise.
+int LRMgehash_load_option(const char fname [], int option_no, int * result);
+// Debug dump of the vote table for records with at least min_votes votes.
+void LRMprint_v(LRMcontext_t * context, LRMread_iteration_context_t * iteration_context, int min_votes);
+
+#endif
diff --git a/src/longread-mapping/Makefile b/src/longread-mapping/Makefile
new file mode 100644
index 0000000..47c1b79
--- /dev/null
+++ b/src/longread-mapping/Makefile
@@ -0,0 +1,23 @@
+include ../makefile.version
+include make.version
+
+OPT_LEVEL = 3
+CCFLAGS = -mtune=core2 ${MACOS} -O${OPT_LEVEL} -Wall -DMAKE_FOR_EXON -D MAKE_STANDALONE -D_FILE_OFFSET_BITS=64 -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\"
+LDFLAGS = -lpthread -lz -lm -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE # -DREPORT_ALL_THE_BEST
+CC_EXEC = gcc
+CC = ${CC_EXEC} ${CCFLAGS} -fmessage-length=0 -ggdb
+
+ALL_LIBS=LRMsorted-hashtable LRMbase-index LRMchro-event LRMhelper seek-zlib LRMfile-io hashtable
+
+ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
+ALL_H=$(addsuffix .h, ${ALL_LIBS})
+ALL_C=$(addsuffix .c, ${ALL_LIBS})
+
+# Neither target produces a file of its own name; declaring them phony keeps a
+# stray file called "all" or "clean" from suppressing the recipe.
+.PHONY: all clean
+
+# Build the long-read mapper and move the binary one directory up.
+all: sublong
+	mv sublong ../
+
+clean:
+	rm -f *.o
+
+# The objects are built by make's implicit .c -> .o rule using ${CC}.
+sublong: longread-mapping.c ${ALL_OBJECTS}
+	${CC} -o sublong longread-mapping.c ${ALL_OBJECTS} ${LDFLAGS}
diff --git a/src/longread-mapping/hashtable.c b/src/longread-mapping/hashtable.c
new file mode 100644
index 0000000..7573264
--- /dev/null
+++ b/src/longread-mapping/hashtable.c
@@ -0,0 +1,797 @@
+/*--------------------------------------------------------------------------*\
+ * -----===== HashTable =====-----
+ *
+ * Author: Keith Pomakis (pomakis at pobox.com)
+ * Date: August, 1998
+ * Released to the public domain.
+ *
+ *--------------------------------------------------------------------------
+ * $Id: hashtable.c,v 1.4 2017/05/25 04:42:23 cvs Exp $
+\*--------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
+#include "LRMconfig.h"
+#include "hashtable.h"
+
+static int pointercmp(const void *pointer1, const void *pointer2);
+static unsigned long pointerHashFunction(const void *pointer);
+static int isProbablePrime(long number);
+static long calculateIdealNumOfBuckets(HashTable *hashTable);
+
+
+/*
+ * Creates an empty ArrayList with room for `init_capacity` elements
+ * (capacities below 1 are raised to 1 so the element array is never
+ * zero-sized).
+ *
+ * Returns the new list, or NULL when memory cannot be allocated; the earlier
+ * version dereferenced the unchecked malloc() result.
+ * Release the list with ArrayListDestroy().
+ */
+ArrayList * ArrayListCreate(int init_capacity){
+    ArrayList * ret;
+
+    if(init_capacity < 1) init_capacity = 1;
+
+    ret = calloc(1, sizeof(ArrayList));
+    if(ret == NULL) return NULL;
+
+    ret -> elementList = malloc(sizeof(void *)*init_capacity);
+    if(ret -> elementList == NULL){
+        free(ret);
+        return NULL;
+    }
+    ret -> capacityOfElements = init_capacity;
+    return ret;
+}
+
+/*
+ * Destroys `list`: when an element deallocator has been registered it is
+ * called on every stored element, then the element array and the list itself
+ * are freed.
+ */
+void ArrayListDestroy(ArrayList * list){
+    if(list -> elemDeallocator){
+        long idx = 0;
+
+        while(idx < list -> numOfElements){
+            list -> elemDeallocator(list -> elementList[idx]);
+            idx++;
+        }
+    }
+
+    free(list -> elementList);
+    free(list);
+}
+
+/* Returns element `n` of `list`, or NULL when `n` is outside [0, numOfElements). */
+void * ArrayListGet(ArrayList * list, long n){
+    int in_range = (n >= 0 && n < list -> numOfElements);
+
+    return in_range ? list -> elementList[n] : NULL;
+}
+
+/*
+ * Appends `new_elem` to `list`, growing the element array by a factor of 1.3
+ * when it is full.
+ *
+ * Returns the new number of elements, or -1 when the array cannot be grown
+ * (the list is left unchanged in that case).
+ *
+ * Fixes: for capacities 1..3 the product capacity*1.3 truncates back to the
+ * old capacity, so the array never grew and the store below wrote past its
+ * end; the growth is now at least one slot. The realloc() result is checked
+ * before it replaces the old pointer.
+ */
+int ArrayListPush(ArrayList * list, void * new_elem){
+    if(list -> capacityOfElements <= list->numOfElements){
+        long grown = (long)(list -> capacityOfElements * 1.3);
+        void * resized;
+
+        if(grown <= list -> numOfElements) grown = list -> numOfElements + 1;
+
+        resized = realloc(list -> elementList, sizeof(void *) * grown);
+        if(resized == NULL) return -1;
+
+        list -> elementList = resized;
+        list -> capacityOfElements = grown;
+    }
+    list->elementList[list->numOfElements++] = new_elem;
+    return list->numOfElements;
+}
+/* Registers the function ArrayListDestroy() calls on every element; NULL means elements are not freed. */
+void ArrayListSetDeallocationFunction(ArrayList * list, void (*elem_deallocator)(void *elem)){
+ list -> elemDeallocator = elem_deallocator;
+}
+
+
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableCreate() - creates a new HashTable
+ * DESCRIPTION:
+ * Creates a new HashTable. When finished with this HashTable, it
+ * should be explicitly destroyed by calling the HashTableDestroy()
+ * function.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * numOfBuckets - the number of buckets to start the HashTable out with.
+ * Must be greater than zero, and should be prime.
+ * Ideally, the number of buckets should between 1/5
+ * and 1 times the expected number of elements in the
+ * HashTable. Values much more or less than this will
+ * result in wasted memory or decreased performance
+ * respectively. The number of buckets in a HashTable
+ * can be re-calculated to an appropriate number by
+ * calling the HashTableRehash() function once the
+ * HashTable has been populated. The number of buckets
+ * in a HashTable may also be re-calculated
+ * automatically if the ratio of elements to buckets
+ * passes the thresholds set by HashTableSetIdealRatio().
+ * RETURNS:
+ * HashTable - a new Hashtable, or NULL on error
+\*--------------------------------------------------------------------------*/
+
+
+HashTable *HashTableCreate(long numOfBuckets) {
+    HashTable *hashTable;
+    long i;    /* same type as numOfBuckets: an int index could overflow before reaching a large count */
+
+    assert(numOfBuckets > 0);
+
+    hashTable = (HashTable *) malloc(sizeof(HashTable));
+    if (hashTable == NULL)
+        return NULL;
+
+    /* User-data slots and counters start out empty. */
+    hashTable->appendix1=NULL;
+    hashTable->appendix2=NULL;
+    hashTable->appendix3=NULL;
+
+    hashTable->counter1=0;
+    hashTable->counter2=0;
+    hashTable->counter3=0;
+
+    hashTable->bucketArray = (KeyValuePair **)
+                        malloc(numOfBuckets * sizeof(KeyValuePair *));
+    if (hashTable->bucketArray == NULL) {
+        free(hashTable);
+        return NULL;
+    }
+
+    hashTable->numOfBuckets = numOfBuckets;
+    hashTable->numOfElements = 0;
+
+    for (i=0; i<numOfBuckets; i++)
+        hashTable->bucketArray[i] = NULL;
+
+    hashTable->idealRatio = 3.0;
+    hashTable->lowerRehashThreshold = 0.0;
+    hashTable->upperRehashThreshold = 15.0;
+
+    /* Defaults: keys and values are compared and hashed as raw pointers, and nothing is freed on removal. */
+    hashTable->keycmp = pointercmp;
+    hashTable->valuecmp = pointercmp;
+    hashTable->hashFunction = pointerHashFunction;
+    hashTable->keyDeallocator = NULL;
+    hashTable->valueDeallocator = NULL;
+
+    return hashTable;
+}
+
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableDestroy() - destroys an existing HashTable
+ * DESCRIPTION:
+ * Destroys an existing HashTable.
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to destroy
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableDestroy(HashTable *hashTable) {
+    long i;    /* long: HashTableCreate() takes the bucket count as a long */
+
+    for (i=0; i<hashTable->numOfBuckets; i++) {
+        KeyValuePair *pair = hashTable->bucketArray[i];
+
+        /* Walk the chain, remembering the successor before the node is freed. */
+        while (pair != NULL) {
+            KeyValuePair *nextPair = pair->next;
+
+            if (hashTable->keyDeallocator != NULL)
+                hashTable->keyDeallocator((void *) pair->key);
+            if (hashTable->valueDeallocator != NULL)
+                hashTable->valueDeallocator(pair->value);
+
+            free(pair);
+            pair = nextPair;
+        }
+    }
+
+    free(hashTable->bucketArray);
+    free(hashTable);
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableContainsKey() - checks the existence of a key in a HashTable
+ * DESCRIPTION:
+ * Determines whether or not the specified HashTable contains the
+ * specified key. Uses the comparison function specified by
+ * HashTableSetKeyComparisonFunction().
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to search
+ * key - the key to search for
+ * RETURNS:
+ * bool - whether or not the specified HashTable contains the
+ * specified key.
+\*--------------------------------------------------------------------------*/
+
+int HashTableContainsKey(const HashTable *hashTable, const void *key) {
+    /* A key is present exactly when a lookup yields a non-NULL value. */
+    const void *stored = HashTableGet(hashTable, key);
+
+    return stored != NULL;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableContainsValue()
+ * - checks the existence of a value in a HashTable
+ * DESCRIPTION:
+ * Determines whether or not the specified HashTable contains the
+ * specified value. Unlike HashTableContainsKey(), this function is
+ * not very efficient, since it has to scan linearly looking for a
+ * match. Uses the comparison function specified by
+ * HashTableSetValueComparisonFunction().
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to search
+ * value - the value to search for
+ * RETURNS:
+ * bool - whether or not the specified HashTable contains the
+ * specified value.
+\*--------------------------------------------------------------------------*/
+
+int HashTableContainsValue(const HashTable *hashTable, const void *value) {
+    long i;    /* long: HashTableCreate() takes the bucket count as a long */
+
+    for (i=0; i<hashTable->numOfBuckets; i++) {
+        KeyValuePair *pair;
+
+        /* Scan the whole chain of this bucket with the table's value comparator. */
+        for (pair = hashTable->bucketArray[i]; pair != NULL; pair = pair->next) {
+            if (hashTable->valuecmp(value, pair->value) == 0)
+                return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTablePut() - adds a key/value pair to a HashTable
+ * DESCRIPTION:
+ * Adds the specified key/value pair to the specified HashTable. If
+ * the key already exists in the HashTable (determined by the comparison
+ * function specified by HashTableSetKeyComparisonFunction()), its value
+ * is replaced by the new value. May trigger an auto-rehash (see
+ * HashTableSetIdealRatio()). It is illegal to specify NULL as the
+ * key or value.
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to add to
+ * key - the key to add or whose value to replace
+ * value - the value associated with the key
+ * RETURNS:
+ * err - 0 if successful, -1 if an error was encountered
+\*--------------------------------------------------------------------------*/
+
+int HashTablePut(HashTable *hashTable, const void *key, void *value) {
+    /* A plain put always adopts the caller's key pointer when an equal key
+     * is already stored. */
+    const int replace_existing_key = 1;
+
+    return HashTablePutReplace(hashTable, key, value, replace_existing_key);
+}
+/*
+ * Inserts a key/value pair, or updates the entry whose key compares equal
+ * (according to the table's keycmp).
+ *
+ *  replace_key - when non-zero and the stored key is a different pointer
+ *                from 'key', the stored key is released through
+ *                keyDeallocator (if one is set) and 'key' is stored in its
+ *                place; when zero the stored key pointer is left untouched.
+ *
+ * Returns 0 on success, or -1 if memory for a new entry could not be
+ * allocated.
+ */
+int HashTablePutReplace(HashTable *hashTable, const void *key, void *value, int replace_key) {
+    long bucket;
+    KeyValuePair *entry;
+
+    assert(key != NULL);
+    assert(value != NULL);
+
+    bucket = hashTable->hashFunction(key) % hashTable->numOfBuckets;
+
+    /* Walk the chain until an entry with an equal key is found (or the
+     * chain ends, leaving 'entry' NULL). */
+    for (entry = hashTable->bucketArray[bucket]; entry != NULL; entry = entry->next) {
+        if (hashTable->keycmp(key, entry->key) == 0)
+            break;
+    }
+
+    if (entry != NULL) {
+        /* Existing entry: pointers are swapped only when they differ, so
+         * re-inserting the very same object never hands it to a deallocator. */
+        if (replace_key && entry->key != key) {
+            if (hashTable->keyDeallocator != NULL)
+                hashTable->keyDeallocator((void *) entry->key);
+            entry->key = key;
+        }
+        if (entry->value != value) {
+            if (hashTable->valueDeallocator != NULL)
+                hashTable->valueDeallocator(entry->value);
+            entry->value = value;
+        }
+        return 0;
+    }
+
+    entry = malloc(sizeof *entry);
+    if (entry == NULL)
+        return -1;
+
+    /* New entries are pushed onto the front of the bucket's chain. */
+    entry->key = key;
+    entry->value = value;
+    entry->next = hashTable->bucketArray[bucket];
+    hashTable->bucketArray[bucket] = entry;
+    hashTable->numOfElements++;
+
+    /* Growth is checked only when the upper threshold lies above the ideal
+     * ratio; otherwise no automatic rehash happens on insertion. */
+    if (hashTable->upperRehashThreshold > hashTable->idealRatio) {
+        const float elementToBucketRatio = (float) hashTable->numOfElements /
+                                           (float) hashTable->numOfBuckets;
+        if (elementToBucketRatio > hashTable->upperRehashThreshold)
+            HashTableRehash(hashTable, 0);
+    }
+
+    return 0;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableGet() - retrieves the value of a key in a HashTable
+ * DESCRIPTION:
+ * Retrieves the value of the specified key in the specified HashTable.
+ * Uses the comparison function specified by
+ * HashTableSetKeyComparisonFunction().
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to search
+ * key - the key whose value is desired
+ * RETURNS:
+ * void * - the value of the specified key, or NULL if the key
+ * doesn't exist in the HashTable
+\*--------------------------------------------------------------------------*/
+
+void *HashTableGet(const HashTable *hashTable, const void *key) {
+    /* Reduce the raw hash to a bucket index. */
+    const long bucket = hashTable->hashFunction(key) % hashTable->numOfBuckets;
+    const KeyValuePair *entry;
+
+    for (entry = hashTable->bucketArray[bucket]; entry != NULL; entry = entry->next) {
+        /* keycmp returns zero for keys that are considered equal. */
+        if (hashTable->keycmp(key, entry->key) == 0)
+            return entry->value;
+    }
+
+    return NULL;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableRemove() - removes a key/value pair from a HashTable
+ * DESCRIPTION:
+ * Removes the key/value pair identified by the specified key from the
+ * specified HashTable if the key exists in the HashTable. May trigger
+ * an auto-rehash (see HashTableSetIdealRatio()).
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to remove the key/value pair from
+ * key - the key specifying the key/value pair to be removed
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableRemove(HashTable *hashTable, const void *key) {
+    const long bucket = hashTable->hashFunction(key) % hashTable->numOfBuckets;
+    /* 'link' always addresses the pointer that refers to the entry under
+     * inspection: first the bucket head, afterwards a predecessor's 'next'.
+     * Unlinking is then a single store, with no head/non-head special case. */
+    KeyValuePair **link = &hashTable->bucketArray[bucket];
+    KeyValuePair *victim;
+
+    while (*link != NULL && hashTable->keycmp(key, (*link)->key) != 0)
+        link = &(*link)->next;
+
+    victim = *link;
+    if (victim == NULL)
+        return; /* key not present: nothing to remove */
+
+    /* Release the payload first (only if deallocators are installed). */
+    if (hashTable->keyDeallocator != NULL)
+        hashTable->keyDeallocator((void *) victim->key);
+    if (hashTable->valueDeallocator != NULL)
+        hashTable->valueDeallocator(victim->value);
+
+    *link = victim->next;
+    free(victim);
+    hashTable->numOfElements--;
+
+    /* A lower threshold of 0.0 (or less) switches automatic shrinking off. */
+    if (hashTable->lowerRehashThreshold > 0.0) {
+        const float elementToBucketRatio = (float) hashTable->numOfElements /
+                                           (float) hashTable->numOfBuckets;
+        if (elementToBucketRatio < hashTable->lowerRehashThreshold)
+            HashTableRehash(hashTable, 0);
+    }
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableRemoveAll() - removes all key/value pairs from a HashTable
+ * DESCRIPTION:
+ * Removes all key/value pairs from the specified HashTable, then requests
+ * a rehash of the now-empty table down to 5 buckets.
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to remove all key/value pairs from
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableRemoveAll(HashTable *hashTable) {
+    /* Same type as numOfBuckets (long); an int index would overflow on
+     * tables holding more than INT_MAX buckets. */
+    long i;
+
+    for (i = 0; i < hashTable->numOfBuckets; i++) {
+        KeyValuePair *pair = hashTable->bucketArray[i];
+
+        while (pair != NULL) {
+            /* Save the successor before 'pair' is freed. */
+            KeyValuePair *nextPair = pair->next;
+
+            if (hashTable->keyDeallocator != NULL)
+                hashTable->keyDeallocator((void *) pair->key);
+            if (hashTable->valueDeallocator != NULL)
+                hashTable->valueDeallocator(pair->value);
+            free(pair);
+            pair = nextPair;
+        }
+        hashTable->bucketArray[i] = NULL;
+    }
+
+    hashTable->numOfElements = 0;
+    /* Shrink the emptied table to a small fixed bucket count. */
+    HashTableRehash(hashTable, 5);
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableIsEmpty() - determines if a HashTable is empty
+ * DESCRIPTION:
+ * Determines whether or not the specified HashTable contains any
+ * key/value pairs.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * hashTable - the HashTable to check
+ * RETURNS:
+ * bool - whether or not the specified HashTable contains any
+ * key/value pairs
+\*--------------------------------------------------------------------------*/
+
+int HashTableIsEmpty(const HashTable *hashTable) {
+    /* The element count is maintained by the put/remove functions, so no
+     * bucket scan is needed. */
+    const long stored = hashTable->numOfElements;
+
+    return stored == 0 ? 1 : 0;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSize() - returns the number of elements in a HashTable
+ * DESCRIPTION:
+ * Returns the number of key/value pairs that are present in the
+ * specified HashTable.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * hashTable - the HashTable whose size is requested
+ * RETURNS:
+ * long - the number of key/value pairs that are present in
+ * the specified HashTable
+\*--------------------------------------------------------------------------*/
+
+long HashTableSize(const HashTable *hashTable) {
+    /* Number of key/value pairs currently stored (not the bucket count). */
+    const long stored = hashTable->numOfElements;
+
+    return stored;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableGetNumBuckets() - returns the number of buckets in a HashTable
+ * DESCRIPTION:
+ * Returns the number of buckets that are in the specified HashTable.
+ * This may change dynamically throughout the life of a HashTable if
+ * automatic or manual rehashing is performed.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * hashTable - the HashTable whose number of buckets is requested
+ * RETURNS:
+ * long - the number of buckets that are in the specified
+ * HashTable
+\*--------------------------------------------------------------------------*/
+
+long HashTableGetNumBuckets(const HashTable *hashTable) {
+    /* Current length of bucketArray; HashTableRehash() may change it. */
+    const long buckets = hashTable->numOfBuckets;
+
+    return buckets;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetKeyComparisonFunction()
+ * - specifies the function used to compare keys in a HashTable
+ * DESCRIPTION:
+ * Specifies the function used to compare keys in the specified
+ * HashTable. The specified function should return zero if the two
+ * keys are considered equal, and non-zero otherwise. The default
+ * function is one that simply compares pointers.
+ * ARGUMENTS:
+ * hashTable - the HashTable whose key comparison function is being
+ * specified
+ * keycmp - a function which returns zero if the two arguments
+ * passed to it are considered "equal" keys and non-zero
+ * otherwise
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetKeyComparisonFunction(HashTable *hashTable,
+        int (*keycmp)(const void *key1, const void *key2))
+{
+    /* Lookups call keycmp unconditionally, so a NULL comparator is refused. */
+    assert(keycmp != NULL);
+
+    hashTable->keycmp = keycmp;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetValueComparisonFunction()
+ * - specifies the function used to compare values in a HashTable
+ * DESCRIPTION:
+ * Specifies the function used to compare values in the specified
+ * HashTable. The specified function should return zero if the two
+ * values are considered equal, and non-zero otherwise. The default
+ * function is one that simply compares pointers.
+ * ARGUMENTS:
+ * hashTable - the HashTable whose value comparison function is being
+ * specified
+ * valuecmp - a function which returns zero if the two arguments
+ * passed to it are considered "equal" values and non-zero
+ * otherwise
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetValueComparisonFunction(HashTable *hashTable,
+        int (*valuecmp)(const void *value1, const void *value2))
+{
+    /* HashTableContainsValue() calls valuecmp unconditionally, so a NULL
+     * comparator is refused. */
+    assert(valuecmp != NULL);
+
+    hashTable->valuecmp = valuecmp;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetHashFunction()
+ * - specifies the hash function used by a HashTable
+ * DESCRIPTION:
+ * Specifies the function used to determine the hash value for a key
+ * in the specified HashTable (before modulation). An ideal hash
+ * function is one which is easy to compute and approximates a
+ * "random" function. The default function is one that works
+ * relatively well for pointers. If the HashTable keys are to be
+ * strings (which is probably the case), then this default function
+ * will not suffice, in which case consider using the provided
+ * HashTableStringHashFunction() function.
+ * ARGUMENTS:
+ * hashTable - the HashTable whose hash function is being specified
+ * hashFunction - a function which returns an appropriate hash code
+ * for a given key
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetHashFunction(HashTable *hashTable,
+        unsigned long (*hashFunction)(const void *key))
+{
+    /* Every put/get/remove calls the hash function, so NULL is refused.
+     * Entries already stored are not re-bucketed by this call. */
+    assert(hashFunction != NULL);
+
+    hashTable->hashFunction = hashFunction;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableRehash() - reorganizes a HashTable to be more efficient
+ * DESCRIPTION:
+ * Reorganizes a HashTable to be more efficient. If a number of
+ * buckets is specified, the HashTable is rehashed to that number of
+ * buckets. If 0 is specified, the HashTable is rehashed to a number
+ * of buckets which is automatically calculated to be a prime number
+ * that achieves (as closely as possible) the ideal element-to-bucket
+ * ratio specified by the HashTableSetIdealRatio() function.
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to be reorganized
+ * numOfBuckets - the number of buckets to rehash the HashTable to.
+ * Should be prime. Ideally, the number of buckets
+ * should be between 1/5 and 1 times the expected
+ * number of elements in the HashTable. Values much
+ * more or less than this will result in wasted memory
+ * or decreased performance respectively. If 0 is
+ * specified, an appropriate number of buckets is
+ * automatically calculated.
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableRehash(HashTable *hashTable, long numOfBuckets) {
+    KeyValuePair **newBucketArray;
+    /* Bucket counts are long; an int index would overflow (and never
+     * terminate the loops below) once a table exceeds INT_MAX buckets. */
+    long i;
+
+    assert(numOfBuckets >= 0);
+    /* 0 means "pick a bucket count from the ideal element-to-bucket ratio". */
+    if (numOfBuckets == 0)
+        numOfBuckets = calculateIdealNumOfBuckets(hashTable);
+
+    if (numOfBuckets == hashTable->numOfBuckets)
+        return; /* already the right size! */
+
+    newBucketArray = (KeyValuePair **)
+        malloc(numOfBuckets * sizeof(KeyValuePair *));
+    if (newBucketArray == NULL) {
+        /* Couldn't allocate memory for the new array.  This isn't a fatal
+         * error; we just can't perform the rehash. */
+        return;
+    }
+
+    for (i = 0; i < numOfBuckets; i++)
+        newBucketArray[i] = NULL;
+
+    /* Move every existing entry onto the head of its new chain.  The
+     * entries themselves are reused; only their 'next' links change. */
+    for (i = 0; i < hashTable->numOfBuckets; i++) {
+        KeyValuePair *pair = hashTable->bucketArray[i];
+
+        while (pair != NULL) {
+            /* Remember the old successor before the link is overwritten. */
+            KeyValuePair *nextPair = pair->next;
+            long hashValue = hashTable->hashFunction(pair->key) % numOfBuckets;
+
+            pair->next = newBucketArray[hashValue];
+            newBucketArray[hashValue] = pair;
+            pair = nextPair;
+        }
+    }
+
+    free(hashTable->bucketArray);
+    hashTable->bucketArray = newBucketArray;
+    hashTable->numOfBuckets = numOfBuckets;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetIdealRatio()
+ * - sets the ideal element-to-bucket ratio of a HashTable
+ * DESCRIPTION:
+ * Sets the ideal element-to-bucket ratio, as well as the lower and
+ * upper auto-rehash thresholds, of the specified HashTable. Note
+ * that this function doesn't actually perform a rehash.
+ *
+ * The default values for these properties are 3.0, 0.0 and 15.0
+ * respectively. This is likely fine for most situations, so there
+ * is probably no need to call this function.
+ * ARGUMENTS:
+ * hashTable - a HashTable
+ * idealRatio - the ideal element-to-bucket ratio. When a rehash
+ * occurs (either manually via a call to the
+ * HashTableRehash() function or automatically due to
+ * the triggering of one of the thresholds below), the
+ * number of buckets in the HashTable will be
+ * recalculated to be a prime number that achieves (as
+ * closely as possible) this ideal ratio. Must be a
+ * positive number.
+ * lowerRehashThreshold
+ * - the element-to-bucket ratio that is considered
+ * unacceptably low (i.e., too few elements per bucket).
+ * If the actual ratio falls below this number, a
+ * rehash will automatically be performed. Must be
+ * lower than the value of idealRatio. If no ratio
+ * is considered unacceptably low, a value of 0.0 can
+ * be specified.
+ * upperRehashThreshold
+ * - the element-to-bucket ratio that is considered
+ * unacceptably high (i.e., too many elements per bucket).
+ * If the actual ratio rises above this number, a
+ * rehash will automatically be performed. Must be
+ * higher than idealRatio. However, if no ratio
+ * is considered unacceptably high, a value of 0.0 can
+ * be specified.
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetIdealRatio(HashTable *hashTable, float idealRatio,
+        float lowerRehashThreshold, float upperRehashThreshold)
+{
+    /* Required ordering: lower < ideal, and either upper > ideal or upper
+     * is exactly 0.0.  Nothing is rehashed here; the values are only stored. */
+    assert(idealRatio > 0.0);
+    assert(lowerRehashThreshold < idealRatio);
+    assert(upperRehashThreshold == 0.0 || upperRehashThreshold > idealRatio);
+
+    hashTable->lowerRehashThreshold = lowerRehashThreshold;
+    hashTable->upperRehashThreshold = upperRehashThreshold;
+    hashTable->idealRatio = idealRatio;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetDeallocationFunctions()
+ * - sets the key and value deallocation functions of a HashTable
+ * DESCRIPTION:
+ * Sets the key and value deallocation functions of the specified
+ * HashTable. This determines what happens to a key or a value when it
+ * is removed from the HashTable. If the deallocation function is NULL
+ * (the default if this function is never called), its reference is
+ * simply dropped and it is up to the calling program to perform the
+ * proper memory management. If the deallocation function is non-NULL,
+ * it is called to free the memory used by the object. E.g., for simple
+ * objects, an appropriate deallocation function may be free().
+ *
+ * This affects the behaviour of the HashTableDestroy(), HashTablePut(),
+ * HashTableRemove() and HashTableRemoveAll() functions.
+ * ARGUMENTS:
+ * hashTable - a HashTable
+ * keyDeallocator
+ * - if non-NULL, the function to be called when a key is
+ * removed from the HashTable.
+ * valueDeallocator
+ * - if non-NULL, the function to be called when a value is
+ * removed from the HashTable.
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetDeallocationFunctions(HashTable *hashTable,
+        void (*keyDeallocator)(void *key),
+        void (*valueDeallocator)(void *value))
+{
+    /* Either callback may be NULL; the removal paths test for NULL before
+     * calling them. */
+    hashTable->valueDeallocator = valueDeallocator;
+    hashTable->keyDeallocator = keyDeallocator;
+}
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableStringHashFunction() - a good hash function for strings
+ * DESCRIPTION:
+ * A hash function that is appropriate for hashing strings. Note that
+ * this is not the default hash function. To make it the default hash
+ * function, call HashTableSetHashFunction(hashTable,
+ * HashTableStringHashFunction).
+ * ARGUMENTS:
+ * key - the key to be hashed
+ * RETURNS:
+ * unsigned long - the unmodulated hash value of the key
+\*--------------------------------------------------------------------------*/
+
+unsigned long HashTableStringHashFunction(const void *key) {
+    /* Bytes are read as unsigned char so that characters above 0x7f
+     * contribute positive values regardless of plain-char signedness. */
+    const unsigned char *cursor = (const unsigned char *) key;
+    unsigned long hashValue = 0;
+
+    /* Polynomial hash with multiplier 37 over the NUL-terminated string;
+     * unsigned arithmetic wraps on overflow. */
+    while (*cursor != '\0') {
+        hashValue = hashValue * 37 + *cursor;
+        cursor++;
+    }
+
+    return hashValue;
+}
+
+/* Identity comparison: 0 when both arguments are the same pointer, 1
+ * otherwise (zero means "equal", as the table's comparators require). */
+static int pointercmp(const void *pointer1, const void *pointer2) {
+    return (pointer1 == pointer2) ? 0 : 1;
+}
+
+/* Hashes a key by identity: the pointer's value converted to unsigned long
+ * is the (unmodulated) hash. */
+static unsigned long pointerHashFunction(const void *pointer) {
+    const unsigned long address = (unsigned long) pointer;
+
+    return address;
+}
+
+/* Cheap primality screen for odd candidates: trial division by the odd
+ * numbers 3..49 only.  A candidate with no divisor in that range is
+ * reported as prime even though it may be composite (e.g. 53 * 53). */
+static int isProbablePrime(long oddNumber) {
+    long divisor = 3;
+
+    while (divisor < 51) {
+        if (oddNumber == divisor)
+            return 1;
+        if (oddNumber % divisor == 0)
+            return 0;
+        divisor += 2;
+    }
+
+    return 1; /* maybe */
+}
+
+/* Picks a bucket count close to numOfElements / idealRatio: never below 5,
+ * always odd, and advanced in steps of 2 until isProbablePrime() accepts it. */
+static long calculateIdealNumOfBuckets(HashTable *hashTable) {
+    long candidate = hashTable->numOfElements / hashTable->idealRatio;
+
+    /* 5 is the floor; anything larger is forced odd by setting bit 0. */
+    candidate = (candidate < 5) ? 5 : (candidate | 0x01);
+
+    for (; !isProbablePrime(candidate); candidate += 2)
+        ;
+
+    return candidate;
+}
+
+
+/*
+ * Releases every stored value with free() and then destroys the table.
+ *
+ * Intended for tables that have no valueDeallocator installed: if one is
+ * set, HashTableDestroy() hands each (already freed) value to it as well.
+ */
+void free_values_destroy(HashTable * tab)
+{
+    KeyValuePair * cursor;
+    /* Same type as numOfBuckets (long); an int index would overflow on
+     * tables holding more than INT_MAX buckets. */
+    long bucket;
+
+    for (bucket = 0; bucket < tab->numOfBuckets; bucket++)
+    {
+        for (cursor = tab->bucketArray[bucket]; cursor != NULL; cursor = cursor->next)
+        {
+            /* Only the value is released here; the entry itself is still
+             * linked and is freed later by HashTableDestroy(). */
+            free(cursor->value);
+        }
+    }
+
+    HashTableDestroy(tab);
+}
+
+/*
+ * Calls process_item(key, value, tab) once for every entry in the table,
+ * bucket by bucket, following each chain from head to tail.
+ *
+ * The entry's 'next' link is read after the callback returns, so the
+ * callback must not remove the entry it was handed.
+ */
+void HashTableIteration(HashTable * tab, void process_item(void * key, void * hashed_obj, HashTable * tab) )
+{
+    /* Same type as numOfBuckets (long); an int index would overflow on
+     * tables holding more than INT_MAX buckets. */
+    long i;
+
+    for (i = 0; i < tab->numOfBuckets; i++) {
+        KeyValuePair *pair = tab->bucketArray[i];
+
+        while (pair != NULL) {
+            /* Keys are stored as const void *; the callback takes void *. */
+            process_item((void *) pair->key, pair->value, tab);
+            pair = pair->next;
+        }
+    }
+}
diff --git a/src/longread-mapping/hashtable.h b/src/longread-mapping/hashtable.h
new file mode 100644
index 0000000..579aeb2
--- /dev/null
+++ b/src/longread-mapping/hashtable.h
@@ -0,0 +1,471 @@
+/*--------------------------------------------------------------------------*\
+ * -----===== HashTable =====-----
+ *
+ * Author: Keith Pomakis (pomakis at pobox.com)
+ * Date: August, 1998
+ * Released to the public domain.
+ *
+ *--------------------------------------------------------------------------
+ * $Id: hashtable.h,v 1.2 2017/11/09 21:37:23 cvs Exp $
+\*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*\
+ * Author of the ArrayList functions and data structures : Yang Liao
+ * Licence: GPL2
+\*--------------------------------------------------------------------------*/
+
+
+#ifndef _HASHTABLE_H
+#define _HASHTABLE_H
+
+/* These structs should not be accessed directly from user code.
+ * All access should be via the public functions declared below. */
+
+/* One entry of a bucket's singly linked chain. */
+typedef struct KeyValuePair_struct {
+ /* Key as supplied by the caller; released through keyDeallocator, if set. */
+ const void *key;
+ /* Value associated with the key; released through valueDeallocator, if set. */
+ void *value;
+ /* Next entry in the same bucket, or NULL at the end of the chain. */
+ struct KeyValuePair_struct *next;
+} KeyValuePair;
+
+/* A chained hash table mapping opaque key pointers to opaque value pointers. */
+typedef struct {
+ /* Length of bucketArray. */
+ long numOfBuckets;
+ /* Number of key/value pairs currently stored. */
+ long numOfElements;
+ /* numOfBuckets chain heads; an empty bucket holds NULL. */
+ KeyValuePair **bucketArray;
+ /* Element-to-bucket ratios that drive automatic rehashing
+  * (see HashTableSetIdealRatio()). */
+ float idealRatio, lowerRehashThreshold, upperRehashThreshold;
+ /* Key comparator; returns zero when the two keys are considered equal. */
+ int (*keycmp)(const void *key1, const void *key2);
+ /* Value comparator; returns zero when the two values are considered equal. */
+ int (*valuecmp)(const void *value1, const void *value2);
+ /* Raw hash of a key; the table reduces it modulo numOfBuckets. */
+ unsigned long (*hashFunction)(const void *key);
+ /* Optional (may be NULL) callbacks invoked when a key or value leaves the table. */
+ void (*keyDeallocator)(void *key);
+ void (*valueDeallocator)(void *value);
+
+ /* NOTE(review): the appendix and counter fields below are not touched by the
+  * table functions visible here; presumably caller-owned scratch slots -
+  * confirm against the users of this struct. */
+ void * appendix1;
+ void * appendix2;
+ void * appendix3;
+ long long int counter1;
+ long long int counter2;
+ long long int counter3;
+} HashTable;
+
+
+/* Container used by the ArrayList* functions declared below.
+ * NOTE(review): the implementation is not visible here; the field notes are
+ * inferred from the names and should be confirmed against it. */
+typedef struct {
+ /* Presumably the array of stored element pointers. */
+ void ** elementList;
+ /* Presumably the number of elements currently stored. */
+ long numOfElements;
+ /* Presumably the number of slots allocated in elementList. */
+ long capacityOfElements;
+ /* Callback set via ArrayListSetDeallocationFunction(); presumably used to
+  * release elements - TODO confirm. */
+ void (*elemDeallocator)(void *elem);
+} ArrayList;
+
+ArrayList * ArrayListCreate(int init_capacity);
+void ArrayListDestroy(ArrayList * list);
+void * ArrayListGet(ArrayList * list, long n);
+int ArrayListPush(ArrayList * list, void * new_elem);
+void ArrayListSetDeallocationFunction(ArrayList * list, void (*elem_deallocator)(void *elem));
+
+
+void HashTableIteration(HashTable * tab, void process_item(void * key, void * hashed_obj, HashTable * tab) );
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableCreate() - creates a new HashTable
+ * DESCRIPTION:
+ * Creates a new HashTable. When finished with this HashTable, it
+ * should be explicitly destroyed by calling the HashTableDestroy()
+ * function.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * numOfBuckets - the number of buckets to start the HashTable out with.
+ * Must be greater than zero, and should be prime.
+ * Ideally, the number of buckets should be between 1/5
+ * and 1 times the expected number of elements in the
+ * HashTable. Values much more or less than this will
+ * result in wasted memory or decreased performance
+ * respectively. The number of buckets in a HashTable
+ * can be re-calculated to an appropriate number by
+ * calling the HashTableRehash() function once the
+ * HashTable has been populated. The number of buckets
+ * in a HashTable may also be re-calculated
+ * automatically if the ratio of elements to buckets
+ * passes the thresholds set by HashTableSetIdealRatio().
+ * RETURNS:
+ * HashTable - a new HashTable, or NULL on error
+\*--------------------------------------------------------------------------*/
+
+HashTable *HashTableCreate(long numOfBuckets);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableDestroy() - destroys an existing HashTable
+ * DESCRIPTION:
+ * Destroys an existing HashTable.
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to destroy
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableDestroy(HashTable *hashTable);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableContainsKey() - checks the existence of a key in a HashTable
+ * DESCRIPTION:
+ * Determines whether or not the specified HashTable contains the
+ * specified key. Uses the comparison function specified by
+ * HashTableSetKeyComparisonFunction().
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to search
+ * key - the key to search for
+ * RETURNS:
+ * bool - whether or not the specified HashTable contains the
+ * specified key.
+\*--------------------------------------------------------------------------*/
+
+int HashTableContainsKey(const HashTable *hashTable, const void *key);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableContainsValue()
+ * - checks the existence of a value in a HashTable
+ * DESCRIPTION:
+ * Determines whether or not the specified HashTable contains the
+ * specified value. Unlike HashTableContainsKey(), this function is
+ * not very efficient, since it has to scan linearly looking for a
+ * match. Uses the comparison function specified by
+ * HashTableSetValueComparisonFunction().
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to search
+ * value - the value to search for
+ * RETURNS:
+ * bool - whether or not the specified HashTable contains the
+ * specified value.
+\*--------------------------------------------------------------------------*/
+
+int HashTableContainsValue(const HashTable *hashTable, const void *value);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTablePut() - adds a key/value pair to a HashTable
+ * DESCRIPTION:
+ * Adds the specified key/value pair to the specified HashTable. If
+ * the key already exists in the HashTable (determined by the comparison
+ * function specified by HashTableSetKeyComparisonFunction()), its value
+ * is replaced by the new value. May trigger an auto-rehash (see
+ * HashTableSetIdealRatio()). It is illegal to specify NULL as the
+ * key or value.
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to add to
+ * key - the key to add or whose value to replace
+ * value - the value associated with the key
+ * RETURNS:
+ * err - 0 if successful, -1 if an error was encountered
+\*--------------------------------------------------------------------------*/
+
+int HashTablePut(HashTable *hashTable, const void *key, void *value);
+int HashTablePutReplace(HashTable *hashTable, const void *key, void *value, int replace_key);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableGet() - retrieves the value of a key in a HashTable
+ * DESCRIPTION:
+ * Retrieves the value of the specified key in the specified HashTable.
+ * Uses the comparison function specified by
+ * HashTableSetKeyComparisonFunction().
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to search
+ * key - the key whose value is desired
+ * RETURNS:
+ * void * - the value of the specified key, or NULL if the key
+ * doesn't exist in the HashTable
+\*--------------------------------------------------------------------------*/
+
+void *HashTableGet(const HashTable *hashTable, const void *key);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableRemove() - removes a key/value pair from a HashTable
+ * DESCRIPTION:
+ * Removes the key/value pair identified by the specified key from the
+ * specified HashTable if the key exists in the HashTable. May trigger
+ * an auto-rehash (see HashTableSetIdealRatio()).
+ * EFFICIENCY:
+ * O(1), assuming a good hash function and element-to-bucket ratio
+ * ARGUMENTS:
+ * hashTable - the HashTable to remove the key/value pair from
+ * key - the key specifying the key/value pair to be removed
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableRemove(HashTable *hashTable, const void *key);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableRemoveAll() - removes all key/value pairs from a HashTable
+ * DESCRIPTION:
+ * Removes all key/value pairs from the specified HashTable. May trigger
+ * an auto-rehash (see HashTableSetIdealRatio()).
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to remove all key/value pairs from
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableRemoveAll(HashTable *hashTable);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableIsEmpty() - determines if a HashTable is empty
+ * DESCRIPTION:
+ * Determines whether or not the specified HashTable contains any
+ * key/value pairs.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * hashTable - the HashTable to check
+ * RETURNS:
+ * bool - whether or not the specified HashTable contains any
+ * key/value pairs
+\*--------------------------------------------------------------------------*/
+
+int HashTableIsEmpty(const HashTable *hashTable);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSize() - returns the number of elements in a HashTable
+ * DESCRIPTION:
+ * Returns the number of key/value pairs that are present in the
+ * specified HashTable.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * hashTable - the HashTable whose size is requested
+ * RETURNS:
+ * long - the number of key/value pairs that are present in
+ * the specified HashTable
+\*--------------------------------------------------------------------------*/
+
+long HashTableSize(const HashTable *hashTable);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableGetNumBuckets() - returns the number of buckets in a HashTable
+ * DESCRIPTION:
+ * Returns the number of buckets that are in the specified HashTable.
+ * This may change dynamically throughout the life of a HashTable if
+ * automatic or manual rehashing is performed.
+ * EFFICIENCY:
+ * O(1)
+ * ARGUMENTS:
+ * hashTable - the HashTable whose number of buckets is requested
+ * RETURNS:
+ * long - the number of buckets that are in the specified
+ * HashTable
+\*--------------------------------------------------------------------------*/
+
+long HashTableGetNumBuckets(const HashTable *hashTable);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetKeyComparisonFunction()
+ * - specifies the function used to compare keys in a HashTable
+ * DESCRIPTION:
+ * Specifies the function used to compare keys in the specified
+ * HashTable. The specified function should return zero if the two
+ * keys are considered equal, and non-zero otherwise. The default
+ * function is one that simply compares pointers.
+ * ARGUMENTS:
+ * hashTable - the HashTable whose key comparison function is being
+ * specified
+ * keycmp - a function which returns zero if the two arguments
+ * passed to it are considered "equal" keys and non-zero
+ * otherwise
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetKeyComparisonFunction(HashTable *hashTable,
+ int (*keycmp)(const void *key1, const void *key2));
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetValueComparisonFunction()
+ * - specifies the function used to compare values in a HashTable
+ * DESCRIPTION:
+ * Specifies the function used to compare values in the specified
+ * HashTable. The specified function should return zero if the two
+ * values are considered equal, and non-zero otherwise. The default
+ * function is one that simply compares pointers.
+ * ARGUMENTS:
+ * hashTable - the HashTable whose value comparison function is being
+ * specified
+ * valuecmp - a function which returns zero if the two arguments
+ * passed to it are considered "equal" values and non-zero
+ * otherwise
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetValueComparisonFunction(HashTable *hashTable,
+ int (*valuecmp)(const void *value1, const void *value2));
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetHashFunction()
+ * - specifies the hash function used by a HashTable
+ * DESCRIPTION:
+ * Specifies the function used to determine the hash value for a key
+ * in the specified HashTable (before modulation). An ideal hash
+ * function is one which is easy to compute and approximates a
+ * "random" function. The default function is one that works
+ * relatively well for pointers. If the HashTable keys are to be
+ * strings (which is probably the case), then this default function
+ * will not suffice, in which case consider using the provided
+ * HashTableStringHashFunction() function.
+ * ARGUMENTS:
+ * hashTable - the HashTable whose hash function is being specified
+ * hashFunction - a function which returns an appropriate hash code
+ * for a given key
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetHashFunction(HashTable *hashTable,
+ unsigned long (*hashFunction)(const void *key));
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableRehash() - reorganizes a HashTable to be more efficient
+ * DESCRIPTION:
+ * Reorganizes a HashTable to be more efficient. If a number of
+ * buckets is specified, the HashTable is rehashed to that number of
+ * buckets. If 0 is specified, the HashTable is rehashed to a number
+ * of buckets which is automatically calculated to be a prime number
+ * that achieves (as closely as possible) the ideal element-to-bucket
+ * ratio specified by the HashTableSetIdealRatio() function.
+ * EFFICIENCY:
+ * O(n)
+ * ARGUMENTS:
+ * hashTable - the HashTable to be reorganized
+ * numOfBuckets - the number of buckets to rehash the HashTable to.
+ * Should be prime. Ideally, the number of buckets
+ * should be between 1/5 and 1 times the expected
+ * number of elements in the HashTable. Values much
+ * more or less than this will result in wasted memory
+ * or decreased performance respectively. If 0 is
+ * specified, an appropriate number of buckets is
+ * automatically calculated.
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableRehash(HashTable *hashTable, long numOfBuckets);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetIdealRatio()
+ * - sets the ideal element-to-bucket ratio of a HashTable
+ * DESCRIPTION:
+ * Sets the ideal element-to-bucket ratio, as well as the lower and
+ * upper auto-rehash thresholds, of the specified HashTable. Note
+ * that this function doesn't actually perform a rehash.
+ *
+ * The default values for these properties are 3.0, 0.0 and 15.0
+ * respectively. This is likely fine for most situations, so there
+ * is probably no need to call this function.
+ * ARGUMENTS:
+ * hashTable - a HashTable
+ * idealRatio - the ideal element-to-bucket ratio. When a rehash
+ * occurs (either manually via a call to the
+ * HashTableRehash() function or automatically due to
+ * the triggering of one of the thresholds below), the
+ * number of buckets in the HashTable will be
+ * recalculated to be a prime number that achieves (as
+ * closely as possible) this ideal ratio. Must be a
+ * positive number.
+ * lowerRehashThreshold
+ * - the element-to-bucket ratio that is considered
+ * unacceptably low (i.e., too few elements per bucket).
+ * If the actual ratio falls below this number, a
+ * rehash will automatically be performed. Must be
+ * lower than the value of idealRatio. If no ratio
+ * is considered unacceptably low, a value of 0.0 can
+ * be specified.
+ * upperRehashThreshold
+ * - the element-to-bucket ratio that is considered
+ * unacceptably high (i.e., too many elements per bucket).
+ * If the actual ratio rises above this number, a
+ * rehash will automatically be performed. Must be
+ * higher than idealRatio. However, if no ratio
+ * is considered unacceptably high, a value of 0.0 can
+ * be specified.
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetIdealRatio(HashTable *hashTable, float idealRatio,
+ float lowerRehashThreshold,
+ float upperRehashThreshold);
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableSetDeallocationFunctions()
+ * - sets the key and value deallocation functions of a HashTable
+ * DESCRIPTION:
+ * Sets the key and value deallocation functions of the specified
+ * HashTable. This determines what happens to a key or a value when it
+ * is removed from the HashTable. If the deallocation function is NULL
+ * (the default if this function is never called), its reference is
+ * simply dropped and it is up to the calling program to perform the
+ * proper memory management. If the deallocation function is non-NULL,
+ * it is called to free the memory used by the object. E.g., for simple
+ * objects, an appropriate deallocation function may be free().
+ *
+ * This affects the behaviour of the HashTableDestroy(), HashTablePut(),
+ * HashTableRemove() and HashTableRemoveAll() functions.
+ * ARGUMENTS:
+ * hashTable - a HashTable
+ * keyDeallocator
+ * - if non-NULL, the function to be called when a key is
+ * removed from the HashTable.
+ * valueDeallocator
+ * - if non-NULL, the function to be called when a value is
+ * removed from the HashTable.
+ * RETURNS:
+ * <nothing>
+\*--------------------------------------------------------------------------*/
+
+void HashTableSetDeallocationFunctions(HashTable *hashTable,
+ void (*keyDeallocator)(void *key),
+ void (*valueDeallocator)(void *value));
+
+/*--------------------------------------------------------------------------*\
+ * NAME:
+ * HashTableStringHashFunction() - a good hash function for strings
+ * DESCRIPTION:
+ * A hash function that is appropriate for hashing strings. Note that
+ * this is not the default hash function. To make it the default hash
+ * function, call HashTableSetHashFunction(hashTable, HashTableStringHashFunction).
+ * ARGUMENTS:
+ * key - the key to be hashed
+ * RETURNS:
+ * unsigned long - the unmodulated hash value of the key
+\*--------------------------------------------------------------------------*/
+
+unsigned long HashTableStringHashFunction(const void *key);
+
+/* NOTE(review): this declaration carries no description and its implementation
+ * is not visible from this header; judging by the name it presumably frees the
+ * values stored in `tab` and then destroys the table -- confirm in hashtable.c. */
+void free_values_destroy(HashTable * tab);
+#endif /* _HASHTABLE_H */
+
+
diff --git a/src/longread-mapping/longread-mapping-one.c b/src/longread-mapping/longread-mapping-one.c
new file mode 100644
index 0000000..c04f3cb
--- /dev/null
+++ b/src/longread-mapping/longread-mapping-one.c
@@ -0,0 +1,1231 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdarg.h>
+#include <time.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sys/timeb.h>
+#include <sys/stat.h>
+#include <locale.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "LRMconfig.h"
+#include "LRMsorted-hashtable.h"
+#include "LRMbase-index.h"
+#include "LRMchro-event.h"
+#include "LRMfile-io.h"
+
+int LRMvalidate_and_init_context(LRMcontext_t ** context, int argc, char ** argv);
+
+/*
+ * Program entry point: runs the pipeline stages one after another.
+ * A stage is only started when every earlier stage returned zero; the exit
+ * status is 0 when all stages succeeded and 1 as soon as one of them failed.
+ */
+int main(int argc, char ** argv){
+    LRMcontext_t *context = NULL;
+    int retv;
+
+    retv = (0 != LRMvalidate_and_init_context(&context, argc, argv));
+    if(!retv) retv = (0 != LRMshow_conf(context));
+    if(!retv) retv = (0 != LRMrun_task(context));
+    if(!retv) retv = (0 != LRMfinalise(context));
+    if(!retv) retv = (0 != LRMprint_mapping_summary(context));
+    if(!retv) retv = (0 != LRMdestroy_context(context));
+    context = NULL;
+
+    return retv;
+}
+
+/*
+ * Prints the closing report: the value of context->all_processed_reads and
+ * the time, in minutes, between context->start_running_time and now.
+ * Always returns 0.
+ */
+int LRMprint_mapping_summary(LRMcontext_t * context){
+    LRMprintf("\n\nAll finished.\n\nTotal processed reads : %d\n", context -> all_processed_reads);
+
+    double elapsed_minutes = (LRMmiltime() - context->start_running_time)/60;
+    LRMprintf("Time: %.1f minutes\n\n" , elapsed_minutes);
+
+    return 0;
+}
+
+/* Copies the NUL-terminated string `src` into `dst`, which holds `dst_size`
+ * bytes. Returns 0 on success; returns 1 and leaves `dst` untouched when
+ * `src` (including its terminator) does not fit. */
+static int LRMcopy_option_string(char * dst, size_t dst_size, const char * src){
+    size_t src_len = strlen(src);
+    if(src_len >= dst_size) return 1;
+    memcpy(dst, src, src_len + 1);
+    return 0;
+}
+
+/*
+ * Allocates a zeroed context, applies the default settings, parses the
+ * command line and creates/opens the tables and files used by the mapper.
+ *
+ * Options: -r input file, -i index prefix, -o output file, -T threads,
+ * -P phred offset (a value starting with '6' selects Phred+64), -j junction
+ * detection, -s SAM output, -n maximum subreads per segment, -X maximum
+ * mismatched bases in a subread, -O segment overlap, -m minimum votes.
+ *
+ * context - receives the newly allocated context; set to NULL only when the
+ *           allocation of the context itself fails.
+ * Returns 0 on success and non-zero when an option is invalid, a required
+ * option is missing, a name does not fit its buffer, memory is exhausted or
+ * the input/output file cannot be opened.
+ */
+int LRMvalidate_and_init_context(LRMcontext_t ** context, int argc, char ** argv){
+    LRMcontext_t * ctx;
+    int c, opt_value;
+
+    ctx = malloc(sizeof(LRMcontext_t));
+    (*context) = ctx;
+    if(NULL == ctx){
+        LRMprintf("Unable to allocate memory for the mapping context.\n");
+        return 1;
+    }
+    memset(ctx, 0, sizeof(LRMcontext_t));
+    LRMset_default_values_context(ctx);
+
+    ctx -> input_file_name[0] = 0;
+    ctx -> output_file_name[0] = 0;
+    ctx -> index_prefix[0] = 0;
+
+    optind = 0;
+    opterr = 1;
+    optopt = 63;
+    // -n, -X, -O and -m have case labels below, so they must be listed in the
+    // option string as well; otherwise getopt() rejects them as unknown.
+    while ((c = getopt (argc, argv, "r:i:o:T:P:n:X:O:m:js"))!=-1){
+        switch(c){
+            case 'P':
+                ctx -> is_Phred_64=(optarg[0]=='6');
+                break;
+            case 'j':
+                ctx -> do_junction_detection = 1;
+                break;
+            case 'r':
+                if(LRMcopy_option_string(ctx -> input_file_name, sizeof(ctx -> input_file_name), optarg)){
+                    LRMprintf("The input file name is too long.\n");
+                    return 1;
+                }
+                break;
+            case 'i':
+                if(LRMcopy_option_string(ctx -> index_prefix, sizeof(ctx -> index_prefix), optarg)){
+                    LRMprintf("The index prefix is too long.\n");
+                    return 1;
+                }
+                break;
+            case 's':
+                ctx -> is_SAM_output = 1;
+                break;
+            case 'o':
+                if(LRMcopy_option_string(ctx -> output_file_name, sizeof(ctx -> output_file_name), optarg)){
+                    LRMprintf("The output file name is too long.\n");
+                    return 1;
+                }
+                break;
+            case 'T':
+                ctx -> threads = min(max(1,atoi(optarg)),LRMMAX_THREADS);
+                break;
+            case 'n':
+                opt_value = atoi(optarg);
+                if(opt_value < 1 || opt_value >= LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT){
+                    LRMprintf("The number of subreads per segment is out of range.\n");
+                    return 1;
+                }
+                ctx -> max_subreads_per_segment = opt_value;
+                break;
+            case 'X':
+                opt_value = atoi(optarg);
+                if(opt_value < 0 || opt_value >= 3){
+                    LRMprintf("The number of mismatched bases in a subread must be 0, 1 or 2.\n");
+                    return 1;
+                }
+                ctx -> max_mismatched_bases_in_subread = opt_value;
+                break;
+            case 'O':
+                opt_value = atoi(optarg);
+                // The segment step is LRMSEGMENT_MIN_LENGTH minus the overlap and must stay positive.
+                if(opt_value < 0 || opt_value >= LRMSEGMENT_MIN_LENGTH){
+                    LRMprintf("The segment overlap is out of range.\n");
+                    return 1;
+                }
+                ctx -> segment_overlapping = opt_value;
+                break;
+            case 'm':
+                ctx -> min_voting_number = atoi(optarg);
+                break;
+            case '?':
+            default:
+                return 1;
+        }
+    }
+
+    if(ctx -> input_file_name[0] ==0 || ctx -> output_file_name[0]==0 || ctx -> index_prefix[0]==0){
+        LRMprintf("Please specify the input, output files and the index.\n");
+        return 1;
+    }
+
+    // Record the quoted command line; stop quietly once the buffer is full.
+    size_t cmd_used = 0, cmd_capacity = sizeof(ctx -> user_command_line);
+    ctx -> user_command_line[0]=0;
+    for(c = 0; c<argc;c++){
+        int written = snprintf(ctx -> user_command_line + cmd_used, cmd_capacity - cmd_used, "\"%s\" ", argv[c]);
+        if(written < 0){
+            ctx -> user_command_line[cmd_used] = 0;
+            break;
+        }
+        if((size_t)written >= cmd_capacity - cmd_used) break;
+        cmd_used += (size_t)written;
+    }
+
+    LRMthread_lockinit(&ctx -> input_lock);
+    LRMthread_lockinit(&ctx -> sam_bam_file_lock);
+
+    ctx -> sam_bam_chromosome_table = HashTableCreate(199);
+    HashTableSetKeyComparisonFunction(ctx -> sam_bam_chromosome_table, LRMhash_strcmp);
+    HashTableSetHashFunction(ctx -> sam_bam_chromosome_table, LRMhash_strhash);
+    HashTableSetDeallocationFunctions(ctx -> sam_bam_chromosome_table, NULL, NULL);
+
+    ctx -> chromosome_size_list = ArrayListCreate(29);
+
+    ctx -> chromosome_size_table = HashTableCreate(199);
+    HashTableSetKeyComparisonFunction(ctx -> chromosome_size_table, LRMhash_strcmp);
+    HashTableSetHashFunction(ctx -> chromosome_size_table, LRMhash_strhash);
+    HashTableSetDeallocationFunctions(ctx -> chromosome_size_table, free, NULL);
+
+    ctx -> sam_bam_chromosome_list = ArrayListCreate(29);
+
+    LRMload_offsets(ctx);
+
+    int retv = LRMgeinput_open(ctx -> input_file_name, &ctx -> input_file);
+
+    ctx -> sam_bam_file = fopen( ctx -> output_file_name, "w");
+    if(NULL == ctx -> sam_bam_file) retv = 1;
+
+    ctx -> event_space = malloc(sizeof(LRMevent_t)*20000);
+    if(NULL == ctx -> event_space) retv = 1;
+    ctx -> event_space_size = 20000;
+    LRMthread_lockinit(&ctx -> event_space_lock);
+    ctx -> events_realignment = HashTableCreate(320000);
+
+    assert(LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT >= ctx -> max_subreads_per_segment );
+
+    return retv;
+}
+
+
+/*
+ * Returns the current wall-clock time as seconds since the Epoch, including
+ * the sub-second fraction. gettimeofday() replaces the obsolete ftime().
+ * Returns 0.0 if the clock cannot be read.
+ */
+double LRMmiltime(){
+    struct timeval now;
+
+    if(gettimeofday(&now, NULL) != 0) return 0.0;
+    return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
+}
+
+
+/*
+ * Fills `context` with the built-in default settings and stamps the current
+ * time into start_running_time. Fields not listed here are left untouched.
+ */
+void LRMset_default_values_context(LRMcontext_t * context){
+    /* run-time bookkeeping */
+    context->start_running_time = LRMmiltime();
+    context->threads = 1;
+    context->multi_best_read_alignments = 1;
+
+    /* mapping limits */
+    context->max_indel_length = 15;
+    context->max_junction_distance = 100000;
+    context->max_mismatched_bases_in_subread = 0;
+    context->max_subreads_per_segment = 10;
+    context->min_voting_number = 1;
+    context->segment_overlapping = 30;
+
+    /* dynamic-programming scores */
+    context->dynamic_programming_score_match = 2;
+    context->dynamic_programming_score_create_gap = -2;
+    context->dynamic_programming_score_extend_gap = -1;
+}
+
+/*
+ * Prints the run configuration (threads, input file, output file with its
+ * format, index prefix). Always returns 0.
+ */
+int LRMshow_conf(LRMcontext_t * context){
+    const char * output_format = "BAM";
+    if(context->is_SAM_output) output_format = "SAM";
+
+    LRMprintf("\n ====== Subread long read mapping ======\n\n");
+    LRMprintf("Threads: %d\n" , context->threads);
+    LRMprintf("Input file: %s\n" , context->input_file_name);
+    LRMprintf("Output file: %s (%s)\n" , context->output_file_name, output_format);
+    LRMprintf("Index: %s\n\n" , context->index_prefix);
+
+    return 0;
+}
+
+/*
+ * Zeroes the per-read mapping results before a chunk is processed. Nothing is
+ * cleared while all_processed_reads is still zero. Always returns 0.
+ */
+int LRMinit_chunk(LRMcontext_t * context){
+    if(0 == context->all_processed_reads) return 0;
+
+    memset(context -> read_mapping_results, 0, sizeof(LRMread_mapping_result_t)*LRMREADS_PER_CHUNK);
+    return 0;
+}
+
+/*
+ * Loads the index and then processes the input chunk by chunk: each chunk is
+ * read twice, first for the voting step and, after rewinding the input, for
+ * the realignment step.
+ *
+ * Returns 0 on success or non-zero as soon as one step fails. The "index was
+ * loaded" message is only printed when loading actually succeeded.
+ */
+int LRMrun_task(LRMcontext_t * context){
+    int retv = LRMload_index( context );
+    if(retv) return retv;
+
+    LRMprintf("Index was loaded; the gap between subreads is %d bases\n", context -> current_index.index_gap );
+    while(!(retv ||LRMinput_has_finished( context ))){
+        retv=retv || LRMinit_chunk(context);
+        retv=retv || LRMsave_input_pos(context);
+        retv=retv || LRMiterate_reads( context , LRMRUNNING_STEP_VOTING);
+        retv=retv || LRMrewind_input_pos(context);
+        retv=retv || LRMiterate_reads( context , LRMRUNNING_STEP_REALIGN);
+        retv=retv || LRMfinalise_chunk_reads(context);
+    }
+    return retv;
+}
+
+/* Post-mapping hook: currently does nothing and always returns 0. */
+int LRMfinalise(LRMcontext_t * context){
+    (void) context;
+    return 0;
+}
+
+/*
+ * Releases everything owned by `context`: the index structures, the lookup
+ * tables and lists, the event storage and the per-read segment results. For
+ * non-SAM output the stored tail bytes are written to the output file before
+ * it is closed. Finally the input is closed and the context itself is freed.
+ * Always returns 0.
+ */
+int LRMdestroy_context(LRMcontext_t * context){
+    int readno = 0;
+
+    LRMgehash_destory(&(context -> current_index));
+    LRMgvindex_destory(&(context -> current_base_index));
+
+    HashTableDestroy(context -> chromosome_size_table);
+    ArrayListDestroy(context -> chromosome_size_list);
+
+    HashTableDestroy(context -> sam_bam_chromosome_table);
+    ArrayListDestroy(context -> sam_bam_chromosome_list);
+
+    // The values of the realignment table are released together with the table.
+    HashTableSetDeallocationFunctions(context -> events_realignment, NULL, free);
+    HashTableDestroy(context -> events_realignment);
+
+    free(context -> event_space);
+
+    // free(NULL) is a no-op, so unused result slots need no special case.
+    while(readno < LRMREADS_PER_CHUNK){
+        free(context -> read_mapping_results[readno].segment_results);
+        readno++;
+    }
+
+    if(0 == context -> is_SAM_output)
+        fwrite(context -> bam_file_tail_binary,1, context -> bam_file_tail_length, context->sam_bam_file);
+
+    LRMgeinput_close(&context->input_file);
+    fclose(context->sam_bam_file);
+    free(context);
+    return 0;
+}
+
+
+/* Returns the context's input_exhausted flag. */
+int LRMinput_has_finished( LRMcontext_t * context ){
+    int finished = context -> input_exhausted;
+    return finished;
+}
+
+/*
+ * Loads the two index files "<index_prefix>.00.b.tab" and
+ * "<index_prefix>.00.b.array" into the context.
+ *
+ * The file names are built with a bounded snprintf(); a prefix that does not
+ * fit into LRMMAX_FILENAME_LENGTH bytes together with its suffix is reported
+ * as an error instead of overflowing the buffer.
+ *
+ * Returns 0 on success and 1 on any failure; the second file is not loaded
+ * when the first one failed.
+ */
+int LRMload_index(LRMcontext_t * context){
+    char indextab_fname[LRMMAX_FILENAME_LENGTH];
+    int name_len;
+
+    name_len = snprintf(indextab_fname, sizeof(indextab_fname), "%s.00.b.tab", context -> index_prefix);
+    if(name_len < 0 || (size_t)name_len >= sizeof(indextab_fname)){
+        LRMprintf("The index prefix is too long.\n");
+        return 1;
+    }
+    if(LRMgehash_load(&(context -> current_index), indextab_fname)) return 1;
+
+    name_len = snprintf(indextab_fname, sizeof(indextab_fname), "%s.00.b.array", context -> index_prefix);
+    if(name_len < 0 || (size_t)name_len >= sizeof(indextab_fname)){
+        LRMprintf("The index prefix is too long.\n");
+        return 1;
+    }
+    if(LRMgvindex_load(&(context -> current_base_index), indextab_fname)) return 1;
+
+    return 0;
+}
+
+
+/*
+ * Runs one pass (`task`) over the current chunk: starts the worker threads,
+ * waits for them and merges their results. Each stage runs only when the
+ * previous one returned zero. Returns 0 on success, 1 otherwise.
+ */
+int LRMiterate_reads( LRMcontext_t * context, int task ){
+    if(LRMstart_thread( context , task )) return 1;
+    if(LRMwait_threads( context )) return 1;
+    if(LRMmerge_threads( context, task )) return 1;
+    return 0;
+}
+
+/*
+ * Thread entry point. `args` is a heap-allocated array of three pointers:
+ * the context, the thread number and the task number (the two integers are
+ * carried inside pointer-sized slots). The array is freed here before the
+ * chunk iteration starts. Always returns NULL.
+ */
+void * LRM_thread_runner (void * args){
+    void ** argv = args;
+    LRMcontext_t * context = argv[0];
+    // Convert through intptr_t rather than subtracting NULL from a void
+    // pointer, which is not valid standard C.
+    int thid = (int)(intptr_t)argv[1];
+    int task = (int)(intptr_t)argv[2];
+    free(args);
+
+    LRMchunk_read_iteration(context, thid, task);
+
+    return NULL;
+}
+
+/* Frees the three dynamic-programming scratch buffers of `thread_context`
+ * and resets the pointers to NULL. */
+static void LRMfree_dynamic_programming_buffers(LRMthread_context_t * thread_context){
+    free(thread_context -> dynamic_programming_movement_buffer);
+    free(thread_context -> dynamic_programming_score_buffer);
+    free(thread_context -> dynamic_programming_indel_movement_buf);
+    thread_context -> dynamic_programming_movement_buffer = NULL;
+    thread_context -> dynamic_programming_score_buffer = NULL;
+    thread_context -> dynamic_programming_indel_movement_buf = NULL;
+}
+
+/* Allocates the three dynamic-programming scratch buffers of `thread_context`.
+ * Returns 0 on success; on failure nothing stays allocated and 1 is returned. */
+static int LRMalloc_dynamic_programming_buffers(LRMcontext_t * context, LRMthread_context_t * thread_context){
+    thread_context -> dynamic_programming_movement_buffer = malloc(( 2* context -> max_indel_length + 1) * LRMDYNAMIC_MAXIMUM_GAP_LENGTH);
+    thread_context -> dynamic_programming_score_buffer = malloc(sizeof(int) * ( 2 * context -> max_indel_length + 1) *( LRMDYNAMIC_MAXIMUM_GAP_LENGTH+1));
+    thread_context -> dynamic_programming_indel_movement_buf = malloc( max( LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 1.3, 300 ) + context -> max_indel_length + 1 );
+
+    if(thread_context -> dynamic_programming_movement_buffer != NULL
+       && thread_context -> dynamic_programming_score_buffer != NULL
+       && thread_context -> dynamic_programming_indel_movement_buf != NULL) return 0;
+
+    LRMfree_dynamic_programming_buffers(thread_context);
+    return 1;
+}
+
+/*
+ * Resets the context of thread `thread_id` and allocates the buffers needed
+ * for `step`:
+ *   - voting:      thread 0 first writes the SAM/BAM header; every thread
+ *                  gets the dynamic-programming buffers;
+ *   - realignment: the dynamic-programming buffers plus a 2400000-byte
+ *                  output buffer.
+ * Any other step only resets the thread context.
+ *
+ * Returns 0 on success and 1 when an allocation fails; in that case no
+ * buffer of this thread context is left allocated.
+ */
+int LRMstart_thread_init_context(LRMcontext_t * context, int thread_id, int step){
+    LRMthread_context_t * thread_context = context -> thread_contexts+thread_id;
+    memset(thread_context, 0, sizeof(LRMthread_context_t));
+    thread_context->thread_id = thread_id;
+
+    if(step != LRMRUNNING_STEP_VOTING && step != LRMRUNNING_STEP_REALIGN) return 0;
+
+    if(step == LRMRUNNING_STEP_VOTING && thread_context -> thread_id == 0)
+        LRMsambam_write_header(context, thread_context);
+
+    if(LRMalloc_dynamic_programming_buffers(context, thread_context)) return 1;
+
+    if(step == LRMRUNNING_STEP_REALIGN){
+        thread_context -> out_SAMBAM_buffer = malloc(2400000);
+        if(thread_context -> out_SAMBAM_buffer == NULL){
+            LRMfree_dynamic_programming_buffers(thread_context);
+            return 1;
+        }
+
+        thread_context -> out_buff_used = 0;
+        thread_context -> out_buff_capacity = 2400000;
+    }
+    return 0;
+}
+
+/*
+ * Prepares the per-thread context and launches one worker per configured
+ * thread for `task`. Each worker receives a heap-allocated array holding the
+ * context, its thread number and the task; the worker frees that array.
+ *
+ * Returns 0 when every worker was launched and 1 as soon as preparing a
+ * thread context or allocating its argument array fails (no further workers
+ * are started in that case).
+ */
+int LRMstart_thread(LRMcontext_t * context, int task ){
+    int th_id;
+
+    for(th_id=0; th_id<context -> threads; th_id++){
+        void ** th_args;
+
+        if(LRMstart_thread_init_context(context,th_id,task)) return 1;
+
+        th_args = malloc(sizeof(void *)*3); // to be freed in the thread.
+        if(NULL == th_args) return 1;
+
+        th_args[0] = context;
+        // Integers travel in pointer-sized slots; convert through intptr_t
+        // rather than adding them to NULL, which is not valid standard C.
+        th_args[1] = (void *)(intptr_t)th_id;
+        th_args[2] = (void *)(intptr_t)task;
+        LRMpthread_create(context -> running_threads+th_id, NULL, LRM_thread_runner, th_args);
+    }
+
+    return 0;
+}
+
+/* Joins every worker thread recorded in context->running_threads.
+ * Always returns 0. */
+int LRMwait_threads( LRMcontext_t * context ){
+    int th_id = 0;
+
+    while(th_id < context -> threads){
+        LRMpthread_join(context -> running_threads[th_id], NULL);
+        th_id++;
+    }
+    return 0;
+}
+
+/*
+ * Frees the buffers a thread context holds for `task`: the three
+ * dynamic-programming buffers for the voting and the realignment step, plus
+ * the output buffer for the realignment step. Other tasks free nothing.
+ */
+void LRMmerge_threads_destroy_context(LRMcontext_t * context, LRMthread_context_t * thread_context, int task){
+    int is_voting = (task == LRMRUNNING_STEP_VOTING);
+    int is_realign = (task == LRMRUNNING_STEP_REALIGN);
+
+    if(!is_voting && !is_realign) return;
+
+    free(thread_context -> dynamic_programming_movement_buffer);
+    free(thread_context -> dynamic_programming_score_buffer);
+    free(thread_context -> dynamic_programming_indel_movement_buf);
+
+    if(is_voting) return;
+    free(thread_context -> out_SAMBAM_buffer);
+}
+
+/*
+ * Collects the results of all worker threads after a pass and releases their
+ * per-thread buffers.
+ *   - voting:      reorders the events and builds the event entries;
+ *   - realignment: flushes each thread's output buffer; for the last thread
+ *                  the BAM tail block is generated as well.
+ * Returns 0 on success, 1 when an event step failed.
+ *
+ * NOTE(review): the two event calls take only the shared context yet are
+ * issued once per thread -- confirm that repeating them is intended.
+ */
+int LRMmerge_threads( LRMcontext_t * context , int step){
+    int retv = 0;
+    int th_id = 0;
+
+    while(th_id < context -> threads){
+        LRMthread_context_t * thread_context = context -> thread_contexts+th_id;
+
+        if(step == LRMRUNNING_STEP_VOTING){
+            if(!retv) retv = (0 != LRMevents_reorder(context));
+            if(!retv) retv = (0 != LRMevents_build_entries(context));
+        }else if(step == LRMRUNNING_STEP_REALIGN){
+            LRMwrite_chunk_check_buffer_write(context, thread_context, 1);
+            if(th_id == context -> threads-1)LRMbam_generate_tail_binary(context, thread_context);
+        }else assert(0);
+
+        LRMmerge_threads_destroy_context(context, thread_context, step);
+        th_id++;
+    }
+
+    return retv;
+}
+
+/*
+ * Resets the per-chunk read counter and moves the input back to the position
+ * stored by LRMsave_input_pos(): gzipped FASTQ input uses the saved zlib
+ * position, any other input the saved raw file offset. Always returns 0.
+ */
+int LRMrewind_input_pos(LRMcontext_t * context){
+    context -> processed_reads_in_chunk = 0;
+
+    if(context->input_file.file_type != LRMGENE_INPUT_GZIP_FASTQ){
+        fseeko(context->input_file.input_fp, context->last_saved_raw_pos, SEEK_SET);
+        return 0;
+    }
+
+    seekgz_seek(context->input_file.input_fp, &context->last_saved_zlib_pos);
+    return 0;
+}
+
+/*
+ * Resets the per-chunk read counter and stores the current input position:
+ * the zlib position for gzipped FASTQ input, the raw file offset otherwise.
+ * Always returns 0.
+ */
+int LRMsave_input_pos( LRMcontext_t * context){
+    context -> processed_reads_in_chunk = 0;
+
+    if(context->input_file.file_type != LRMGENE_INPUT_GZIP_FASTQ){
+        context -> last_saved_raw_pos = ftello(context->input_file.input_fp);
+        return 0;
+    }
+
+    seekgz_tell(context->input_file.input_fp, &context->last_saved_zlib_pos);
+    return 0;
+}
+
+/*
+ * Splits the current read into segments and records, for each segment, its
+ * start in the read text, its start in the quality text and its length.
+ *
+ * Consecutive segments start (LRMSEGMENT_MIN_LENGTH - segment_overlapping)
+ * bases apart. Every segment is LRMSEGMENT_MIN_LENGTH long except the last
+ * one, which takes everything up to the end of the read.
+ *
+ * Returns 0 on success. Returns 1 with total_segments == 0 when the read is
+ * shorter than 16 bases, or when the configured overlap leaves no positive
+ * step (the loop could then never reach the end of the read).
+ *
+ * NOTE(review): total_segments is not checked against the capacity of the
+ * segment arrays, which is not visible here -- confirm it is large enough.
+ */
+int LRMsplit_read_to_segments(LRMcontext_t * context, LRMread_iteration_context_t* iteration_context){
+    int seg_curs = 0;
+    int increment_step = LRMSEGMENT_MIN_LENGTH - context -> segment_overlapping;
+
+    iteration_context->total_segments = 0;
+    if(iteration_context->read_length<16) return 1;
+    if(increment_step < 1) return 1;
+
+    do{
+        int seg_no = iteration_context->total_segments;
+        int seg_end = seg_curs + increment_step;
+
+        // Not enough room for another full segment after this one: make this the last.
+        if(seg_end + LRMSEGMENT_MIN_LENGTH > iteration_context->read_length) seg_end = iteration_context->read_length;
+
+        iteration_context->segment_texts[seg_no] = iteration_context->read_text + seg_curs;
+        iteration_context->segment_quals[seg_no] = iteration_context->qual_text + seg_curs;
+        iteration_context->segment_lengths[seg_no] =(seg_end == iteration_context->read_length ? iteration_context->read_length - seg_curs : LRMSEGMENT_MIN_LENGTH);
+
+        iteration_context->total_segments ++;
+        seg_curs = seg_end;
+    }while(seg_curs < iteration_context->read_length);
+
+    return 0;
+}
+
+// Reverses the read and its quality string in place and mirrors the segment
+// bookkeeping (subread offsets, segment start pointers, segment lengths) so
+// that segment i afterwards describes what used to be segment
+// (total_segments - 1 - i).
+// NOTE(review): the exact transformation done by LRMreverse_read() and
+// LRMreverse_quality() is not visible here -- presumably reverse-complement
+// and plain reversal; confirm in their definitions.
+void LRMreverse_read_and_qual(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context){
+ LRMreverse_read(iteration_context -> read_text, iteration_context -> read_length);
+ LRMreverse_quality(iteration_context -> qual_text, iteration_context -> read_length);
+
+ int segi;
+ // Swap the subread-offset rows of each mirrored pair of segments.
+ for(segi = 0; segi < iteration_context->total_segments / 2; segi++){
+ //#warning " positions of the subreads should be moved after reversing -- for the best sequencing quality "
+ unsigned int tmp_offsets[LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT];
+ memcpy(tmp_offsets, iteration_context->subread_offsets[segi], sizeof(int)* LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT);
+ memcpy(iteration_context->subread_offsets[segi], iteration_context->subread_offsets[ iteration_context->total_segments-segi-1 ],sizeof(int)* LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT);
+ memcpy(iteration_context->subread_offsets[iteration_context->total_segments-segi-1], tmp_offsets, sizeof(int)* LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT);
+ }
+
+ // Mirror the start positions of each pair and swap their lengths.
+ for(segi = 0; segi < iteration_context->total_segments / 2; segi++){
+ int old_left_length = iteration_context->segment_lengths[segi];
+ int old_right_length = iteration_context->segment_lengths[iteration_context->total_segments - segi -1];
+
+ int old_left_start = iteration_context -> segment_texts[segi] - iteration_context->read_text;
+ int old_right_start = iteration_context -> segment_texts[ iteration_context->total_segments - segi -1 ] - iteration_context->read_text;
+ // A segment covering [start, start+length) ends up at read_length - (start + length).
+ int old_left_new_start = iteration_context -> read_length - (old_left_start + old_left_length);
+ int old_right_new_start = iteration_context -> read_length - (old_right_start + old_right_length);
+
+ // The left slot takes over the mirrored right segment and vice versa.
+ iteration_context -> segment_texts[segi] = iteration_context->read_text + old_right_new_start;
+ iteration_context -> segment_quals[segi] = iteration_context->qual_text + old_right_new_start;
+ iteration_context -> segment_texts[iteration_context->total_segments - segi -1] = iteration_context->read_text + old_left_new_start;
+ iteration_context -> segment_quals[iteration_context->total_segments - segi -1] = iteration_context->qual_text + old_left_new_start;
+
+ int tmpi = iteration_context -> segment_lengths[ iteration_context->total_segments - segi -1];
+ iteration_context -> segment_lengths[ iteration_context->total_segments - segi -1] = iteration_context -> segment_lengths[segi];
+ iteration_context -> segment_lengths[segi] = tmpi;
+ }
+
+ // With an odd number of segments the middle one keeps its slot and length;
+ // only its start position is mirrored.
+ if(iteration_context->total_segments % 2){
+ // Here `segi` is still the outer loop variable, which equals
+ // total_segments / 2 after the loop above, i.e. the middle segment.
+ int old_len = iteration_context -> segment_lengths[segi];
+ // This declaration shadows the outer `segi` with the same value.
+ int segi = iteration_context->total_segments / 2;
+ int old_start = iteration_context -> segment_texts[segi] - iteration_context->read_text;
+ int new_start = iteration_context->read_length - (old_start + old_len);
+
+ iteration_context -> segment_texts[segi] = iteration_context->read_text + new_start;
+ iteration_context -> segment_quals[segi] = iteration_context->qual_text + new_start;
+ }
+}
+
+/*
+ * Inserts vote-table entry [iii][jjj] into the candidate list of `seg_result`
+ * at position `replace_index`. The candidates from that position onwards are
+ * moved one slot towards the end (the last one is dropped) before the slot is
+ * filled from the vote table.
+ */
+void LRMdo_one_voting_read_process_setres(LRMcontext_t * context, LRMread_iteration_context_t * iteration_context, LRMsegment_mapping_result_t * seg_result, int replace_index, LRMgene_vote_t *vote_table, int iii, int jjj){
+    LRMsegment_mapping_candidate_t * slot = seg_result->candidates + replace_index;
+    int moved_candidates = LRMSEGMENT_MAX_CANDIDATES - 1 - replace_index;
+
+    // One overlapping move replaces the element-by-element copy from the back.
+    if(moved_candidates > 0)
+        memmove(slot + 1, slot, sizeof(LRMsegment_mapping_candidate_t) * moved_candidates);
+
+    slot -> first_base_position = vote_table -> pos[iii][jjj];
+    slot -> indel_length_inside = vote_table -> current_indel_cursor[iii][jjj];
+    slot -> confident_coverage_start = vote_table -> coverage_start[iii][jjj];
+    slot -> confident_coverage_end = vote_table -> coverage_end[iii][jjj];
+    slot -> votes = vote_table -> votes[iii][jjj];
+    slot -> masks = vote_table -> masks[iii][jjj];
+
+    // Debug output, disabled by the constant 0.
+    if(0 && slot -> votes>2){
+        char postxt[100];
+        LRMpos2txt(context, slot -> first_base_position , postxt);
+        LRMprintf("REPLACE CANDIDATE %d : to %s (%s), V=%d\n", replace_index, postxt, (slot -> masks & LRMIS_NEGATIVE_STRAND)?"NEG":"POS", slot -> votes);
+    }
+
+    memcpy(slot -> indel_records, vote_table ->indel_recorder[iii][jjj], sizeof(short)*3*LRMMAX_INDEL_SECTIONS);
+}
+
+/*
+ * Returns 1 when the two positions p1 and p2 are located on the same
+ * chromosome and 0 otherwise. The chromosome names delivered by
+ * LRMlocate_gene_position() are compared by address, not by content.
+ */
+int LRMdo_one_voting_read_process_samechro(LRMcontext_t * context, unsigned int p1, unsigned int p2){
+    char * name_a, * name_b;
+    int offset_a, offset_b;
+
+    LRMlocate_gene_position(context, p1, &name_a, & offset_a);
+    LRMlocate_gene_position(context, p2, &name_b, & offset_b);
+
+    // Comparing the pointers is enough because they are pointers in the sam_bam_chromosome_list.
+    if(name_a == name_b) return 1;
+    return 0;
+}
+
+
+/* Yields the address of the result slot of segment `sid` inside `mapr`. When
+ * the read is currently reversed the slots are addressed from the other end.
+ * Relies on a variable named `iteration_context` being in scope. Every use of
+ * `sid` is parenthesised so that expression arguments expand correctly. */
+#define LRMseg_fetch_result(mapr, sid) ( (mapr) -> segment_results + ((iteration_context -> is_reversed == 0)?(sid):( iteration_context -> total_segments - (sid) - 1 ) ) )
+
+/*
+ * Transfers the content of the vote table of segment `this_seg_id` into the
+ * candidate list of that segment.
+ *
+ * Step 1: every vote-table entry that has at least min_voting_number votes
+ * and more votes than the last kept candidate is inserted in front of the
+ * first candidate with fewer votes.
+ *
+ * Step 2 (only with junction detection): for each candidate whose strand
+ * matches the current read orientation, the vote table is searched for a
+ * secondary location on the same chromosome that is more than 3 and less
+ * than max_junction_distance bases away and has no more votes than the
+ * candidate. The location with the highest LRMdonor_score() is stored as the
+ * candidate's secondary position if that score is positive.
+ *
+ * The distance between the two locations is a long long, so llabs() is used;
+ * abs() would truncate it to int first.
+ */
+void LRMdo_one_voting_read_process_votetab(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int this_seg_id){
+    LRMread_mapping_result_t * mapping_result = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+    LRMsegment_mapping_result_t * seg_result = LRMseg_fetch_result( mapping_result, this_seg_id );
+    seg_result -> extracted_subreads = iteration_context -> extracted_subreads;
+
+    int iii, jjj, hhh;
+    for(iii=0; iii < LRMGENE_VOTE_TABLE_SIZE; iii++){
+        for(jjj = 0; jjj < iteration_context -> vote_table.items[iii]; jjj++){
+            if( iteration_context -> vote_table.votes [iii][jjj] < context -> min_voting_number || iteration_context -> vote_table.votes [iii][jjj] <= seg_result -> candidates[LRMSEGMENT_MAX_CANDIDATES - 1].votes) continue;
+            int replace_index = LRMSEGMENT_MAX_CANDIDATES - 1;
+            int kkk;
+            for(kkk = 0; kkk < LRMSEGMENT_MAX_CANDIDATES; kkk ++){
+                if(seg_result->candidates[kkk].votes < iteration_context -> vote_table.votes [iii][jjj]){
+                    replace_index = kkk;
+                    break;
+                }
+            }
+            LRMdo_one_voting_read_process_setres(context, iteration_context, seg_result, replace_index, &iteration_context -> vote_table, iii, jjj);
+        }
+    }
+
+    if(!context -> do_junction_detection) return;
+
+    for(hhh = 0; hhh < LRMSEGMENT_MAX_CANDIDATES; hhh++){
+        // The candidates are filled from the front, so the first one without votes ends the list.
+        if(seg_result -> candidates[hhh].votes<1)break;
+        if((seg_result -> candidates[hhh].masks & LRMIS_NEGATIVE_STRAND) != (iteration_context -> is_reversed ? LRMIS_NEGATIVE_STRAND : 0)) continue;
+        seg_result -> candidates[hhh].secondary_votes = 0;
+
+        unsigned int best_secondary_half_pos = 0;
+        int best_secondary_score = -1, best_secondary_split_point = -1, best_secondary_is_GT_AG = -1, best_secondary_votes = -1, best_left_offset_indels = 0;
+
+        for(iii=0; iii < LRMGENE_VOTE_TABLE_SIZE; iii++){
+            for(jjj = 0; jjj < iteration_context -> vote_table.items[iii]; jjj++){
+                if(iteration_context -> vote_table.votes [iii][jjj] > seg_result -> candidates[hhh].votes || iteration_context -> vote_table.votes [iii][jjj] <1) continue;
+
+                long long dist0 = seg_result->candidates[hhh].first_base_position;
+                dist0 -= iteration_context -> vote_table.pos[iii][jjj];
+                int is_junction_distance = llabs(dist0) > 3 && llabs(dist0) < context -> max_junction_distance;
+                int is_same_chro = LRMdo_one_voting_read_process_samechro(context, seg_result->candidates[hhh].first_base_position, iteration_context -> vote_table.pos[iii][jjj]);
+                if(!(is_junction_distance && is_same_chro)) continue;
+
+                int this_split_point = -1, this_is_GT_AG = -1, left_indel_offset = 0;
+                int indel_length_in_anchor = seg_result->candidates[hhh].indel_length_inside;
+                int indel_length_in_secondary = iteration_context -> vote_table.current_indel_cursor[iii][jjj];
+                int this_score = LRMdonor_score(context, thread_context, iteration_context, seg_result -> candidates + hhh, this_seg_id, iteration_context ->vote_table.pos[iii][jjj] , iteration_context ->vote_table.coverage_start[iii][jjj] , iteration_context -> vote_table.coverage_end[iii][jjj], indel_length_in_anchor, indel_length_in_secondary, & this_split_point, & this_is_GT_AG , &left_indel_offset);
+
+                if(this_score > best_secondary_score ){
+                    best_secondary_half_pos = iteration_context ->vote_table. pos[iii][jjj];
+                    best_secondary_split_point = this_split_point;
+                    best_secondary_is_GT_AG = this_is_GT_AG;
+                    best_secondary_votes = iteration_context -> vote_table.votes [iii][jjj];
+                    best_secondary_score = this_score;
+                    best_left_offset_indels = left_indel_offset;
+                }
+            }
+        }
+
+        if(best_secondary_score > 0){
+            seg_result -> candidates[hhh].secondary_position = best_secondary_half_pos;
+            seg_result -> candidates[hhh].secondary_votes = best_secondary_votes;
+            seg_result -> candidates[hhh].junction_split_point = best_secondary_split_point;
+            seg_result -> candidates[hhh].junction_is_GT_AG = best_secondary_is_GT_AG;
+            seg_result -> candidates[hhh].junction_left_offset_indels = best_left_offset_indels;
+        }
+    }
+}
+
+// Chooses the subread start offsets inside segment `this_seg_id` and stores
+// them in iteration_context->subread_offsets[this_seg_id]; the number of
+// subreads is stored in iteration_context->extracted_subreads.
+//
+// The segment is divided into `extracted_subreads` equally wide slices. In
+// each slice a window of (15 + index_gap) quality characters is slid from the
+// slice start towards the slice end, and the start of the window with the
+// highest quality sum becomes the subread offset (the slice start wins ties).
+void LRMdo_one_voting_read_segment_extraction(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int this_seg_id){
+ int seg_len = iteration_context -> segment_lengths[this_seg_id];
+ char * seg_qual = iteration_context -> segment_quals[this_seg_id];
+
+ // At most max_subreads_per_segment subreads, and no more than one per 16 bases.
+ iteration_context -> extracted_subreads = min(context -> max_subreads_per_segment ,(seg_len - 15 - context -> current_index.index_gap) / 16 + 1);
+ // Width of one slice; fractional so that the slices spread over the whole segment.
+ float subread_gap = (seg_len - context -> current_index.index_gap + 1)*1. / iteration_context -> extracted_subreads;
+ int subr_i, pos_i;
+
+
+ //LRMprintf("EXTSUB: %s [%d] LEN=%d\t" , iteration_context -> read_name, this_seg_id , seg_len);
+ for(subr_i = 0; subr_i < iteration_context -> extracted_subreads; subr_i++){
+ // Default offset: the start of this slice.
+ pos_i = subr_i * subread_gap;
+ iteration_context -> subread_offsets[this_seg_id][subr_i]=pos_i;
+
+ //#warning "FOR COMPARISON ONLY ===================="
+ //continue;
+
+ int highest_qual = -1;
+ int total_qual = 0;
+ // search_end: last window end examined in this slice;
+ // search_end_1: one past the end of the first window.
+ int search_end = pos_i + subread_gap - 1, search_end_1 = pos_i + 15 + context -> current_index.index_gap;
+
+ // Quality sum of the first window.
+ // NOTE(review): nothing here checks that the window stays inside seg_len --
+ // confirm that callers only pass segments long enough for it.
+ for(; pos_i < search_end_1; pos_i++)
+ total_qual += seg_qual[ pos_i ];
+ highest_qual = total_qual;
+
+ // Slide the window one base at a time: add the entering quality value,
+ // remove the leaving one, and remember the best window start.
+ for(; pos_i < search_end; pos_i++){
+ total_qual += seg_qual[ pos_i ];
+ total_qual -= seg_qual[ pos_i -15 - context -> current_index.index_gap];
+ if(total_qual > highest_qual) {
+ highest_qual = total_qual;
+ // The window now covers [pos_i - 14 - index_gap, pos_i].
+ iteration_context -> subread_offsets[this_seg_id][subr_i]=pos_i-14 - context -> current_index.index_gap;
+ }
+ }
+ // LRMprintf("%d:%d\t", (int)(subr_i * subread_gap), iteration_context -> subread_offsets[this_seg_id][subr_i]);
+ }
+ //LRMprintf("\n");
+}
+
+// Runs the voting step for one segment of the current read: clears the vote
+// table, selects the subread offsets, looks every subread up in the index at
+// each of the `index_gap` shifts, and finally passes the filled vote table to
+// LRMdo_one_voting_read_process_votetab().
+void LRMdo_one_voting_read_segment(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int this_seg_id){
+    LRMinit_gene_vote((& iteration_context-> vote_table));
+
+    int seg_len = iteration_context -> segment_lengths[this_seg_id];
+    char * seg_text = iteration_context -> segment_texts[this_seg_id];
+
+    LRMdo_one_voting_read_segment_extraction(context, thread_context,iteration_context , this_seg_id);
+
+    int subread_no = 0;
+    while(subread_no < iteration_context ->extracted_subreads){
+        int gap_shift = 0;
+        while(gap_shift < context -> current_index.index_gap){
+            int offset_in_segment = gap_shift + iteration_context -> subread_offsets[this_seg_id][subread_no];
+            char * subread_start = seg_text + offset_in_segment;
+            LRMgehash_key_t subread_key = LRMgenekey2int(subread_start);
+
+            LRMgehash_go_tolerance(context, thread_context, iteration_context,& context->current_index, subread_key , offset_in_segment, seg_len, iteration_context -> is_reversed, & iteration_context-> vote_table, context -> max_indel_length, subread_no, context -> max_mismatched_bases_in_subread);
+            gap_shift ++;
+        }
+        subread_no ++;
+    }
+
+    // Debug dump of the vote table; compiled in but never executed.
+    if(0){
+        LRMprintf("\nREAD %s [seg %d] STAGE %s : %.*s\n", iteration_context -> read_name, this_seg_id, iteration_context -> is_reversed?"NEG":"POS", seg_len, seg_text);
+        LRMprint_v(context, iteration_context, 2);
+    }
+    LRMdo_one_voting_read_process_votetab(context, thread_context, iteration_context, this_seg_id);
+}
+
+// Returns 16 times the subread number. `len` and `total_subreads` are accepted
+// for the caller's convenience but do not influence the result.
+int LRMfind_subread_end(int len, int total_subreads, int subread){
+    enum { SUBREAD_STEP = 16 };
+    return SUBREAD_STEP * subread;
+}
+
+void LRMdo_one_voting_insert_chro_events(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int seg_id){ /*
+ * Turns the voting result of one read segment into chromosome events and
+ * hands each event to LRMchro_event_new().
+ *
+ * Pass 1 (only when context -> do_junction_detection is set): every candidate
+ * with secondary_votes > 0 yields one LRMEVENT_TYPE_JUNCTION event.
+ * Pass 2: for every candidate the indel records (triples inside
+ * indel_records[]) are walked; the read stretch between two neighbouring
+ * subreads is re-examined with LRMindel_dynamic_search() and every run of
+ * non-'M' moves it reports becomes one LRMEVENT_TYPE_INDEL event.
+ *
+ * The function returns as soon as LRMchro_event_new() reports a non-zero
+ * value. The segment text may be reversed in place while pass 2 runs.
+ */
+ int x2;
+
+ LRMread_mapping_result_t * mapping_result = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+ LRMsegment_mapping_result_t * seg_result = LRMseg_fetch_result( mapping_result, seg_id );
+
+ if(context -> do_junction_detection) // pass 1: junction events
+ for(x2 = 0; x2 < LRMSEGMENT_MAX_CANDIDATES; x2++){
+ LRMsegment_mapping_candidate_t * cand_res = seg_result -> candidates + x2;
+ //LRMprintf("TRY INSERT JUNCTION: [%d] , VOTE=%d, 2ndVOTE=%d\n", x2, cand_res -> votes, cand_res -> secondary_votes);
+ if(cand_res -> secondary_votes > 0){ // only candidates that carry a secondary location
+ LRMevent_t new_event;
+ memset(&new_event, 0, sizeof(LRMevent_t));
+ new_event.event_type = LRMEVENT_TYPE_JUNCTION;
+ new_event.small_side = - 1 + min(cand_res->first_base_position, cand_res->secondary_position) + cand_res->junction_split_point + cand_res->junction_left_offset_indels; // lower anchor + split point + left indel offset, minus one
+ new_event.large_side = max(cand_res->first_base_position, cand_res->secondary_position) + cand_res->junction_split_point; // higher anchor + split point
+ new_event.masks = cand_res-> junction_is_GT_AG?LRM_EVENT_IS_GT_AT_DONOR:0;
+
+ //LRMprintf("INSERT JUNCTION EVENT: %u~%u\n", new_event.small_side, new_event.large_side);
+
+ int retv = LRMchro_event_new(context, thread_context, iteration_context, &new_event);
+ if(retv) return; // a non-zero result stops all further insertion for this segment
+ }
+ }
+ //END: if context -> do_junction_detection
+
+ // find and insert indels
+ for(x2 = 0; x2 < LRMSEGMENT_MAX_CANDIDATES; x2++){
+ LRMsegment_mapping_candidate_t * cand_res = seg_result -> candidates + x2;
+ int last_correct_subread = cand_res -> indel_records[1]-1, last_indel = 0; // seeded from the second field of record 0; NOTE(review): becomes -1 when that field is 0 — confirm this cannot reach the subread_offsets lookup below
+ int indel_i = 0;
+
+ if(0){
+ char ptxt[100];
+ LRMpos2txt(context, cand_res -> first_base_position, ptxt);
+ LRMprintf("CANDIDATE %s [SEG-%d][%d] START AT %s\n", iteration_context -> read_name, seg_id, x2, ptxt);
+ }
+
+ for(indel_i=1; indel_i<LRMMAX_INDEL_SECTIONS; indel_i++){ // records 1.. ; each record is three consecutive ints
+ //LRMprintf("CANDIDATE INDEL[%d] = %d %d %d\n", indel_i, cand_res -> indel_records[indel_i*3], cand_res -> indel_records[indel_i*3+1], cand_res -> indel_records[indel_i*3+2] );
+ if( cand_res -> indel_records[indel_i*3]<1)break; // a first field below 1 ends the record list
+
+ int next_correct_subread = cand_res -> indel_records[indel_i*3] - 1;
+ int last_correct_base = iteration_context -> subread_offsets[seg_id][last_correct_subread] - 10; // window opens 10 bases before the previous subread's offset
+ int first_correct_base = iteration_context -> subread_offsets[seg_id][next_correct_subread]+ 13; // and closes 13 bases after the next subread's offset
+
+ int expected_indels_in_region=cand_res->indel_records[indel_i*3+2] - last_indel; // third field minus the value seen in the previous record
+ last_correct_base = max(0, last_correct_base); // clamp both window ends into [0, read_length-1] ...
+ last_correct_base = min(iteration_context -> read_length-1, last_correct_base);
+ first_correct_base = min(first_correct_base, iteration_context -> read_length-1);
+ first_correct_base = max(0, first_correct_base);
+ first_correct_base = max(first_correct_base, last_correct_base); // ... and never let the window end precede its start
+ last_correct_subread = cand_res->indel_records[indel_i*3+1]-1;
+
+ //LRMprintf("CANDIDATE EXPINDEL=%d , GAP_BASES = %d, %d\n", expected_indels_in_region, first_correct_base, last_correct_base);
+
+ if(abs(expected_indels_in_region) <= context -> max_indel_length && first_correct_base - last_correct_base > 1){
+ int currently_reversed = 1; // NOTE(review): assumes the segment text is in reversed orientation on entry; LRMdo_one_voting_read only calls this function during its is_reversed pass
+ char * corrected_read= iteration_context -> segment_texts[seg_id];
+
+ if(( (cand_res -> masks & LRMIS_NEGATIVE_STRAND ) == 0 && currently_reversed) ||
+ (( cand_res -> masks & LRMIS_NEGATIVE_STRAND ) != 0 && !currently_reversed)) { // bring the text into the candidate's strand orientation
+ LRMreverse_read( corrected_read , iteration_context -> segment_lengths[seg_id] );
+ currently_reversed = !currently_reversed;
+ }
+ unsigned int chro_cursor = cand_res -> first_base_position + last_correct_base + last_indel, total_mismatched;
+ int move_i, indel_movements = LRMindel_dynamic_search(context, thread_context, - expected_indels_in_region /* inversed definition */ , chro_cursor, corrected_read, last_correct_base, first_correct_base, &total_mismatched);
+ //LRMprintf("%s from %d MOVES=%s\n", (cand_res -> masks & LRMIS_NEGATIVE_STRAND )?"REV":"STD", last_correct_base , indel_movement_buff);
+
+ if(total_mismatched <= 1 || (total_mismatched <= 2 && first_correct_base - last_correct_base > 30) || (total_mismatched <= 10 && first_correct_base - last_correct_base > 100)){ // the mismatch allowance grows with the window width
+ int current_chr=-1, current_len = 0;
+ for(move_i = 0; move_i < 1+ indel_movements; move_i++){ // one step past the reported moves; presumably this reads the buffer terminator so that a trailing run is flushed — TODO confirm
+ int nch = thread_context -> dynamic_programming_movement_buffer[move_i];
+ nch = (nch=='X')?'M':nch; // 'X' moves are treated like 'M'
+ if(current_chr!=nch){ // a run of identical moves has just ended
+ if(current_chr>0 && current_chr != 'M'){
+ LRMevent_t new_event;
+ memset(&new_event, 0, sizeof(LRMevent_t));
+ new_event.indel_length = current_chr == 'D' ? current_len : - current_len; // 'D' runs get a positive length, every other run a negative one
+ new_event.event_type = LRMEVENT_TYPE_INDEL;
+ new_event.large_side = chro_cursor;
+ new_event.small_side = current_chr == 'D' ? chro_cursor - current_len - 1 : (chro_cursor - 1);
+ new_event.masks = cand_res-> junction_is_GT_AG?LRM_EVENT_IS_GT_AT_DONOR:0; // NOTE(review): a junction flag is copied onto an indel event — confirm this is intended
+
+ if(0){
+ char p1txt[100], p2txt[100], p0txt[100];
+ LRMpos2txt(context, new_event.small_side , p1txt);
+ LRMpos2txt(context, new_event.large_side , p2txt);
+ LRMpos2txt(context, cand_res -> first_base_position , p0txt);
+ if(1|| ( new_event.small_side >= 197828782 - 3 && new_event.small_side <= 197828782 + 5)){
+ LRMprintf("\nINSERT INDEL EVENT FROM %s: %s~%s ; LEN=%d\n", iteration_context -> read_name, p1txt, p2txt, new_event.indel_length);
+ LRMprintf("INSERT INDEL AT %s + %d + %d\n" , p0txt, last_correct_base, last_indel);
+ LRMprintf("%s MOVES=%s\n\n", (cand_res -> masks & LRMIS_NEGATIVE_STRAND )?"REV":"STD" , thread_context -> dynamic_programming_movement_buffer);
+ }
+ }
+
+ int retv = LRMchro_event_new(context, thread_context, iteration_context, &new_event);
+ if(retv) return; // NOTE(review): leaves without undoing the in-place reversal performed above
+ }
+ current_chr = nch;
+ current_len = 0;
+ }
+ current_len++;
+ if(nch !='I') chro_cursor ++; // every move except 'I' advances the chromosome cursor
+ }
+ }
+ if(currently_reversed == 0) LRMreverse_read( corrected_read , iteration_context -> segment_lengths[seg_id] ); // restore the orientation the text had on entry
+ }
+ last_indel = cand_res->indel_records[indel_i*3+2];
+ }
+ }
+}
+
+// Voting step for one whole read. Allocates a zeroed result slot per segment,
+// then votes every segment twice: first with is_reversed == 0, then (after the
+// read and its qualities have been reversed) with is_reversed == 1. Chromosome
+// events are inserted only during the second pass. A read without segments is
+// left untouched.
+void LRMdo_one_voting_read(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context){
+    LRMread_mapping_result_t * mapping_result = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+    if(iteration_context -> total_segments < 1) return;
+
+    mapping_result -> segment_results = malloc(sizeof(LRMsegment_mapping_result_t) * iteration_context->total_segments);
+    memset(mapping_result -> segment_results, 0, sizeof(LRMsegment_mapping_result_t) * iteration_context->total_segments);
+
+    // The strand counter lives in the iteration context because the callees read it.
+    iteration_context->is_reversed = 0;
+    while(iteration_context->is_reversed<2){
+        int seg_no = 0;
+        while(seg_no<iteration_context -> total_segments){
+            LRMdo_one_voting_read_segment(context, thread_context, iteration_context, seg_no);
+            if(iteration_context->is_reversed){
+                LRMdo_one_voting_insert_chro_events(context, thread_context, iteration_context, seg_no);
+            }
+            seg_no ++;
+        }
+
+        if(0 == iteration_context->is_reversed){
+            LRMreverse_read_and_qual(context, thread_context, iteration_context);
+        }
+        iteration_context->is_reversed++;
+    }
+}
+
+
+#define LRMSEGMENT_MAX_ANCHOR_POINTERS 50 // NOTE(review): not referenced in the visible part of this file — purpose to be confirmed
+
+typedef struct{ // one whole-read candidate: a chain of per-segment alignments built by LRMread_final_result_merge()
+ unsigned int read_head_pos; // position of the first chained segment minus that segment's offset inside the read
+ unsigned int votes; // sum of the votes of all chained segments
+ short masks; // masks of the first chained segment; tested against LRMIS_NEGATIVE_STRAND
+ unsigned short cand_number; // NOTE(review): not accessed in the visible code
+ unsigned int last_seg_start_pos, second_last_seg_start_pos; // positions of the last and the last-but-one chained segment
+ int segment_number; // number of used entries in segment_id[] / realign_cand_id[]
+ int segment_id[ LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_LENGTH ]; // segment index of every chain member, in chaining order
+ int realign_cand_id[ LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_LENGTH ]; // per-segment candidate slot chosen for the matching segment_id[] entry
+ long long last_distance; // strand-adjusted distance at which the last segment was attached (0 for the first one)
+ int last_votes; // votes of the last chained segment, subtracted again when that segment is replaced
+} LRMread_final_candidate_t;
+
+// Normalises a CIGAR string in place: neighbouring sections that carry the
+// same operation are merged into one, and zero-length sections are dropped.
+//
+//   cigar : NUL-terminated CIGAR text; rewritten inside its own buffer. The
+//           result is never longer than the input. When no section with a
+//           positive length is found (for example "*"), the buffer is left
+//           exactly as it was.
+//
+// Only upper-case letters count as operations; any other non-digit character
+// discards the number collected before it. `context`, `thread_context` and
+// `iteration_context` are not used.
+//
+// Zero-length sections are skipped before they can end a run, so "5M0I3M"
+// becomes "8M" rather than "5M3M".
+void LRMfix_cigar(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, char * cigar){
+    // Merging can only shorten the text, so the input size bounds every write.
+    size_t capacity = strlen(cigar) + 1;
+    unsigned int section_len = 0;   // number currently being parsed
+    unsigned int run_len = 0;       // accumulated length of the open run
+    int run_op = 0;                 // operation of the open run (0: none yet)
+    int read_i, write_i = 0, nch;
+
+    for(read_i = 0; 0 != (nch = cigar[read_i]); read_i++){
+        if(nch >= '0' && nch <= '9'){
+            section_len = 10 * section_len + (nch - '0');
+            continue;
+        }
+
+        if(nch >= 'A' && nch <= 'Z' && section_len > 0){
+            if(nch != run_op){
+                // The output trails the input cursor, so this write (and its
+                // terminator) only touches characters that were already read.
+                if(run_len > 0)
+                    write_i += snprintf(cigar + write_i, capacity - write_i, "%u%c", run_len, run_op);
+                run_op = nch;
+                run_len = 0;
+            }
+            run_len += section_len;
+        }
+        section_len = 0;
+    }
+
+    if(run_len > 0)
+        write_i += snprintf(cigar + write_i, capacity - write_i, "%u%c", run_len, run_op);
+}
+
+// Walks `cigar` from the read offset `start_read_cursor` / chromosome position
+// `start_chro_pos` and reports the chromosome position that corresponds to the
+// read offset `written_read_cursor`.
+//   - Returns `start_chro_pos` unchanged when `written_read_cursor` is not
+//     beyond `start_read_cursor`.
+//   - 'M' and 'S' advance both cursors, 'N' and 'D' only the chromosome cursor,
+//     'I' only the read cursor.
+//   - Returns -1 when the CIGAR ends before the read cursor passes
+//     `written_read_cursor`.
+// The three context arguments are not used.
+long long LRMcalculate_written_chro_pos(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int written_read_cursor, int start_read_cursor, char * cigar, unsigned int start_chro_pos){
+    int section_len = 0, op;
+    char * scan = cigar;
+
+    if(start_read_cursor >= written_read_cursor) return start_chro_pos;
+
+    while((op = *scan++) != 0){
+        if(op >= '0' && op <= '9'){
+            section_len = section_len*10+(op-'0');
+            continue;
+        }
+
+        switch(op){
+            case 'M':
+            case 'S':
+                start_chro_pos += section_len;
+                start_read_cursor += section_len;
+                break;
+            case 'N':
+            case 'D':
+                start_chro_pos += section_len;
+                break;
+            case 'I':
+                start_read_cursor += section_len;
+                break;
+            default:
+                break;
+        }
+
+        if(start_read_cursor > written_read_cursor){
+            // An insertion has no chromosome extent to step back into.
+            if(op == 'I') return start_chro_pos;
+            return start_chro_pos - (start_read_cursor - written_read_cursor);
+        }
+        section_len = 0;
+    }
+    return -1;
+}
+
+#define update_read_res { if(read_res -> votes < selected_cand_list -> votes) { read_res -> votes = selected_cand_list -> votes ; read_res -> final_pos = selected_cand_list -> read_head_pos; read_res -> masks = selected_cand_list -> masks; read_res -> best_candidate = selected_cand_list;} } // copies votes, position and masks of `selected_cand_list` into `read_res` when it has strictly more votes; both names must exist at the point of expansion
+
+// Chains the per-segment realignment candidates of one read into whole-read
+// candidates, selects the chain with the most votes and assembles its merged
+// CIGAR.
+//
+//   read_res : receives votes / final_pos / masks / best_candidate of the best
+//              chain (through the update_read_res macro).
+//   iteration_context : supplies segment_best_votes / _pos / _masks and
+//              segment_cigars; receives merged_cigar, merged_position and
+//              merged_masks. segment_texts, segment_lengths, segment_best_pos
+//              and segment_cigars of the chosen chain are modified while the
+//              soft-clip sections are stripped.
+//   Returns 0 in every case.
+//
+// Signed distances are held in `long long` and their magnitude is taken with
+// llabs(); abs() would truncate them to int and does not match "%lld".
+//
+// NOTE(review): `cond_found` is initialised once per segment, not once per
+// candidate, so after the first hit the remaining candidates of that segment
+// examine only the first chain; confirm that this is intended.
+// NOTE(review): entry_tab owns the chain records (deallocation function is
+// free), so read_res -> best_candidate presumably dangles once
+// ArrayListDestroy() has run; confirm no caller dereferences it afterwards.
+int LRMread_final_result_merge(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMread_mapping_result_t * read_res){
+    int x1, x2, x3;
+
+    read_res -> best_candidate = NULL;
+    ArrayList * entry_tab = ArrayListCreate(100); // [ LRMread_final_candidate_t, ... ]
+    ArrayListSetDeallocationFunction(entry_tab, free);
+
+    // ---- Stage 1: attach every usable segment candidate to a chain ----
+    for(x1=0;x1<iteration_context->total_segments;x1++){
+        int segment_head_moved = iteration_context -> segment_texts [x1] - iteration_context -> read_text;
+        int cond_found = 0;
+        for(x2 = 0; x2 < LRMMERGING_MAX_CANDIDATES; x2++){
+            int seg_cand_votes = iteration_context -> segment_best_votes[x1][x2];
+            if(seg_cand_votes < 1) break;
+            unsigned int seg_cand_segpos = iteration_context -> segment_best_pos[x1][x2];
+            int seg_cand_masks = iteration_context -> segment_best_masks[x1][x2], is_closely_related = 0;
+
+            // Skip a candidate when another slot of the same segment lies
+            // shortly before it on the chromosome.
+            for(x3 = 0; x3 < LRMMERGING_MAX_CANDIDATES; x3++){
+                long long int delta = iteration_context -> segment_best_pos[x1][x3];
+                delta -= seg_cand_segpos;
+                if(delta < 0 && llabs(delta)< LRMREAD_RESULT_MERGE_TOLERANCE) is_closely_related = 1;
+            }
+
+            if(is_closely_related) continue;
+
+            if(0){
+                char postxt[100];
+                LRMpos2txt(context, seg_cand_segpos, postxt);
+                LRMprintf("Trying %s [seg %d] [cand %d] : pos=%s, votes=%d\n", iteration_context -> read_name, x1, x2, postxt, seg_cand_votes);
+            }
+
+            unsigned int seg_pos_moved = seg_cand_segpos - segment_head_moved, cani;
+
+            for(cani = 0; cani < entry_tab -> numOfElements; cani++){
+                long long min_positive_dist = 0x7fffffff;
+                LRMread_final_candidate_t * selected_cand_list = NULL;
+                LRMread_final_candidate_t * cand_rec = ArrayListGet(entry_tab, cani);
+
+                // Distance to the chain's last segment, or to the one before
+                // it when the last one belongs to this very segment.
+                long long dist0 = seg_cand_segpos ;
+                if(cand_rec -> segment_id[cand_rec -> segment_number-1] < x1)dist0 -= cand_rec -> last_seg_start_pos;
+                else dist0 -= cand_rec -> second_last_seg_start_pos;
+
+
+                if( cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) dist0 = -dist0;
+                if(0){
+                    char postxt[100];
+                    LRMpos2txt(context, cand_rec -> last_seg_start_pos, postxt);
+                    LRMprintf(" Old Cand #%p : Old pos=%s, votes=%d, dist=%lld, masks=%d, %d\n", cand_rec, postxt, seg_cand_votes, dist0, seg_cand_masks, cand_rec -> masks );
+                }
+
+                // Same strand, non-negative distance within tolerance, and
+                // either a new segment or a closer hit for the same segment.
+                if(( seg_cand_masks & LRMIS_NEGATIVE_STRAND ) == ( cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) && dist0 < LRMREAD_RESULT_MERGE_TOLERANCE && dist0 >= 0 && (cand_rec -> segment_id[cand_rec -> segment_number-1] < x1 || ( cand_rec -> segment_id[cand_rec -> segment_number-1] == x1 && cand_rec -> last_distance > dist0 )) && dist0 < min_positive_dist){
+                    cond_found = 1;
+                    selected_cand_list = cand_rec;
+                    min_positive_dist = dist0;
+                }
+
+                if(selected_cand_list){
+                    if(selected_cand_list -> segment_id[selected_cand_list -> segment_number-1] == x1){
+                        // Replace the chain's last member: undo its contribution first.
+                        selected_cand_list -> segment_number --;
+                        selected_cand_list -> votes -= selected_cand_list -> last_votes;
+                    }else selected_cand_list -> second_last_seg_start_pos = selected_cand_list -> last_seg_start_pos;
+
+                    selected_cand_list -> votes += seg_cand_votes;
+                    selected_cand_list -> segment_id[selected_cand_list -> segment_number] = x1;
+                    selected_cand_list -> realign_cand_id[selected_cand_list -> segment_number] = x2;
+                    selected_cand_list -> segment_number ++;
+                    assert(selected_cand_list -> segment_number <= LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_LENGTH);
+                    selected_cand_list -> last_seg_start_pos = seg_cand_segpos;
+                    selected_cand_list -> last_votes = seg_cand_votes;
+                    selected_cand_list -> last_distance = min_positive_dist;
+                    if(0) LRMprintf(" Added to Old Cand #%p: Seg[%d] %d (V=%d); BEST CAND : %p (V=%d)\n", selected_cand_list, selected_cand_list -> segment_number - 1, x1, selected_cand_list -> votes, read_res -> best_candidate, read_res -> votes);
+                    update_read_res;
+                }
+                if(cond_found)break;
+            }
+
+            // No chain accepted the candidate: it starts a chain of its own.
+            if(!cond_found){
+                LRMread_final_candidate_t * selected_cand_list = malloc(sizeof(LRMread_final_candidate_t));
+                memset(selected_cand_list, 0, sizeof(LRMread_final_candidate_t));
+                selected_cand_list -> read_head_pos = seg_pos_moved;
+                selected_cand_list -> votes = seg_cand_votes;
+                selected_cand_list -> masks = seg_cand_masks;
+                selected_cand_list -> segment_id[0] = x1;
+                selected_cand_list -> realign_cand_id[0] = x2;
+                selected_cand_list -> segment_number = 1;
+                selected_cand_list -> last_seg_start_pos = seg_cand_segpos;
+                selected_cand_list -> second_last_seg_start_pos = 0;
+                selected_cand_list -> last_votes = seg_cand_votes;
+                selected_cand_list -> last_distance = 0;
+
+                ArrayListPush(entry_tab, selected_cand_list);
+                update_read_res;
+            }
+        }
+    }
+
+    //LRMprintf("FINAL BEST CANDS : %p\n", read_res -> best_candidate);
+
+    // ---- Stage 2: build the merged CIGAR of the winning chain ----
+    if(read_res -> best_candidate){
+        LRMread_final_candidate_t * best_cand_rec = read_res -> best_candidate;
+        long long final_mapping_pos = -1;
+        long long merged_chro_cursor = -1, merged_read_cursor = -1;
+        char * target_cigar = iteration_context -> merged_cigar;
+        int target_cigar_ptr = 0, last_seg_last_base_read = -1;
+        long long last_seg_last_base_chro = -1;
+
+        memset(target_cigar, 0, LRMMERGE_CIGAR_SIZE+1);
+
+        // Work in the orientation of the chosen strand; undone before leaving.
+        if(best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) LRMreverse_read(iteration_context -> read_text, iteration_context -> read_length);
+
+
+        // remove "S" sections from the middle parts
+
+        for(x1 = 0; x1 < best_cand_rec -> segment_number ; x1 ++){
+            // Negative-strand chains are visited from their last member to their first.
+            int this_seg_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> segment_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> segment_id[x1];
+            int this_segcand_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> realign_cand_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> realign_cand_id[x1];
+            int cigar_max = strlen( iteration_context -> segment_cigars[this_seg_id][this_segcand_id]) + 1;
+            cigar_max = min( LRMMERGE_CIGAR_SIZE, cigar_max );
+            char * new_cigar = malloc(cigar_max+1);
+            new_cigar[0]=0;
+            int new_cigar_ptr = 0;
+            int cci = 0, tmpi = 0, nch, is_first_section = 1;
+
+            while(0!=(nch = iteration_context -> segment_cigars[this_seg_id][this_segcand_id] [cci++])){
+                if(nch >='0' && nch <='9')
+                    tmpi = tmpi*10+(nch - '0');
+                else{
+                    if(nch == 'S'){
+                        // A clipped section shrinks the segment; a leading
+                        // one also moves the segment start and its position.
+                        if(is_first_section == ((best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND)?0:1))
+                            iteration_context -> segment_texts[this_seg_id] += tmpi;
+                        if(is_first_section)
+                            iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] += tmpi;
+                        iteration_context -> segment_lengths[this_seg_id] -= tmpi;
+                    }else{
+                        //LRMprintf("CIGAR OLD:%s, NEW: %p, LEN=%d NCH='%c'(%d) PTR SIZE=%d\n", iteration_context -> segment_cigars[this_seg_id][this_segcand_id], new_cigar, tmpi, nch, nch, cigar_max - new_cigar_ptr);
+                        //fflush(stderr);
+                        new_cigar_ptr += snprintf(new_cigar+new_cigar_ptr, cigar_max - new_cigar_ptr, "%d%c", tmpi, nch);
+                        // LRMprintf("CIGAR NEW: %d + %s\n\n", new_cigar_ptr, new_cigar);
+                        //fflush(stderr);
+                    }
+                    is_first_section = 0;
+                    tmpi=0;
+                }
+            }
+            strncpy(iteration_context -> segment_cigars[this_seg_id][this_segcand_id], new_cigar, LRMSEGMENT_CIGAR_SIZE);
+            free(new_cigar);
+        }
+
+
+        long long validateCigar_cursor = -1;
+        int written_read_cursor = 0;
+        for(x1 = 0; x1 < best_cand_rec -> segment_number ; x1 ++){
+            int is_first_section = 1;
+            int this_seg_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> segment_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> segment_id[x1];
+            int this_segcand_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> realign_cand_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> realign_cand_id[x1];
+
+            if(1){
+                int this_start_offset = iteration_context -> segment_texts[this_seg_id] - iteration_context -> read_text;
+
+                assert(iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] >=this_start_offset);
+
+                if(best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ){
+                    // On the negative strand the offset is counted from the other end of the read.
+                    int R_pos = iteration_context -> read_length - (iteration_context -> segment_texts[this_seg_id] - iteration_context -> read_text + iteration_context -> segment_lengths[this_seg_id]);
+                    assert(iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] >=R_pos);
+                    this_start_offset = R_pos;
+                }
+                if(validateCigar_cursor<0) validateCigar_cursor = this_start_offset;
+
+                // The first segment fixes the read's mapping position and the leading clip.
+                if(final_mapping_pos < 0){
+                    if(this_start_offset > 0)if(target_cigar_ptr < LRMMERGE_CIGAR_SIZE)target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - target_cigar_ptr, "%dS", this_start_offset);
+                    final_mapping_pos = iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] - this_start_offset;
+                }
+                merged_chro_cursor = iteration_context -> segment_best_pos[this_seg_id][this_segcand_id];
+                merged_read_cursor = this_start_offset;
+
+                if(1){
+                    char postxt[100];
+                    LRMpos2txt(context, iteration_context -> segment_best_pos[this_seg_id][this_segcand_id], postxt );
+                    LRMprintf("FINAL MERGING : %s [%d / %d] ; CHRO POS = %s (%s) ; READ POS = %d ; CIGAR = %s\n", iteration_context -> read_name, this_seg_id, iteration_context -> total_segments, postxt, (best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND)?"NEG":"POS", this_start_offset, iteration_context -> segment_cigars[this_seg_id][this_segcand_id] );
+                }
+
+                if(1){
+                    long long chro_pos_from_new = LRMcalculate_written_chro_pos(context, thread_context, iteration_context, written_read_cursor, merged_read_cursor, iteration_context -> segment_cigars[this_seg_id][this_segcand_id], merged_chro_cursor);
+                    if(chro_pos_from_new < 0) continue;
+
+                    // Chromosome gap between the previous segment's end and this segment.
+                    long long delta = 0;
+
+                    if(last_seg_last_base_chro>0){
+                        delta =chro_pos_from_new - last_seg_last_base_chro;
+                        if(this_start_offset > last_seg_last_base_read) {
+                            if(target_cigar_ptr < LRMMERGE_CIGAR_SIZE)target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - target_cigar_ptr, "%dM", this_start_offset - last_seg_last_base_read);
+                            delta -= this_start_offset - last_seg_last_base_read;
+                            written_read_cursor = this_start_offset;
+                        }
+                    }
+
+                    // The dynamic-programming gap filler is switched off by the leading 0.
+                    if(0&&llabs(delta) <= context -> max_indel_length && LRMDYNAMIC_MAXIMUM_GAP_LENGTH -1 > this_start_offset - last_seg_last_base_read - min(0, delta)){
+                        unsigned int total_mismatched_bases = 0;
+                        int move_i, moves = LRMindel_dynamic_search(context, thread_context,-(int)delta, last_seg_last_base_chro, iteration_context -> read_text , last_seg_last_base_read, this_start_offset , &total_mismatched_bases);
+                        if(moves > 0){
+                            int tmpi = 0;
+                            for(move_i = 0; move_i < moves; move_i++){
+                                tmpi ++;
+                                char nch = thread_context -> dynamic_programming_indel_movement_buf[move_i];
+                                nch =(nch == 'X'?'M':nch);
+                                char nnh = thread_context -> dynamic_programming_indel_movement_buf[move_i + 1];
+                                nnh =(nnh == 'X'?'M':nnh);
+                                if(nnh != nch){
+                                    if(target_cigar_ptr < LRMMERGE_CIGAR_SIZE)target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - target_cigar_ptr, "%d%c", tmpi, nch);
+                                    tmpi = 0;
+                                }
+                            }
+                        }
+                    }else{
+                        if(delta){
+                            // A positive gap is written as 'N', a negative one as 'I'.
+                            if(target_cigar_ptr < LRMMERGE_CIGAR_SIZE)target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - target_cigar_ptr, "%lld%c" , llabs(delta), delta > 0?'N':'I' );
+                            if(delta < 0)
+                                written_read_cursor += llabs(delta);
+                        }
+                    }
+                }
+
+                if(merged_chro_cursor >= 0){
+                    // Append the part of this segment's CIGAR that has not been written yet.
+                    int cci = 0, tmpi = 0, nch;
+                    while(0!=(nch = iteration_context -> segment_cigars[this_seg_id][this_segcand_id] [cci++])){
+                        if(nch >='0' && nch <='9')
+                            tmpi = tmpi*10+(nch - '0');
+                        else{
+                            if(nch == 'M' || nch == 'S' || nch == 'D' || nch == 'N')merged_chro_cursor += tmpi;
+                            if(nch == 'M' || nch == 'S' || nch == 'I') merged_read_cursor += tmpi;
+
+                            if(written_read_cursor <= merged_read_cursor){
+                                int writting_optlen = tmpi;
+                                if(nch == 'M' || nch == 'S' || nch == 'I') writting_optlen = (merged_read_cursor - written_read_cursor);
+                                if(target_cigar_ptr < LRMMERGE_CIGAR_SIZE)target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - target_cigar_ptr, "%d%c" , writting_optlen, nch);
+                                written_read_cursor = merged_read_cursor;
+                            }
+                            tmpi = 0;
+                            is_first_section = 0;
+                        }
+                    }
+                    last_seg_last_base_read = merged_read_cursor;
+                    last_seg_last_base_chro = merged_chro_cursor;
+                }
+            }
+        }
+        // Whatever follows the last chained segment becomes a trailing clip.
+        if(last_seg_last_base_read < iteration_context -> read_length){
+            if(target_cigar_ptr < LRMMERGE_CIGAR_SIZE)target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - target_cigar_ptr, "%dS", iteration_context -> read_length - last_seg_last_base_read );
+        }
+        if(best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) LRMreverse_read(iteration_context -> read_text, iteration_context -> read_length);
+
+        iteration_context -> merged_position = final_mapping_pos;
+        iteration_context -> merged_masks = best_cand_rec -> masks;
+        if(1){
+            char postxt[100];
+            int mapped_length = 0;
+            LRMpos2txt(context , final_mapping_pos, postxt);
+            LRMprintf("\nFINAL READ %s to %s (%s)\n", iteration_context -> read_name, postxt, best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND?"NEG":"POS");
+            int matched_bases = LRMvalidate_mapping(context , iteration_context -> read_text, target_cigar, &context -> current_base_index, final_mapping_pos, best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND, &mapped_length , 1);
+            LRMprintf("Matched %d in %d : %s\n", matched_bases, mapped_length, target_cigar);
+            LRMprintf("\n\n");
+        }
+    } // end : if the best candidate list is not NULL
+
+    ArrayListDestroy(entry_tab);
+    return 0;
+}
+
+
+void LRMdo_one_realign_read(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context){ /*
+ * Realignment step for one read.
+ * 1. Every voted candidate of every segment is realigned with
+ *    LRMrealign_one_segment(); per segment the best-scoring results are kept
+ *    in the segment_best_* tables (LRMMERGING_MAX_CANDIDATES slots, best
+ *    score first).
+ * 2. LRMread_final_result_merge() chains the kept results into one location
+ *    and CIGAR for the whole read.
+ * 3. The record is passed to LRMwrite_chunk_add_buffered_output(): flags 0 or
+ *    16 when read_res -> votes is positive, otherwise flags 4 with "*" as
+ *    chromosome name and position 0.
+ */
+ int flags=4; // value used when the read ends up without votes
+ char * chro_name="*";
+ int chro_pos = 0;
+ int map_quality = 0;
+ int mis_matched = 0; // never changed; passed on as is
+ int seg_i;
+
+ LRMread_mapping_result_t * read_res = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+ memset(iteration_context -> segment_best_candidate_score, 0, sizeof(int) * LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_LENGTH * LRMMERGING_MAX_CANDIDATES); // NOTE(review): only the score table is cleared here; segment_best_pos / _votes / _masks / segment_cigars are not reset in this function
+
+ for(seg_i = 0; seg_i < iteration_context -> total_segments ; seg_i++){
+ LRMrealign_context_t realign_context;
+ int * this_best_scores = iteration_context -> segment_best_candidate_score[seg_i];
+
+ int cand_i;
+ for(cand_i = 0; cand_i < LRMSEGMENT_MAX_CANDIDATES; cand_i ++){
+ LRMsegment_mapping_candidate_t * cand_res = read_res -> segment_results[seg_i].candidates + cand_i;
+
+ if(0){
+ char postxt[100];
+ LRMpos2txt(context, cand_res -> first_base_position , postxt);
+ LRMprintf("TRY REALIGN READ %s [%d , %d] : V=%d ; POS=%s (%s)\n", iteration_context -> read_name, seg_i, cand_i, cand_res -> votes, postxt, (cand_res -> masks & LRMIS_NEGATIVE_STRAND)?"NEG":"POS");
+ }
+ if(cand_res -> votes < 1) break; // the first slot without votes ends the scan of this segment
+
+ memset(&realign_context, 0, sizeof(LRMrealign_context_t));
+ realign_context.current_segment_id = seg_i;
+ realign_context.current_candidate_id = cand_i;
+ LRMrealign_one_segment(context, thread_context, iteration_context, &realign_context);
+ if(realign_context.best_stack_score[0]+realign_context.best_stack_score[1] > this_best_scores[LRMMERGING_MAX_CANDIDATES-1]){ // the summed score must beat the lowest kept score
+ int replace_i, replace_index = LRMMERGING_MAX_CANDIDATES-1, is_repeated = 0;
+ for(replace_i = LRMMERGING_MAX_CANDIDATES-1; replace_i >= 0; replace_i --){ // scan from the last slot to the first one
+ if(realign_context.best_stack_score[0]+realign_context.best_stack_score[1] > this_best_scores[replace_i])
+ replace_index = replace_i; // ends as the lowest-numbered slot whose score is beaten
+ if(iteration_context -> segment_best_pos[seg_i][replace_i] == realign_context.best_chro_pos) is_repeated = 1; // same position already kept in some slot
+ }
+
+ if(0 == is_repeated){
+ for(replace_i = LRMMERGING_MAX_CANDIDATES - 2 ; replace_i >= replace_index; replace_i--){ // shift the slots from replace_index onwards one place down; the last one is lost
+ strncpy(iteration_context -> segment_cigars[seg_i][replace_i+1], iteration_context -> segment_cigars[seg_i][replace_i], LRMSEGMENT_CIGAR_SIZE);
+ iteration_context -> segment_best_pos[seg_i][replace_i+1] = iteration_context -> segment_best_pos[seg_i][replace_i];
+ iteration_context -> segment_best_masks[seg_i][replace_i+1] = iteration_context -> segment_best_masks[seg_i][replace_i];
+ iteration_context -> segment_best_candidate_score[seg_i][replace_i+1] = iteration_context -> segment_best_candidate_score[seg_i][replace_i];
+ iteration_context -> segment_best_votes[seg_i][replace_i+1] = iteration_context -> segment_best_votes[seg_i][replace_i];
+ }
+
+ strncpy(iteration_context -> segment_cigars[seg_i][replace_index], realign_context.best_cigar, LRMSEGMENT_CIGAR_SIZE); // store the new result in the freed slot
+ iteration_context -> segment_best_pos[seg_i][replace_index] = realign_context.best_chro_pos;
+ iteration_context -> segment_best_masks[seg_i][replace_index] = cand_res -> masks;
+ iteration_context -> segment_best_candidate_score[seg_i][replace_index] = realign_context.best_stack_score[0]+realign_context.best_stack_score[1] ;
+ iteration_context -> segment_best_votes[seg_i][replace_index] = cand_res -> votes;
+ }
+ }
+ }
+ }
+
+ strcpy(iteration_context -> merged_cigar, "*"); // default CIGAR; replaced by the merge step when a best candidate exists
+ LRMread_final_result_merge(context, thread_context, iteration_context, read_res);
+
+ if(read_res -> votes > 0){
+ LRMlocate_gene_position(context, iteration_context -> merged_position, &chro_name, & chro_pos);
+ //fprintf(stderr, "RELA: %u => rv=%d\n", read_res -> final_pos, rv);
+ map_quality = 10;// read_res -> votes + 10;
+ if( iteration_context -> merged_masks & LRMIS_NEGATIVE_STRAND){
+ flags=16; // negative strand: the read and its qualities are reversed before output
+ LRMreverse_read_and_qual(context, thread_context, iteration_context);
+ }else flags = 0;
+ }else flags = 4;
+
+ LRMfix_cigar(context, thread_context, iteration_context, iteration_context -> merged_cigar);
+
+ LRMwrite_chunk_add_buffered_output(context, thread_context, iteration_context, flags, chro_name, chro_pos, map_quality, iteration_context -> merged_cigar, mis_matched);
+}
+
+// Worker loop of one thread for one processing step of the current chunk.
+//
+//   context   : shared mapping context; thread_contexts[thread_id] is used.
+//   thread_id : index of the calling worker thread.
+//   task      : LRMRUNNING_STEP_VOTING or LRMRUNNING_STEP_REALIGN; any other
+//               value trips an assertion.
+//   Returns 0 after LRMfetch_next_read() reports that no read is left, or 1
+//   when the per-thread iteration context cannot be allocated.
+//
+// Reads are fetched one by one, split into segments and handed to the step
+// that `task` selects. A progress line is printed for every read whose number
+// in the chunk is a multiple of 2000.
+int LRMchunk_read_iteration(LRMcontext_t * context, int thread_id, int task){
+    LRMthread_context_t * thread_context = context -> thread_contexts+ thread_id;
+
+    // calloc() delivers the zero-filled block that malloc() + memset() used to
+    // produce, and its result is checked before the first access.
+    LRMread_iteration_context_t * iteration_context = calloc(1, sizeof(LRMread_iteration_context_t));
+    //LRMprintf(" ============ LITR_CONTEXT PTR=%p, SIZE=%lld \n", iteration_context, sizeof(LRMread_iteration_context_t));
+    if(NULL == iteration_context){
+        LRMprintf("ERROR: out of memory when allocating the read iteration context.\n");
+        return 1;
+    }
+
+    while(1){
+        int retv = LRMfetch_next_read(context, thread_context, &iteration_context-> read_length, iteration_context->read_name, iteration_context->read_text, iteration_context->qual_text, &iteration_context -> read_no_in_chunk);
+        if(retv) break;
+
+        LRMsplit_read_to_segments(context, iteration_context);
+        if(task==LRMRUNNING_STEP_VOTING){
+            LRMdo_one_voting_read(context, thread_context, iteration_context);
+        }else if(task==LRMRUNNING_STEP_REALIGN){
+            LRMdo_one_realign_read(context, thread_context, iteration_context);
+        }else assert(0);
+
+        if(iteration_context -> read_no_in_chunk % 2000 == 0){
+            LRMprintf("Processing %d-th read for task %d; used %.1f minutes\n", context -> all_processed_reads + iteration_context -> read_no_in_chunk, task, (LRMmiltime() - context -> start_running_time)/60);
+        }
+
+        //LRMprintf("R:%s, T:%s\n", iteration_context -> read_name, iteration_context -> read_text);
+    }
+    free(iteration_context);
+    return 0;
+}
+
+// Adds the number of reads handled in the finished chunk to the running total
+// kept in the context. Always returns 0.
+int LRMfinalise_chunk_reads(LRMcontext_t* context){
+    context -> all_processed_reads = context -> all_processed_reads + context -> processed_reads_in_chunk;
+    return 0;
+}
+
+// Prefix comparison: returns 0 when `rname` begins with every character of
+// `fixed_len`, and 1 at the first character that differs. Only as many
+// characters as `fixed_len` holds are inspected, so an empty `fixed_len`
+// matches anything.
+int FIXLENstrcmp(char * fixed_len, char * rname){
+    char * expected = fixed_len;
+    char * actual = rname;
+
+    while(*expected){
+        if(*actual != *expected) return 1;
+        expected ++;
+        actual ++;
+    }
+    return 0;
+}
+
diff --git a/src/longread-mapping/longread-mapping.c b/src/longread-mapping/longread-mapping.c
new file mode 100644
index 0000000..241761a
--- /dev/null
+++ b/src/longread-mapping/longread-mapping.c
@@ -0,0 +1,2012 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <time.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sys/timeb.h>
+#include <sys/stat.h>
+#include <locale.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "../subread.h"
+#include "LRMconfig.h"
+#include "LRMhelper.h"
+#include "LRMsorted-hashtable.h"
+#include "LRMbase-index.h"
+#include "LRMchro-event.h"
+#include "LRMfile-io.h"
+
+
+// Program entry: runs the pipeline stages in order; the first stage that reports failure stops all later ones.
+int main(int argc, char ** argv){
+    LRMcontext_t *context = NULL;
+    int failed = 0; // 0 while every stage so far succeeded, 1 once any stage returned non-zero
+
+    if(!failed) failed = LRMvalidate_and_init_context(&context, argc, argv) != 0;
+    if(!failed) failed = LRMshow_conf(context) != 0;
+    if(!failed) failed = LRMrun_task(context) != 0;
+    if(!failed) failed = LRMfinalise(context) != 0;
+    if(!failed) failed = LRMprint_mapping_summary(context) != 0;
+    if(!failed) failed = LRMdestroy_context(context) != 0; // note: the context is not destroyed after an earlier failure
+
+    return failed;
+}
+
+int LRMprint_mapping_summary(LRMcontext_t * context){ // Prints the final read counts, the mapped percentage and the elapsed minutes; always returns 0.
+    LRMprintf("\n\nAll finished.\n\nTotal processed reads : %d\n", context -> all_processed_reads);
+    // With no processed read the ratio would be 0/0 and print as "nan%"; report 0.0% instead.
+    LRMprintf("Mapped reads: %u (%.1f%%)\n", context->mapped_reads, context->all_processed_reads ? context->mapped_reads*100./context->all_processed_reads : 0.);
+    LRMprintf("Time: %.1f minutes\n\n" , (LRMmiltime() - context->start_running_time)/60);
+    return 0; }
+
+void LRMprint_version(){ // Prints "Version <SUBREAD_VERSION>" preceded by a line break, then an empty string through LRMputs.
+ LRMprintf("\nVersion %s\n", SUBREAD_VERSION);
+ LRMputs(""); // presumably just ends the block with a blank line — LRMputs is defined elsewhere
+}
+void LRMprint_usage(){ // Prints the version banner followed by the command-line help text.
+ LRMprint_version();
+ LRMputs("Usage:");
+ LRMputs("");
+ LRMputs("./sublong [options] -i <index_name> -r <input> -o <output>");
+ LRMputs("");
+ LRMputs("## Mandatory arguments:"); // -i, -r and -o; the option parser refuses to run without all three
+ LRMputs("");
+ LRMputs(" -i <string> Base name of the index. The index must be built as a full index.");
+ LRMputs("");
+ LRMputs(" -r <string> Name of an input read file. Acceptable formats include gzipped");
+ LRMputs(" FASTQ, FASTQ and FASTA (automatically identified). The quality");
+ LRMputs(" scores should be in the Phred 33 format for the FASTQ or gzipped");
+ LRMputs(" FASTQ formats.");
+ LRMputs("");
+ LRMputs(" -o <string> Name of an output file. By default, the output is in BAM format.");
+ LRMputs("");
+ LRMputs("## Optional arguments:"); // only the user-facing options are listed; the parser accepts a few more (B, M, P, O, j, q)
+ LRMputs("# input reads and output");
+ LRMputs("");
+ LRMputs(" --SAMoutput Save mapping results in SAM format.");
+ LRMputs("");
+ LRMputs("# thresholds for mapping");
+ LRMputs("");
+ LRMputs(" -n <int> Number of selected subreads in a readlet, 85 by default.");
+ LRMputs("");
+ LRMputs(" -m <int> Consensus threshold for mapping a readlet. 1 by default.");
+ LRMputs("");
+ LRMputs(" -X <int> Maximum number of mis-matched bases allowed in each subread.");
+ LRMputs(" 0 by default.");
+ LRMputs("");
+ LRMputs("# number of CPU threads");
+ LRMputs("");
+ LRMputs(" -T <int> Number of CPU threads used. 1 by default.");
+ LRMputs("");
+ LRMputs("# others");
+ LRMputs("");
+ LRMputs(" -v Output version of the program.");
+ LRMputs("");
+ LRMputs("Refer to Users Manual for detailed description to the arguments.");
+ LRMputs("");
+}
+
+
+static struct option long_options[] ={ // long-only options; with a null flag and val 0, getopt_long() returns 0 when one of them is matched
+    {"SAMoutput", no_argument, NULL, 0},
+    {NULL, 0, NULL, 0} // all-zero entry terminates the table
+};
+
+// Copies an option value into a fixed-size text field; returns 0 on success, or prints a message and returns 1 when the value does not fit.
+static int LRMcopy_option_text(char * dest, size_t dest_size, const char * text, const char * what){
+    int needed = snprintf(dest, dest_size, "%s", text);
+    if(needed >= 0 && (size_t)needed < dest_size) return 0;
+    LRMprintf("\nThe %s is too long.\n", what);
+    return 1;
+}
+
+// Allocates *context, parses the command line into it, checks that the index is a full index and opens the input and output files.
+// Returns 0 when the context is ready to run; non-zero after -v, a usage error or a resource failure (the context must then not be used).
+int LRMvalidate_and_init_context(LRMcontext_t ** context, int argc, char ** argv){
+    int c;
+
+    (*context) = calloc(1, sizeof(LRMcontext_t)); // zero-filled: the file name and index prefix fields start as empty strings
+    if((*context) == NULL){ LRMprintf("\nUnable to allocate the mapping context.\n"); return 1; }
+    LRMset_default_values_context(*context);
+
+    optind = 0;
+    opterr = 1;
+    optopt = 63;
+    int option_index = 0;
+    while ((c = getopt_long (argc, argv, "B:Mr:i:o:T:P:m:O:X:n:jqv", long_options, &option_index))!=-1){
+        switch(c){
+            case 'M':
+                (*context) -> unique_only = 0;
+                break;
+            case 'B':
+                (*context) -> max_best_alignments = atoi(optarg);
+                (*context) -> max_best_alignments = min((*context) -> max_best_alignments, LRMMAX_MULTI_BEST );
+                (*context) -> unique_only = 0;
+                break;
+            case 'P': // not useable now!
+                (*context) -> is_Phred_64=(optarg[0]=='6');
+                break;
+            case 'j': // not useable now.
+                (*context) -> do_junction_detection = 1;
+                (*context) -> result_merge_tolerance = 500000;
+                break;
+            case 'q': // not for end users.
+                (*context) -> show_read_validation = 1;
+                break;
+            case 'r': // the three name fields are char arrays inside the context, so sizeof gives their capacity
+                if(LRMcopy_option_text((*context) -> input_file_name, sizeof((*context) -> input_file_name), optarg, "input file name")) return 1;
+                break;
+            case 'i':
+                if(LRMcopy_option_text((*context) -> index_prefix, sizeof((*context) -> index_prefix), optarg, "index name")) return 1;
+                break;
+            case 'o':
+                if(LRMcopy_option_text((*context) -> output_file_name, sizeof((*context) -> output_file_name), optarg, "output file name")) return 1;
+                break;
+            case 'T':
+                (*context) -> threads = min(max(1,atoi(optarg)),LRMMAX_THREADS);
+                break;
+            case 'n':
+                (*context) -> max_subreads_per_segment = atoi(optarg);
+                assert( (*context) -> max_subreads_per_segment< LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT );
+                break;
+            case 'X':
+                (*context) -> max_mismatched_bases_in_subread = atoi(optarg);
+                assert((*context) -> max_mismatched_bases_in_subread <3);
+                break;
+            case 'O': // not useable now.
+                (*context) -> segment_overlapping = atoi(optarg);
+                break;
+            case 'm':
+                (*context) -> min_voting_number = atoi(optarg);
+                break;
+            case 'v':
+                LRMprint_version();
+                return 1;
+            case 0: // a long option from long_options[]
+                if(strcmp("SAMoutput", long_options[option_index].name)==0) {
+                    (*context) -> is_SAM_output = 1;
+                }
+                break;
+            case '?':
+            default:
+                LRMprint_usage();
+                return 1;
+        }
+    }
+
+    if((*context) -> input_file_name[0] ==0 || (*context) -> output_file_name[0]==0 || (*context) -> index_prefix[0]==0){
+        LRMprint_usage();
+        if(!((*context) -> input_file_name[0] ==0 && (*context) -> output_file_name[0]==0 && (*context) -> index_prefix[0]==0)){
+            LRMprintf("Please specify the input, output files and the index.\n");
+        }
+        return 1;
+    }
+
+    int index_gap = 99999;
+    int retv = LRMgehash_load_option((*context) -> index_prefix, LRMSUBREAD_INDEX_OPTION_INDEX_GAP , &index_gap);
+    if(retv<0){
+        LRMprintf("\nUnable to find the index.\n\n");
+        return 1;
+    }else retv = 0;
+    if(index_gap != 1){
+        LRMprintf("\nPlease build the index as a full index.\n\n");
+        return 1;
+    }
+
+    (*context) -> user_command_line[0]=0; // rebuilt below as the quoted arguments, cut off at the field's capacity instead of overflowing it
+    for(c = 0; c<argc;c++){
+        size_t used = strlen((*context) -> user_command_line);
+        snprintf((*context) -> user_command_line + used, sizeof((*context) -> user_command_line) - used, "\"%s\" ", argv[c]);
+    }
+
+    (*context) -> max_cigar_opts_in_read = (*context) -> is_SAM_output? LRMMAX_CIGAR_OPTS_IN_SAM_READ : LRMMAX_CIGAR_OPTS_IN_BAM_READ;
+    LRMthread_lockinit(&(*context) -> input_lock);
+    LRMthread_lockinit(&(*context) -> sam_bam_file_lock);
+
+    (*context)-> sam_bam_chromosome_table = HashTableCreate(199);
+    HashTableSetKeyComparisonFunction((*context)-> sam_bam_chromosome_table, LRMhash_strcmp);
+    HashTableSetHashFunction((*context)-> sam_bam_chromosome_table, LRMhash_strhash);
+    HashTableSetDeallocationFunctions((*context)-> sam_bam_chromosome_table, NULL, NULL);
+
+    (*context)-> chromosome_size_list = ArrayListCreate(29);
+    (*context)-> chromosome_size_table = HashTableCreate(199);
+    HashTableSetKeyComparisonFunction((*context)-> chromosome_size_table, LRMhash_strcmp);
+    HashTableSetHashFunction((*context)-> chromosome_size_table, LRMhash_strhash);
+    HashTableSetDeallocationFunctions((*context)-> chromosome_size_table, free, NULL);
+    (*context) -> sam_bam_chromosome_list = ArrayListCreate(29);
+    LRMload_offsets(*context);
+
+    retv = LRMgeinput_open((*context)->input_file_name,&(*context) -> input_file);
+    if(retv) LRMprintf("\nUnable to open the input file.\n");
+    (*context)->sam_bam_file = fopen( (*context) -> output_file_name, "w");
+    if(NULL == (*context)->sam_bam_file){ LRMprintf("\nUnable to open the output file.\n"); retv = 1; }
+
+    (*context)->event_space = malloc(sizeof(LRMevent_t)*20000);
+    if(NULL == (*context)->event_space){ LRMprintf("\nUnable to allocate the event space.\n"); retv = 1; }
+    (*context)->event_space_size = 20000;
+    LRMthread_lockinit(&(*context) -> event_space_lock);
+    (*context)->events_realignment = HashTableCreate(320000);
+    assert(LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT >= (*context)->max_subreads_per_segment );
+    return retv;
+}
+
+
+// Returns the current wall-clock time as seconds since the Epoch, including the sub-second part.
+// The callers visible here only use the difference between two calls to report elapsed minutes.
+//
+// gettimeofday() is used on every platform. The former non-FreeBSD branch called ftime() from
+// <sys/timeb.h>, which POSIX.1-2008 removed and which current C libraries flag as deprecated.
+// The second argument is NULL because the timezone structure is obsolete and was never read here.
+double LRMmiltime(){
+    struct timeval tp;
+    double ret;
+
+    if(gettimeofday(&tp, NULL) == 0){
+        // whole seconds plus microseconds scaled to seconds
+        ret = tp.tv_sec + 0.001*0.001* tp.tv_usec;
+    }else{
+        // Not expected with a valid buffer; fall back to whole seconds rather than return an
+        // indeterminate value.
+        ret = (double)time(NULL);
+    }
+
+    return ret;
+}
+
+
+// Writes the built-in defaults into the context and stamps the start time; the option parser may override them afterwards.
+void LRMset_default_values_context(LRMcontext_t * context){
+    context->start_running_time = LRMmiltime();
+    context->threads = 1;
+    context->unique_only = 1;
+    context->max_best_alignments = 1;
+    context->multi_best_read_alignments = 1;
+    context->max_subreads_per_segment = 85;
+    context->max_mismatched_bases_in_subread = 0;
+    context->min_voting_number = 1;
+    context->min_matched_bases = 40;
+    context->segment_overlapping = 20;
+    context->result_merge_tolerance = 15000;
+    context->max_read_indel_length = 0;
+    context->max_dynamic_indel_length = 5000;
+    context->max_junction_distance = 100000;
+    // Scores used by the dynamic programming step:
+    // a match earns 6, a mismatch 0, opening a gap costs 4 and each extension costs 1.
+    context->dynamic_programming_score_match = 6;
+    context->dynamic_programming_score_mismatch = 0;
+    context->dynamic_programming_score_create_gap = -4;
+    context->dynamic_programming_score_extend_gap = -1;
+}
+
+int LRMshow_conf(LRMcontext_t * context){ // Prints the banner and the effective threads, input, output (with format) and index; always returns 0.
+    const char * output_format = context->is_SAM_output ? "SAM" : "BAM";
+    LRMprintf("\n ====== Subread long read mapping ======\n\n");
+    LRMprintf("Threads: %d\n" , context->threads);
+    LRMprintf("Input file: %s\n" , context->input_file_name);
+    LRMprintf("Output file: %s (%s)\n" , context->output_file_name, output_format);
+    LRMprintf("Index: %s\n\n" , context->index_prefix);
+    return 0;
+}
+
+int LRMinit_chunk(LRMcontext_t * context){ // Zero-fills the per-read result table once some reads have already been processed; always returns 0.
+    if(context->all_processed_reads != 0) memset(context -> read_mapping_results, 0, LRMREADS_PER_CHUNK * sizeof(LRMread_mapping_result_t));
+    return 0;
+}
+
+// Loads the index, then handles the input chunk by chunk: each chunk is read twice, first for voting and, after rewinding, for realignment. Returns 0 on success, non-zero once any step fails.
+int LRMrun_task(LRMcontext_t * context){
+    int retv = LRMload_index( context );
+    if(!retv) LRMprintf("Index was loaded; the gap between subreads is %d bases\n", context -> current_index.index_gap );
+    while(!(retv ||LRMinput_has_finished( context ))){
+        retv=retv || LRMinit_chunk(context);
+        retv=retv || LRMsave_input_pos(context);
+        retv=retv || LRMiterate_reads( context , LRMRUNNING_STEP_VOTING);
+        retv=retv || LRMrewind_input_pos(context);
+        retv=retv || LRMiterate_reads( context , LRMRUNNING_STEP_REALIGN);
+        retv=retv || LRMfinalise_chunk_reads(context);
+    }
+    return retv;
+}
+
+int LRMfinalise(LRMcontext_t * context){ // Placeholder stage: does nothing with the context.
+ return 0; // always reports success
+}
+
+// Releases everything owned by the context, appends the BAM tail block when the output is BAM,
+// closes the input and output files and finally frees the context itself.
+// Returns 0 on success, 1 when writing the tail or closing the output file fails (the output file is then incomplete).
+int LRMdestroy_context(LRMcontext_t * context){
+    int retv = 0;
+    LRMgehash_destory(&(context -> current_index));
+    LRMgvindex_destory(&(context -> current_base_index));
+
+    HashTableDestroy(context -> chromosome_size_table);
+    ArrayListDestroy(context -> chromosome_size_list);
+    HashTableDestroy(context -> sam_bam_chromosome_table);
+    ArrayListDestroy(context -> sam_bam_chromosome_list);
+
+    HashTableSetDeallocationFunctions(context -> events_realignment, NULL, free);
+    HashTableDestroy(context -> events_realignment);
+    free(context -> event_space);
+
+    int readno;
+    for(readno = 0; readno < LRMREADS_PER_CHUNK; readno++)
+        free(context -> read_mapping_results[readno].segment_results); // free(NULL) is a no-op, so unused slots need no test
+
+    if(!context -> is_SAM_output){
+        size_t written = fwrite(context -> bam_file_tail_binary,1, context -> bam_file_tail_length, context->sam_bam_file);
+        if(written != (size_t)context -> bam_file_tail_length) retv = 1;
+    }
+
+    LRMgeinput_close(&context->input_file);
+    if(fclose(context->sam_bam_file) != 0) retv = 1; // fclose() flushes the remaining buffered output, so its failure means lost data
+    if(retv) LRMprintf("\nUnable to finish writing the output file.\n");
+    free(context);
+    return retv;
+}
+
+
+int LRMinput_has_finished( LRMcontext_t * context ){ // Reports the context's input_exhausted flag (non-zero stops the chunk loop in LRMrun_task).
+ return context -> input_exhausted ;
+}
+
+// Loads the two index files "<index_prefix>.00.b.tab" and "<index_prefix>.00.b.array". Returns 0 on success, non-zero when a file name does not fit the buffer or a load fails.
+int LRMload_index(LRMcontext_t * context){
+    char indextab_fname[LRMMAX_FILENAME_LENGTH];
+    int retv = 0, name_len; // name_len: what snprintf() needed; >= the buffer size means the name was cut short
+
+    name_len = snprintf(indextab_fname, sizeof(indextab_fname), "%s.00.b.tab", context -> index_prefix);
+    retv = retv || name_len < 0 || name_len >= (int)sizeof(indextab_fname) || LRMgehash_load(&(context -> current_index), indextab_fname);
+
+    name_len = snprintf(indextab_fname, sizeof(indextab_fname), "%s.00.b.array", context -> index_prefix);
+    retv = retv || name_len < 0 || name_len >= (int)sizeof(indextab_fname) || LRMgvindex_load(&(context -> current_base_index), indextab_fname);
+    return retv;
+}
+
+
+// Runs one pass of the given task over the current chunk: start the workers, wait for them, merge their results.
+int LRMiterate_reads( LRMcontext_t * context, int task ){ // returns 0 on success, 1 as soon as one stage reports failure
+    if(LRMstart_thread( context , task )) return 1;
+    if(LRMwait_threads( context )) return 1;
+    if(LRMmerge_threads( context, task )) return 1;
+    return 0;
+}
+
+// Thread entry point. args is a heap array of three pointers: [0] the context, [1] the thread id and [2] the task, the two integers being carried as pointer values.
+void * LRM_thread_runner (void * args){
+    void ** argv = args;
+    LRMcontext_t * context = argv[0];
+    // Recover the integers through intptr_t (provided here by <unistd.h>); subtracting NULL from a void pointer is not defined by ISO C.
+    int thid = (int)(intptr_t)argv[1];
+    int task = (int)(intptr_t)argv[2];
+    free(args); // the array was allocated by the starter for this thread only
+    LRMchunk_read_iteration(context, thid, task);
+    return NULL;
+}
+
+// Resets one thread's context and allocates its working buffers for the given step: the three
+// dynamic-programming buffers for both steps, plus the SAM/BAM output buffer for the realign step.
+// Returns 0 on success, 1 when any buffer cannot be allocated (nothing is left allocated in that case).
+int LRMstart_thread_init_context(LRMcontext_t * context, int thread_id, int step){
+    LRMthread_context_t * thread_context = context -> thread_contexts+thread_id;
+    memset(thread_context, 0, sizeof(LRMthread_context_t));
+    thread_context->thread_id = thread_id;
+    if(step != LRMRUNNING_STEP_VOTING && step != LRMRUNNING_STEP_REALIGN) return 0;
+
+    if(step == LRMRUNNING_STEP_VOTING && thread_id == 0) LRMsambam_write_header(context, thread_context); // only thread 0 writes the header, before its buffers exist
+    thread_context -> dynamic_programming_movement_buffer = malloc(( 2* LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE + 1) * LRMDYNAMIC_MAXIMUM_GAP_LENGTH);
+    thread_context -> dynamic_programming_score_buffer = malloc(sizeof(int) * ( 2 * LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE + 1) *( LRMDYNAMIC_MAXIMUM_GAP_LENGTH+1));
+    thread_context -> dynamic_programming_indel_movement_buf = malloc( max( LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15, 300 ) + context -> max_dynamic_indel_length + 1 );
+    if(step == LRMRUNNING_STEP_REALIGN){
+        thread_context -> out_SAMBAM_buffer = malloc(2400000);
+        thread_context -> out_buff_capacity = 2400000;
+    }
+
+    if(thread_context -> dynamic_programming_movement_buffer && thread_context -> dynamic_programming_score_buffer && thread_context -> dynamic_programming_indel_movement_buf && (step == LRMRUNNING_STEP_VOTING || thread_context -> out_SAMBAM_buffer)) return 0;
+    // At least one allocation failed: release whatever was obtained (the output buffer is still null in the voting step).
+    free(thread_context -> dynamic_programming_movement_buffer); free(thread_context -> dynamic_programming_score_buffer); free(thread_context -> dynamic_programming_indel_movement_buf); free(thread_context -> out_SAMBAM_buffer);
+    memset(thread_context, 0, sizeof(LRMthread_context_t)); // leave no dangling pointers behind
+    return 1;
+}
+
+// Prepares the context of every worker thread and launches it on the given task.
+// Returns 0 when all threads were launched, non-zero when a thread context or its argument block could not be set up;
+// threads launched before the failure are not stopped here.
+int LRMstart_thread(LRMcontext_t * context, int task ){
+    int th_id, retv=0;
+
+    for(th_id=0; th_id<context -> threads; th_id++){
+        retv = retv || LRMstart_thread_init_context(context,th_id,task);
+        if(retv) break;
+
+        void ** th_args=malloc(sizeof(void *)*3); // to be freed in the thread.
+        if(th_args == NULL){ retv = 1; break; }
+        th_args[0] = context;
+        th_args[1] = (void *)(intptr_t)th_id; // integers travel as pointer values; intptr_t comes with <unistd.h>, and NULL + n is not defined by ISO C
+        th_args[2] = (void *)(intptr_t)task;
+        LRMpthread_create(context -> running_threads+th_id, NULL, LRM_thread_runner, th_args);
+    }
+    return retv;
+}
+
+// Joins the worker threads of the current pass one after another, in thread-id order; always returns 0.
+int LRMwait_threads( LRMcontext_t * context ){
+    int th_id = 0;
+    while(th_id < context -> threads){ LRMpthread_join(context -> running_threads[th_id], NULL); th_id++; }
+    return 0;
+}
+
+// Releases one thread's working buffers at the end of a pass: the three dynamic-programming buffers after
+// either step, and additionally the SAM/BAM output buffer after the realign step. Any other task value frees nothing.
+void LRMmerge_threads_destroy_context(LRMcontext_t * context, LRMthread_context_t * thread_context, int task){
+    const int after_realign = (task == LRMRUNNING_STEP_REALIGN);
+    if(task != LRMRUNNING_STEP_VOTING && !after_realign) return;
+
+    free(thread_context -> dynamic_programming_movement_buffer);
+    free(thread_context -> dynamic_programming_score_buffer);
+    free(thread_context -> dynamic_programming_indel_movement_buf);
+
+    if(after_realign) free(thread_context -> out_SAMBAM_buffer);
+}
+
+// Folds the outcome of every worker thread back into the context after a pass, then frees that thread's buffers. Returns 0, or 1 when an event step fails.
+// NOTE(review): in the voting step LRMevents_reorder/LRMevents_build_entries run once per thread inside this loop, not once per pass — confirm that is intended.
+int LRMmerge_threads( LRMcontext_t * context , int step){
+    int retv = 0, th_id;
+
+    for(th_id=0; th_id<context -> threads; th_id++){
+        LRMthread_context_t * thread_context = context -> thread_contexts+th_id;
+        if(step == LRMRUNNING_STEP_REALIGN){
+            LRMwrite_chunk_check_buffer_write(context, thread_context, 1);
+            if(th_id == context -> threads-1)LRMbam_generate_tail_binary(context, thread_context); // only for the last thread
+            context -> mapped_reads += thread_context -> mapped_reads;
+        }else if(step == LRMRUNNING_STEP_VOTING){
+            retv = retv || LRMevents_reorder(context);
+            retv = retv || LRMevents_build_entries(context);
+        }else assert(0);
+        LRMmerge_threads_destroy_context(context, thread_context, step);
+    }
+    return retv;
+}
+
+// Resets the per-chunk read counter and moves the input back to the last saved position, so the chunk can be read again.
+int LRMrewind_input_pos(LRMcontext_t * context){ // always returns 0
+    const int is_gzipped = (context->input_file.file_type == LRMGENE_INPUT_GZIP_FASTQ);
+    context -> processed_reads_in_chunk = 0;
+    if(!is_gzipped) fseeko(context->input_file.input_fp, context->last_saved_raw_pos, SEEK_SET);
+    else seekgz_seek(context->input_file.input_fp, &context->last_saved_zlib_pos);
+    return 0;
+}
+
+// Resets the per-chunk read counter and remembers the current input position:
+// a seekgz position for gzipped FASTQ input, a plain file offset for every other input type.
+int LRMsave_input_pos( LRMcontext_t * context){ // always returns 0
+    const int is_gzipped = (context->input_file.file_type == LRMGENE_INPUT_GZIP_FASTQ);
+    context -> processed_reads_in_chunk = 0;
+    if(!is_gzipped) context -> last_saved_raw_pos = ftello(context->input_file.input_fp);
+    else seekgz_tell(context->input_file.input_fp, &context->last_saved_zlib_pos);
+    return 0;
+}
+
+// Cuts the read into overlapping segments and records, for each one, where its bases and qualities start and how long it is.
+// Consecutive segments start (LRMSEGMENT_MIN_LENGTH - segment_overlapping) bases apart; each is LRMSEGMENT_MIN_LENGTH long except the last, which takes all remaining bases.
+// Returns 0 on success; returns 1 with total_segments == 0 when the read has 16 bases or fewer, or when the step between segments is not positive.
+// NOTE(review): the number of segments is not checked against the capacity of the segment arrays (the original assertion is disabled) — confirm the read length is bounded upstream.
+int LRMsplit_read_to_segments(LRMcontext_t * context, LRMread_iteration_context_t* iteration_context){
+    int seg_curs = 0;
+    int increment_step = LRMSEGMENT_MIN_LENGTH - context -> segment_overlapping;
+
+    iteration_context->total_segments = 0;
+    if(iteration_context->read_length<=16) return 1;
+    if(increment_step < 1) return 1; // an overlap as large as the segment would never advance seg_curs: endless loop writing past the segment arrays
+
+    while(seg_curs < iteration_context->read_length){
+        int seg_no = iteration_context->total_segments;
+        int seg_end = seg_curs + increment_step;
+        if(seg_end + LRMSEGMENT_MIN_LENGTH > iteration_context->read_length) seg_end = iteration_context->read_length; // too little left for another full segment: this is the last one
+        iteration_context->segment_texts[seg_no] = iteration_context->read_text + seg_curs;
+        iteration_context->segment_quals[seg_no] = iteration_context->qual_text + seg_curs;
+        iteration_context->segment_lengths[seg_no] =(seg_end == iteration_context->read_length ? iteration_context->read_length - seg_curs : LRMSEGMENT_MIN_LENGTH);
+        iteration_context->total_segments ++;
+        seg_curs = seg_end;
+    }
+    return 0;
+}
+
+// Reverses the read text and its qualities in place, then re-maps the per-segment bookkeeping so that
+// segment i afterwards describes what used to be segment (total_segments - 1 - i):
+//   - the subread offset tables of each mirrored pair are exchanged,
+//   - the segment lengths of each mirrored pair are exchanged,
+//   - every segment start becomes read_length - (old start + old length) of its mirror segment.
+void LRMreverse_read_and_qual(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context){
+    LRMreverse_read(iteration_context -> read_text, iteration_context -> read_length);
+    LRMreverse_quality(iteration_context -> qual_text, iteration_context -> read_length);
+
+    const int read_len = iteration_context -> read_length;
+    int left = 0, right = iteration_context->total_segments - 1;
+
+    // Walk inwards from both ends; each round handles one mirrored pair of segments.
+    for(; left < right; left++, right--){
+        unsigned int tmp_offsets[LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT];
+        memcpy(tmp_offsets, iteration_context->subread_offsets[left], sizeof(int)* LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT);
+        memcpy(iteration_context->subread_offsets[left], iteration_context->subread_offsets[right], sizeof(int)* LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT);
+        memcpy(iteration_context->subread_offsets[right], tmp_offsets, sizeof(int)* LRMMAX_SUBREAD_PER_SEGMENT_HARDLIMIT);
+        int left_len = iteration_context->segment_lengths[left];
+        int right_len = iteration_context->segment_lengths[right];
+        int left_start = iteration_context -> segment_texts[left] - iteration_context->read_text;
+        int right_start = iteration_context -> segment_texts[right] - iteration_context->read_text;
+        int left_mirrored_start = read_len - (left_start + left_len);
+        int right_mirrored_start = read_len - (right_start + right_len);
+
+        iteration_context -> segment_texts[left] = iteration_context->read_text + right_mirrored_start;
+        iteration_context -> segment_quals[left] = iteration_context->qual_text + right_mirrored_start;
+        iteration_context -> segment_texts[right] = iteration_context->read_text + left_mirrored_start;
+        iteration_context -> segment_quals[right] = iteration_context->qual_text + left_mirrored_start;
+        iteration_context -> segment_lengths[left] = right_len;
+        iteration_context -> segment_lengths[right] = left_len;
+    }
+
+    // An odd segment count leaves a middle segment without a partner: it keeps its index but its start is mirrored too.
+    if(iteration_context->total_segments % 2){
+        int mid = iteration_context->total_segments / 2;
+        int mid_start = iteration_context -> segment_texts[mid] - iteration_context->read_text;
+        int mid_mirrored_start = read_len - (mid_start + iteration_context -> segment_lengths[mid]);
+        iteration_context -> segment_texts[mid] = iteration_context->read_text + mid_mirrored_start;
+        iteration_context -> segment_quals[mid] = iteration_context->qual_text + mid_mirrored_start;
+    }
+}
+
+// Inserts vote-table entry [iii][jjj] into the segment's candidate list at slot replace_index:
+// the candidates from that slot onwards move down by one place (the last one falls off), then the slot is filled from the vote table.
+void LRMdo_one_voting_read_process_setres(LRMcontext_t * context, LRMread_iteration_context_t * iteration_context, LRMsegment_mapping_result_t * seg_result, int replace_index, LRMgene_vote_t *vote_table, int iii, int jjj, int this_seg_id){
+    LRMsegment_mapping_candidate_t * slot = seg_result->candidates + replace_index;
+    int tail_items = LRMSEGMENT_MAX_CANDIDATES - 1 - replace_index; // how many candidates have to move down
+    if(tail_items > 0) memmove(slot + 1, slot, sizeof(LRMsegment_mapping_candidate_t) * tail_items);
+
+    slot -> first_base_position = vote_table -> pos[iii][jjj];
+    slot -> indel_length_inside = vote_table -> current_indel_cursor[iii][jjj];
+    slot -> confident_coverage_start = vote_table -> coverage_start[iii][jjj];
+    slot -> confident_coverage_end = vote_table -> coverage_end[iii][jjj];
+    slot -> votes = vote_table -> votes[iii][jjj];
+    slot -> masks = vote_table -> masks[iii][jjj];
+    memcpy(slot -> indel_records, vote_table ->indel_recorder[iii][jjj], sizeof(short)*3*LRMMAX_INDEL_SECTIONS);
+    if(0 && slot -> votes>=2){ // debug trace, switched off
+        char postxt[100];
+        LRMpos2txt(context, slot -> first_base_position , postxt);
+        LRMprintf("REPLACE CANDIDATE %d : to %s (%s), V=%d SEG=%d\n", replace_index, postxt, (slot -> masks & LRMIS_NEGATIVE_STRAND)?"NEG":"POS", slot -> votes, this_seg_id);
+    }
+}
+
+// Tells whether two linear index positions resolve to the same chromosome name pointer: returns 1 if so, 0 otherwise.
+int LRMdo_one_voting_read_process_samechro(LRMcontext_t * context, unsigned int p1, unsigned int p2){
+    char * first_chro, * second_chro;
+    int first_offset, second_offset; // filled in by the lookup, not needed here
+    LRMlocate_gene_position(context, p1, &first_chro, &first_offset);
+    LRMlocate_gene_position(context, p2, &second_chro, &second_offset);
+    return first_chro == second_chro; // pointer identity suffices: per the original author's note, both names point into sam_bam_chromosome_list
+}
+
+
+#define LRMseg_fetch_result(mapr, sid) ( (mapr) -> segment_results + ((iteration_context -> is_reversed == 0)?(sid):( iteration_context -> total_segments - (sid) - 1 ) ) ) // result slot of segment sid, mirrored when the read is reversed; reads the caller's local 'iteration_context'
+
+// Harvests the vote table of segment this_seg_id: every entry with at least min_voting_number votes that beats the
+// weakest stored candidate is inserted into the segment's candidate list, kept ordered by descending votes.
+// With junction detection on, each candidate on the current strand is then paired with its best-scoring secondary location.
+void LRMdo_one_voting_read_process_votetab(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int this_seg_id){
+ LRMread_mapping_result_t * mapping_result = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+ LRMsegment_mapping_result_t * seg_result = LRMseg_fetch_result( mapping_result, this_seg_id );
+ seg_result -> extracted_subreads = iteration_context -> extracted_subreads;
+ int iii, jjj, hhh;
+ for(iii=0; iii < LRMGENE_VOTE_TABLE_SIZE; iii++){
+ for(jjj = 0; jjj < iteration_context -> vote_table.items[iii]; jjj++){
+ if( iteration_context -> vote_table.votes [iii][jjj] < context -> min_voting_number || iteration_context -> vote_table.votes [iii][jjj] <= seg_result -> candidates[LRMSEGMENT_MAX_CANDIDATES - 1].votes) continue;
+ int replace_index = LRMSEGMENT_MAX_CANDIDATES - 1;
+ int kkk;
+ for(kkk = 0; kkk < LRMSEGMENT_MAX_CANDIDATES; kkk ++){ // first slot holding fewer votes than this entry
+ if(seg_result->candidates[kkk].votes < iteration_context -> vote_table.votes [iii][jjj]){
+ replace_index = kkk;
+ break;
+ }
+ }
+ LRMdo_one_voting_read_process_setres(context, iteration_context, seg_result, replace_index, &iteration_context -> vote_table, iii, jjj, this_seg_id);
+ }
+ }
+
+ if(context -> do_junction_detection)
+ for(hhh = 0; hhh < LRMSEGMENT_MAX_CANDIDATES; hhh++){
+
+ if(0){
+ char p1txt[100];
+ LRMpos2txt(context, seg_result->candidates[hhh].first_base_position, p1txt);
+ LRMprintf("process_votetab: [%d] votes=%d, pos=%s (%u)\n", hhh, seg_result->candidates[hhh].votes, p1txt, seg_result->candidates[hhh].first_base_position);
+ }
+
+ if(seg_result -> candidates[hhh].votes<1)break; // the list is ordered, so no used slot follows
+ if((seg_result -> candidates[hhh].masks & LRMIS_NEGATIVE_STRAND) != (iteration_context -> is_reversed ? LRMIS_NEGATIVE_STRAND : 0)) continue; // only candidates of the strand being processed
+ seg_result -> candidates[hhh].secondary_votes = 0;
+
+ unsigned int best_secondary_half_pos = 0;
+ int best_secondary_score = -1, best_secondary_split_point = -1, best_secondary_is_GT_AG = -1, best_secondary_votes = -1, best_left_offset_indels = 0;
+
+ for(iii=0; iii < LRMGENE_VOTE_TABLE_SIZE; iii++){
+ for(jjj = 0; jjj < iteration_context -> vote_table.items[iii]; jjj++){
+ if(iteration_context -> vote_table.votes [iii][jjj] > seg_result -> candidates[hhh].votes || iteration_context -> vote_table.votes [iii][jjj] <1) continue;
+
+ long long dist0 = seg_result->candidates[hhh].first_base_position;
+ dist0 -= iteration_context -> vote_table.pos[iii][jjj];
+ int is_junction_distance = llabs(dist0) > 3 && llabs(dist0) < context -> max_junction_distance; // llabs(): dist0 is a long long that plain abs() would first truncate to int
+ int is_same_chro = LRMdo_one_voting_read_process_samechro(context, seg_result->candidates[hhh].first_base_position, iteration_context -> vote_table.pos[iii][jjj]);
+ //LRMprintf("TEST JUNCTION COND: %u ~ %u : DIST=%d, SAME=%d\n", seg_result->candidates[hhh].first_base_position, iteration_context -> vote_table.pos[iii][jjj], is_junction_distance, is_same_chro );
+ if(is_junction_distance && is_same_chro){
+ int this_split_point = -1, this_is_GT_AG = -1, left_indel_offset = 0;
+ int indel_length_in_anchor = seg_result->candidates[hhh].indel_length_inside;
+ int indel_length_in_secondary = iteration_context -> vote_table.current_indel_cursor[iii][jjj];
+ int this_score = LRMdonor_score(context, thread_context, iteration_context, seg_result -> candidates + hhh, this_seg_id, iteration_context ->vote_table.pos[iii][jjj] , iteration_context ->vote_table.coverage_start[iii][jjj] , iteration_context -> vote_table.coverage_end[iii][jjj], indel_length_in_anchor, indel_length_in_secondary, & this_split_point, & this_is_GT_AG , &left_indel_offset);
+
+ if(0 && this_score > 0){
+ char pos1txt[100], pos2txt[100];
+ LRMpos2txt(context, seg_result -> candidates[hhh].first_base_position, pos1txt);
+ LRMpos2txt(context, iteration_context ->vote_table. pos[iii][jjj], pos2txt);
+ LRMprintf("TEST JUNCTION SCORE CAND %d: %s ~ %s = %d, <?< %d ; VOTES=%d + %d\n", hhh, pos1txt, pos2txt, this_score, best_secondary_score, seg_result -> candidates[hhh].votes, iteration_context -> vote_table.votes [iii][jjj]);
+ }
+
+ if(this_score > best_secondary_score ){
+ best_secondary_half_pos = iteration_context ->vote_table. pos[iii][jjj];
+ best_secondary_split_point = this_split_point;
+ best_secondary_is_GT_AG = this_is_GT_AG;
+ best_secondary_votes = iteration_context -> vote_table.votes [iii][jjj];
+ best_secondary_score = this_score;
+ best_left_offset_indels = left_indel_offset;
+ }
+ }
+ }
+ }
+
+ if(best_secondary_score > 0){ // record the pairing only when some secondary location scored positively
+ seg_result -> candidates[hhh].secondary_position = best_secondary_half_pos;
+ seg_result -> candidates[hhh].secondary_votes = best_secondary_votes;
+ seg_result -> candidates[hhh].junction_split_point = best_secondary_split_point;
+ seg_result -> candidates[hhh].junction_is_GT_AG = best_secondary_is_GT_AG;
+ seg_result -> candidates[hhh].junction_left_offset_indels = best_left_offset_indels;
+ }
+ }
+ //END: if context -> do_junction_detection
+}
+
+void LRMdo_one_voting_read_segment_extraction(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int this_seg_id){ // Chooses how many subreads to take from segment this_seg_id and stores their evenly spaced start offsets in subread_offsets[this_seg_id].
+ int seg_len = iteration_context -> segment_lengths[this_seg_id];
+ char * seg_qual = iteration_context -> segment_quals[this_seg_id]; // only read by the unreachable quality search further down
+
+ //LRMprintf("EXTR: seg %d\n", this_seg_id);
+
+ iteration_context -> extracted_subreads = min(context -> max_subreads_per_segment ,(seg_len - 15 - context -> current_index.index_gap)/context -> current_index.index_gap + 1); // capped by the configured maximum per segment
+ float subread_gap = (seg_len - 15 - context -> current_index.index_gap + 1)*1. / iteration_context -> extracted_subreads; // distance between consecutive subread starts, as a float
+ int subr_i, pos_i;
+
+
+ //LRMprintf("EXTSUB: %s [%d] LEN=%d\t" , iteration_context -> read_name, this_seg_id , seg_len);
+ for(subr_i = 0; subr_i < iteration_context -> extracted_subreads; subr_i++){
+ pos_i = subr_i * subread_gap; // the product is truncated to int
+ iteration_context -> subread_offsets[this_seg_id][subr_i]=pos_i;
+
+ //#warning "FOR COMPARISON ONLY ===================="
+ continue; // unconditional: everything below in this loop body is currently dead code
+
+ int highest_qual = -1;
+ int total_qual = 0;
+ int search_end = pos_i + subread_gap - 1, search_end_1 = pos_i + 15 + context -> current_index.index_gap;
+
+ for(; pos_i < search_end_1; pos_i++)
+ total_qual += seg_qual[ pos_i ];
+ highest_qual = total_qual;
+
+ for(; pos_i < search_end; pos_i++){ // (dead code) sliding window keeping the start with the highest quality sum
+ total_qual += seg_qual[ pos_i ];
+ total_qual -= seg_qual[ pos_i -15 - context -> current_index.index_gap];
+ if(total_qual > highest_qual) {
+ highest_qual = total_qual;
+ iteration_context -> subread_offsets[this_seg_id][subr_i]=pos_i-14 - context -> current_index.index_gap;
+ }
+ }
+ // LRMprintf("%d:%d\t", (int)(subr_i * subread_gap), iteration_context -> subread_offsets[this_seg_id][subr_i]);
+ }
+ //LRMprintf("\n");
+ //LRMprintf(" %d len = %d reads (seg %d)\n", iteration_context -> segment_lengths[this_seg_id], iteration_context -> extracted_subreads, this_seg_id);
+}
+
+void LRMdo_one_voting_read_segment(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int this_seg_id){ // Votes for one segment: clears the vote table, looks up every extracted subread in the index, then turns the votes into candidates.
+ LRMinit_gene_vote((& iteration_context-> vote_table));
+
+ char * seg_text = iteration_context -> segment_texts[this_seg_id];
+ int seg_len = iteration_context -> segment_lengths[this_seg_id];
+
+ LRMdo_one_voting_read_segment_extraction(context, thread_context,iteration_context , this_seg_id); // sets extracted_subreads and subread_offsets[this_seg_id]
+
+ int this_subread_no, this_gap_offset;
+ for(this_subread_no=0; this_subread_no< iteration_context ->extracted_subreads;this_subread_no++){
+ for(this_gap_offset=0; this_gap_offset<context -> current_index.index_gap; this_gap_offset++){ // one lookup per offset below the index gap
+ int this_subread_offset = this_gap_offset + iteration_context -> subread_offsets[this_seg_id][this_subread_no];
+
+ char * subread_string = seg_text + this_subread_offset;
+
+
+ LRMgehash_key_t subread_integer = LRMgenekey2int(subread_string); // the subread as a hash key; the conversion is defined elsewhere
+ if(0){
+ int cc = subread_string[16];
+ subread_string[16]=0;
+ LRMprintf("Extract Subread: %s\n", subread_string);
+ subread_string[16]=cc;
+ }
+
+ int mm_offset =0 ;//(iteration_context -> read_length < 2000)? 1:0;
+ //mm_offset =(iteration_context -> read_length < 500)?2:mm_offset;
+
+ LRMgehash_go_tolerance(context, thread_context, iteration_context,& context->current_index, subread_integer , this_subread_offset, seg_len, iteration_context -> is_reversed, & iteration_context-> vote_table, context -> max_read_indel_length, this_subread_no, ( context -> max_mismatched_bases_in_subread + mm_offset ));
+ }
+ }
+
+ if(0 && this_seg_id == 47){
+ int cch = seg_text[ seg_len ];
+ seg_text[ seg_len] = 0;
+ LRMprintf("\nREAD %s [seg %d] STAGE %s : %.*s\n", iteration_context -> read_name, this_seg_id, iteration_context -> is_reversed?"NEG":"POS", seg_len, seg_text);
+ LRMprint_v(context, iteration_context, 2);
+ seg_text[ seg_len ] = cch;
+ }
+ LRMdo_one_voting_read_process_votetab(context, thread_context, iteration_context, this_seg_id);
+
+ LRMread_mapping_result_t * mapping_result = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+ LRMsegment_mapping_result_t * seg_result = LRMseg_fetch_result( mapping_result, this_seg_id );
+
+ //if(iteration_context -> is_reversed)LRMprintf("MAPPED SEG VOTE[0]=%d\n", seg_result -> candidates[0].votes);
+ if(iteration_context -> is_reversed && seg_result -> candidates[0].votes> 1) iteration_context -> mapped_segments ++; // counted only while the read is reversed, and only when the top candidate has more than one vote
+}
+
+/* Returns the subread index multiplied by a fixed stride of 16.
+ * `len` and `total_subreads` are accepted but do not influence the result. */
+int LRMfind_subread_end(int len, int total_subreads, int subread){
+    const int stride = 16;
+    (void) len;
+    (void) total_subreads;
+    return stride * subread;
+}
+
+/*
+ * Turns the mapping candidates of one read segment into chromosome events.
+ *
+ * Step 1 (only when context->do_junction_detection is set): every candidate with
+ * secondary_votes > 0 yields one LRMEVENT_TYPE_JUNCTION event.
+ * Step 2: for every candidate, each recorded indel section (indel_records[i*3 .. i*3+2])
+ * is re-examined with LRMindel_dynamic_search between the two flanking subreads, and each
+ * non-'M' run in the resulting movement string yields one LRMEVENT_TYPE_INDEL event.
+ *
+ * The segment text may be strand-flipped in place while a section is examined; it is
+ * flipped back before the section is left, including when LRMchro_event_new reports a
+ * failure and the function returns early (the original code returned with the text
+ * still flipped in that case).
+ *
+ * The function returns as soon as LRMchro_event_new returns non-zero.
+ */
+void LRMdo_one_voting_insert_chro_events(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int seg_id){
+    int x2;
+
+    LRMread_mapping_result_t * mapping_result = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+    LRMsegment_mapping_result_t * seg_result = LRMseg_fetch_result( mapping_result, seg_id );
+
+    if(context -> do_junction_detection)
+        for(x2 = 0; x2 < LRMSEGMENT_MAX_CANDIDATES; x2++){
+            LRMsegment_mapping_candidate_t * cand_res = seg_result -> candidates + x2;
+            //LRMprintf("TRY INSERT JUNCTION: [%d] , VOTE=%d, 2ndVOTE=%d\n", x2, cand_res -> votes, cand_res -> secondary_votes);
+            if(cand_res -> secondary_votes > 0){
+                LRMevent_t new_event;
+                memset(&new_event, 0, sizeof(LRMevent_t));
+                new_event.event_type = LRMEVENT_TYPE_JUNCTION;
+                new_event.small_side = - 1 + min(cand_res->first_base_position, cand_res->secondary_position) + cand_res->junction_split_point + cand_res->junction_left_offset_indels;
+                new_event.large_side = max(cand_res->first_base_position, cand_res->secondary_position) + cand_res->junction_split_point;
+                new_event.masks = cand_res-> junction_is_GT_AG?LRM_EVENT_IS_GT_AT_DONOR:0;
+
+                //LRMprintf("INSERT JUNCTION EVENT: %u~%u\n", new_event.small_side, new_event.large_side);
+
+                int retv = LRMchro_event_new(context, thread_context, iteration_context, &new_event);
+                if(retv) return;
+            }
+        }
+    //END: if context -> do_junction_detection
+
+    // find and insert indels
+    for(x2 = 0; x2 < LRMSEGMENT_MAX_CANDIDATES; x2++){
+        LRMsegment_mapping_candidate_t * cand_res = seg_result -> candidates + x2;
+        int last_correct_subread = cand_res -> indel_records[1]-1, last_indel = 0;
+        int indel_i = 0;
+
+        if(0){
+            char ptxt[100];
+            LRMpos2txt(context, cand_res -> first_base_position, ptxt);
+            LRMprintf("CANDIDATE %s [SEG-%d][%d] START AT %s\n", iteration_context -> read_name, seg_id, x2, ptxt);
+        }
+
+        for(indel_i=1; indel_i<LRMMAX_INDEL_SECTIONS; indel_i++){
+            //LRMprintf("CANDIDATE INDEL[%d] = %d %d %d\n", indel_i, cand_res -> indel_records[indel_i*3], cand_res -> indel_records[indel_i*3+1], cand_res -> indel_records[indel_i*3+2] );
+            if( cand_res -> indel_records[indel_i*3]<1)break;
+
+            int next_correct_subread = cand_res -> indel_records[indel_i*3] - 1;
+            int last_correct_base = iteration_context -> subread_offsets[seg_id][last_correct_subread] - 10;
+            int first_correct_base = iteration_context -> subread_offsets[seg_id][next_correct_subread]+ 13;
+
+            int expected_indels_in_region=cand_res->indel_records[indel_i*3+2] - last_indel;
+            // Clamp both flanks into [0, read_length-1] and keep first_correct_base >= last_correct_base.
+            last_correct_base = max(0, last_correct_base);
+            last_correct_base = min(iteration_context -> read_length-1, last_correct_base);
+            first_correct_base = min(first_correct_base, iteration_context -> read_length-1);
+            first_correct_base = max(0, first_correct_base);
+            first_correct_base = max(first_correct_base, last_correct_base);
+            last_correct_subread = cand_res->indel_records[indel_i*3+1]-1;
+
+            //LRMprintf("CANDIDATE EXPINDEL=%d , GAP_BASES = %d, %d\n", expected_indels_in_region, first_correct_base, last_correct_base);
+
+            if(abs(expected_indels_in_region) <= context -> max_read_indel_length && first_correct_base - last_correct_base > 1){
+                int currently_reversed = 1;
+                char * corrected_read= iteration_context -> segment_texts[seg_id];
+
+                if(( (cand_res -> masks & LRMIS_NEGATIVE_STRAND ) == 0 && currently_reversed) ||
+                   (( cand_res -> masks & LRMIS_NEGATIVE_STRAND ) != 0 && !currently_reversed)) {
+                    LRMreverse_read( corrected_read , iteration_context -> segment_lengths[seg_id] );
+                    currently_reversed = !currently_reversed;
+                }
+                unsigned int chro_cursor = cand_res -> first_base_position + last_correct_base + last_indel, total_mismatched;
+                int move_i, indel_movements = LRMindel_dynamic_search(context, thread_context, - expected_indels_in_region /* inversed definition */ , chro_cursor, corrected_read, last_correct_base, first_correct_base, &total_mismatched, 0, iteration_context -> read_name);
+                //LRMprintf("%s from %d MOVES=%s\n", (cand_res -> masks & LRMIS_NEGATIVE_STRAND )?"REV":"STD", last_correct_base , indel_movement_buff);
+
+                // The allowed mismatch count grows with the width of the examined gap.
+                if(total_mismatched <= 1 || (total_mismatched <= 2 && first_correct_base - last_correct_base > 30) || (total_mismatched <= 10 && first_correct_base - last_correct_base > 100)){
+                    int current_chr=-1, current_len = 0;
+                    for(move_i = 0; move_i < 1+ indel_movements; move_i++){
+                        int nch = thread_context -> dynamic_programming_movement_buffer[move_i];
+                        nch = (nch=='X')?'M':nch;
+                        if(current_chr!=nch){
+                            // A run has just ended; emit an event unless it was a match run.
+                            if(current_chr>0 && current_chr != 'M'){
+                                LRMevent_t new_event;
+                                memset(&new_event, 0, sizeof(LRMevent_t));
+                                new_event.indel_length = current_chr == 'D' ? current_len : - current_len;
+                                new_event.event_type = LRMEVENT_TYPE_INDEL;
+                                new_event.large_side = chro_cursor;
+                                new_event.small_side = current_chr == 'D' ? chro_cursor - current_len - 1 : (chro_cursor - 1);
+                                new_event.masks = cand_res-> junction_is_GT_AG?LRM_EVENT_IS_GT_AT_DONOR:0;
+
+                                if(0){
+                                    char p1txt[100], p2txt[100], p0txt[100];
+                                    LRMpos2txt(context, new_event.small_side , p1txt);
+                                    LRMpos2txt(context, new_event.large_side , p2txt);
+                                    LRMpos2txt(context, cand_res -> first_base_position , p0txt);
+                                    if(1|| ( new_event.small_side >= 197828782 - 3 && new_event.small_side <= 197828782 + 5)){
+                                        LRMprintf("\nINSERT INDEL EVENT FROM %s: %s~%s ; LEN=%d\n", iteration_context -> read_name, p1txt, p2txt, new_event.indel_length);
+                                        LRMprintf("INSERT INDEL AT %s + %d + %d\n" , p0txt, last_correct_base, last_indel);
+                                        LRMprintf("%s MOVES=%s\n\n", (cand_res -> masks & LRMIS_NEGATIVE_STRAND )?"REV":"STD" , thread_context -> dynamic_programming_movement_buffer);
+                                    }
+                                }
+
+                                int retv = LRMchro_event_new(context, thread_context, iteration_context, &new_event);
+                                if(retv){
+                                    // Undo the temporary strand flip before bailing out, exactly as the
+                                    // normal path does at the end of this section.
+                                    if(currently_reversed == 0) LRMreverse_read( corrected_read , iteration_context -> segment_lengths[seg_id] );
+                                    return;
+                                }
+                            }
+                            current_chr = nch;
+                            current_len = 0;
+                        }
+                        current_len++;
+                        if(nch !='I') chro_cursor ++;
+                    }
+                }
+                if(currently_reversed == 0) LRMreverse_read( corrected_read , iteration_context -> segment_lengths[seg_id] );
+            }
+            last_indel = cand_res->indel_records[indel_i*3+2];
+        }
+    }
+}
+
+void LRMdo_one_voting_read(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context){ /* Votes every segment of one read twice: once with
+   is_reversed == 0 and, after LRMreverse_read_and_qual has been applied, once with is_reversed == 1.
+   The per-segment results are stored in a newly allocated array attached to this read's
+   mapping result. Does nothing when the read has no segments. */
+ LRMread_mapping_result_t * mapping_result = context -> read_mapping_results + iteration_context -> read_no_in_chunk;
+ if(iteration_context -> total_segments < 1) return;
+
+ mapping_result -> segment_results = malloc(sizeof(LRMsegment_mapping_result_t) * iteration_context->total_segments); // NOTE(review): the malloc result is not checked before the memset below; no matching free is visible in this function
+
+ for( iteration_context -> added_mismatch_allowed_in_subread = 0; iteration_context -> added_mismatch_allowed_in_subread < 1; iteration_context -> added_mismatch_allowed_in_subread ++){ // the bound is 1, so the body runs only for the value 0
+ iteration_context -> mapped_segments = 0;
+ memset(mapping_result -> segment_results, 0, sizeof(LRMsegment_mapping_result_t) * iteration_context->total_segments);
+
+ for(iteration_context->is_reversed = 0; iteration_context->is_reversed< 2; iteration_context->is_reversed++){
+ int seg_id;
+ for(seg_id=0; seg_id<iteration_context -> total_segments; seg_id++){
+ LRMdo_one_voting_read_segment(context, thread_context, iteration_context, seg_id);
+
+ if(0)if(iteration_context->is_reversed) LRMdo_one_voting_insert_chro_events(context, thread_context, iteration_context, seg_id); // disabled: event insertion is never called from here
+ }
+
+ if(0 == iteration_context->is_reversed) LRMreverse_read_and_qual(context, thread_context, iteration_context); // flip the read between the first and the second pass
+ }
+
+ // LRMprintf("Trying MM Offset = %d ; mapped segs = %d >= %d\n",iteration_context -> added_mismatch_allowed_in_subread , iteration_context -> mapped_segments , max(3, iteration_context -> total_segments / 10));
+ if(iteration_context -> mapped_segments >= 1+0* max(3, iteration_context -> total_segments / 10))break; // the threshold expression evaluates to 1: stop as soon as one segment counted as mapped
+ else LRMreverse_read_and_qual(context, thread_context, iteration_context); // second flip; presumably restores the orientation the read had on entry -- TODO confirm
+ }
+}
+
+
+typedef struct{ // one merged chain of segment candidates for a whole read; filled in by LRMread_final_result_merge
+ unsigned int read_head_pos; // segment_best_pos of a chain member minus that segment's offset inside read_text
+ unsigned int votes; // NOTE(review): no write to this field is visible in this part of the file
+ short masks; // copied from segment_best_masks of the same chain member
+ int segment_number; // number of entries used in the two arrays below
+ int segment_id[ LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST +1 ]; // segment id of each chain member
+ int realign_cand_id[ LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST +1 ]; // candidate id chosen inside the segment at the same index
+} LRMread_final_candidate_t;
+
+void LRMfix_cigar(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, char * cigar){ /* Rewrites `cigar` in place in two passes.
+   Pass 1 re-emits every <length><op> pair whose length is non-zero.
+   Pass 2 accumulates runs of the same op, converts an adjacent D/I (or I/D) pair into M plus
+   whatever remains of the longer one, and limits the length of S, M and I ops to what is left
+   of iteration_context->read_length. `context` and `thread_context` are not used. */
+ int ci, nch;
+ unsigned int tmpi = 0;
+ unsigned int lastlen = 0;
+ int lastopt = 0, outi = 0;
+ int read_cursor = 0;
+ for(ci=0; 0!=(nch=cigar[ci]); ci++){ // pass 1: reads at ci, writes at outi (outi never runs ahead of ci+1)
+ if(nch <= '9' && nch >= '0'){
+ tmpi = 10*tmpi +(nch-'0');
+ }else if(nch >= 'A' && nch <= 'Z'){
+ int nnch = cigar[ci+1]; // saved because the sprintf below may write into cigar[ci+1] (presumably its terminating NUL)
+ if(tmpi>0) outi+=sprintf(cigar+outi, "%u%c", tmpi, nch);
+ cigar[ci+1] = nnch;
+ tmpi = 0;
+ }
+ }
+
+ outi = 0;
+ tmpi = 0;
+ for(ci=0; 0!=(nch=cigar[ci]); ci++){ // pass 2: (lastopt, lastlen) is the pending run, written out when a different op arrives
+ if(nch <= '9' && nch >= '0'){
+ tmpi = 10*tmpi +(nch-'0');
+ }else{
+ if(nch >= 'A' && nch <= 'Z'){
+ if(nch != lastopt){
+ if(lastlen>0){
+
+ if(1)if((lastopt == 'D' && nch == 'I')||(lastopt == 'I' && nch == 'D')){ // adjacent deletion + insertion
+ if(lastlen > tmpi){ // pending run is longer: keep its surplus, count the new op's bases as M
+ lastlen -= tmpi;
+ nch = 'M';
+ }else if(lastlen < tmpi){ // new op is longer: pending run becomes M, the surplus stays with the new op
+ lastopt = 'M';
+ tmpi -= lastlen;
+ }else{ // equal lengths: the pending run is dropped and the new op's bases are counted as M
+ nch = 'M';
+ lastlen = 0;
+ }
+ }
+
+ if(lastopt == 'S' || lastopt == 'M' || lastopt == 'I'){ // ops that advance read_cursor
+ lastlen = min(lastlen, iteration_context -> read_length - read_cursor);
+ read_cursor += lastlen;
+ }
+
+ if(lastlen>0)outi+=sprintf(cigar+outi, "%u%c", lastlen, lastopt);
+ }
+
+ lastopt = nch;
+ lastlen = 0;
+ }
+ lastlen += tmpi;
+ }
+ tmpi = 0;
+ }
+ }
+
+ if(lastlen>0){ // write out the run that was still pending when the string ended
+ //LRMprintf("LASTLEN=MIN %u %d - %u\n", lastlen, iteration_context -> read_length , read_cursor);
+ if(lastopt == 'S' || lastopt == 'M' || lastopt == 'I')
+ lastlen = min(lastlen, iteration_context -> read_length - read_cursor);
+
+ if(lastlen>0)outi+=sprintf(cigar+outi, "%u%c", lastlen, lastopt);
+ }
+}
+
+/*
+ * Walks `cigar` from (start_read_cursor, start_chro_pos) and returns the chromosome
+ * position that corresponds to read offset `written_read_cursor`.
+ *
+ * M and S advance both the read and the chromosome; N and D advance only the chromosome;
+ * I advances only the read. When the target offset is reached inside an I operation the
+ * chromosome position after that operation is returned unchanged; otherwise the overshoot
+ * on the read is subtracted from it.
+ *
+ * Returns start_chro_pos when written_read_cursor <= start_read_cursor, and -1 when the
+ * CIGAR ends before the target offset is reached. `context`, `thread_context` and
+ * `iteration_context` are not used.
+ */
+long long LRMcalculate_written_chro_pos(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int written_read_cursor, int start_read_cursor, char * cigar, unsigned int start_chro_pos){
+    if(start_read_cursor >= written_read_cursor) return start_chro_pos;
+
+    int op_len = 0;
+    char * scan;
+    for(scan = cigar; *scan; scan++){
+        int op = *scan;
+        if(op >= '0' && op <= '9'){
+            op_len = op_len*10 + (op - '0');
+            continue;
+        }
+
+        switch(op){
+            case 'M':
+            case 'S':
+                start_chro_pos += op_len;
+                start_read_cursor += op_len;
+                break;
+            case 'N':
+            case 'D':
+                start_chro_pos += op_len;
+                break;
+            case 'I':
+                start_read_cursor += op_len;
+                break;
+            default:
+                break;
+        }
+
+        if(start_read_cursor >= written_read_cursor){
+            if(op == 'I') return start_chro_pos;
+            return start_chro_pos - (start_read_cursor - written_read_cursor);
+        }
+        op_len = 0;
+    }
+
+    //if( start_read_cursor == written_read_cursor ) return start_chro_pos;
+    return -1;
+}
+
+/*
+ * Run-length encodes the first `moves` entries of
+ * thread_context->dynamic_programming_indel_movement_buf and appends the runs to
+ * iteration_context->merged_cigar[align_i] at offset *target_cigar_ptr.
+ *
+ * '.' entries are skipped; 'X' is counted in *mismatched_bases and then treated as 'M';
+ * 'M' is counted in *matched_bases. At the end of each run the chromosome cursor is advanced
+ * for M/D/S, the read cursor for M/I/S, and *softclipping_bases for S. A run is written only
+ * while both the operation budget and the buffer margin allow it, but *cigar_opts is
+ * incremented for every run. `matched_bases`, `mismatched_bases` and `softclipping_bases`
+ * may be NULL.
+ *
+ * Note that entry [move_i + 1] is read for every processed entry, i.e. one element past
+ * `moves` on the last iteration.
+ *
+ * Always returns 0.
+ */
+int LRMmoves_to_cigar(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, long long * cigar_chro_cursor, long long * cigar_read_cursor, int * target_cigar_ptr, long long *last_written_cigar_chro, int * correct_mapping, int moves, int * cigar_opts, int * last_written_cigar_read, int * softclipping_bases, int * matched_bases, int * mismatched_bases, int align_i){
+    char * cigar_out = iteration_context -> merged_cigar[align_i];
+    int run_length = 0, pos;
+    //LRMprintf("MOVES=%d\n", moves);
+
+    for(pos = 0; pos < moves; pos++){
+        char this_op = thread_context -> dynamic_programming_indel_movement_buf[pos];
+        if(this_op == '.') continue;
+
+        if(this_op == 'M'){
+            if(matched_bases) (*matched_bases)++;
+        }else if(this_op == 'X'){
+            if(mismatched_bases) (*mismatched_bases)++;
+            this_op = 'M';
+        }
+
+        char following_op = thread_context -> dynamic_programming_indel_movement_buf[pos + 1];
+        if(following_op == 'X') following_op = 'M';
+
+        run_length ++;
+        if(following_op == this_op) continue;    // still inside the same run
+
+        const int is_match = (this_op == 'M');
+        const int is_clip = (this_op == 'S');
+        if(is_match || is_clip || this_op == 'D') (*cigar_chro_cursor) += run_length;
+        if(is_match || is_clip || this_op == 'I') (*cigar_read_cursor) += run_length;
+        if(is_clip && softclipping_bases) (*softclipping_bases) += run_length;
+
+        if((*cigar_opts) < context -> max_cigar_opts_in_read - 5 && (*target_cigar_ptr) < LRMMERGE_CIGAR_SIZE - 70){
+            int written = snprintf( cigar_out + (*target_cigar_ptr), LRMMERGE_CIGAR_SIZE - 40 - (*target_cigar_ptr), "%d%c", run_length, this_op);
+            (*target_cigar_ptr) += written;
+            *last_written_cigar_read = (*cigar_read_cursor);
+            *last_written_cigar_chro = max((*last_written_cigar_chro), (*cigar_chro_cursor));
+
+            if(is_match) (*correct_mapping)=1;
+        }
+        (*cigar_opts) ++;
+        run_length = 0;
+    }
+    return 0;
+}
+
+#define LRM_USE_CIGAR_RESCURE 1 // NOTE(review): not referenced in this part of the file; "RESCURE" is presumably a misspelling of "RESCUE" -- confirm all users before renaming
+int cluster_debug = 0; // file-scope debug switch; NOTE(review): no reader is visible in this part of the file
+
+/* Three-way comparison callback for merge_sort.
+ * `a` is a table of array pointers; the sort keys are the unsigned ints in the array
+ * stored at slot 2. Returns 1, -1 or 0 when key[l] is greater than, less than or equal
+ * to key[r]. */
+int sort_segvote_compare(void * a, int l, int r){
+    unsigned int * keys = ((void **) a)[2];
+    unsigned int left_key = keys[l], right_key = keys[r];
+    return (left_key > right_key) - (left_key < right_key);
+}
+
+/* Swap callback for merge_sort.
+ * `a` is a table of four array pointers that are kept in step with each other; elements
+ * l and r are swapped in every array whose slot is not NULL. */
+void sort_segvote_exchange(void * a, int l, int r){
+    void ** columns = a;
+    int col = 0;
+    while(col < 4){
+        unsigned int * values = columns[col++];
+        if(values){
+            unsigned int held = values[r];
+            values[r] = values[l];
+            values[l] = held;
+        }
+    }
+}
+
+/* Merge callback for merge_sort.
+ * `arr` is a table of four parallel unsigned-int arrays (slot 3 may be NULL). The two
+ * adjacent runs [start, start+items) and [start+items, start+items+items2) are merged
+ * into scratch buffers in the order given by sort_segvote_compare and copied back over
+ * the same range. An element of the second run is taken only when the first run is
+ * exhausted or when the comparison is strictly greater than zero. */
+void sort_segvote_merge(void * arr, int start, int items, int items2){
+    void ** columns = (void **) arr;
+    int total = items + items2;
+    int left_end = start + items, right_end = start + items + items2;
+
+    unsigned int * tmp0 = malloc(sizeof(int) * total);
+    unsigned int * tmp1 = malloc(sizeof(int) * total);
+    unsigned int * tmp2 = malloc(sizeof(int) * total);
+    unsigned int * tmp3 = malloc(sizeof(int) * total);
+    unsigned int * col0 = columns[0], * col1 = columns[1], * col2 = columns[2], * col3 = columns[3];
+
+    int left = start, right = left_end, out = 0;
+    while(out < total){
+        int take_right;
+        if(left >= left_end) take_right = 1;
+        else if(right >= right_end) take_right = 0;
+        else take_right = sort_segvote_compare(arr, left, right) > 0;
+
+        int src = take_right ? right++ : left++;
+        tmp0[out] = col0[src];
+        tmp1[out] = col1[src];
+        tmp2[out] = col2[src];
+        if(col3) tmp3[out] = col3[src];
+        out++;
+    }
+
+    memcpy(col0 + start, tmp0, sizeof(int) * total);
+    memcpy(col1 + start, tmp1, sizeof(int) * total);
+    memcpy(col2 + start, tmp2, sizeof(int) * total);
+    if(col3) memcpy(col3 + start, tmp3, sizeof(int) * total);
+
+    free(tmp0);
+    free(tmp1);
+    free(tmp2);
+    free(tmp3);
+}
+
+/*
+ * Returns the length of the leading soft clip of CIGAR string `c`.
+ *
+ * The leading decimal number is parsed; if the first non-digit character is 'S' that
+ * number is returned, and for any other operation the result is 0. Returns -1 when the
+ * string ends before any operation character is seen (empty or digits only).
+ * NOTE(review): the callers visible in this file add the result to a position without
+ * checking for -1.
+ *
+ * Digits are recognised with an explicit range test, as LRMfix_cigar does, rather than
+ * isdigit(): isdigit() is undefined for the negative values a plain `char` can hold.
+ */
+int LRMfind_leftS_size(char * c){
+    int clip_len = 0, pos = 0, nch;
+    while(0 != (nch = c[pos++])){
+        if(nch >= '0' && nch <= '9'){
+            clip_len = clip_len*10 + (nch - '0');
+        }else{
+            return nch == 'S' ? clip_len : 0;
+        }
+    }
+    return -1;
+}
+
+#define LRM_MERGE_SEEDS 300 // capacity of the topseg_* and candarray_* arrays in LRMread_final_result_merge, i.e. the maximum number of seed candidates taken from the vote-sorted list
+int reconcile_debug = 0; // when non-zero, LRMread_final_result_merge prints trace output through LRMprintf and runs an extra ordering assert
+int LRMread_final_result_merge(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMread_mapping_result_t * read_res){
+ int x1, x2, x3, correct_mapping = 0;
+ long long this_chromosome_end = 0;
+ char * this_chromosome_name = NULL;
+ read_res -> best_candidate = NULL;
+
+ unsigned int * segment_id_list, * segment_cand_id_list, * segment_vote_list;
+ unsigned int * segment_linear_list;
+ int segment_id_list_size = 100;
+ int segment_id_list_no = 0;
+
+ segment_id_list = malloc(sizeof(int) * segment_id_list_size);
+ segment_cand_id_list = malloc(sizeof(int) * segment_id_list_size);
+ segment_vote_list = malloc(sizeof(int) * segment_id_list_size);
+ segment_linear_list = malloc(sizeof(int) * segment_id_list_size);
+
+ for(x1=0;x1<iteration_context->total_segments;x1++){
+ for(x2 = 0; x2 < LRMMERGING_MAX_CANDIDATES; x2++){
+ unsigned int seg_cand_votes = iteration_context -> segment_best_votes[x1][x2];
+ if(seg_cand_votes < 1) break;
+ if(segment_id_list_no >= segment_id_list_size - 1){
+ segment_id_list_size = segment_id_list_size * 14/10;
+ segment_id_list = realloc(segment_id_list,sizeof(int) * segment_id_list_size);
+ segment_cand_id_list = realloc(segment_cand_id_list,sizeof(int) * segment_id_list_size);
+ segment_vote_list = realloc(segment_vote_list,sizeof(int) * segment_id_list_size);
+ segment_linear_list = realloc(segment_linear_list,sizeof(int) * segment_id_list_size);
+ }
+
+ segment_id_list[segment_id_list_no] = x1;
+ segment_cand_id_list[segment_id_list_no] = x2;
+ segment_vote_list[segment_id_list_no] = seg_cand_votes;
+ segment_linear_list[segment_id_list_no] = iteration_context -> segment_best_pos[x1][x2] + LRMfind_leftS_size(iteration_context -> segment_cigars[x1][x2]);
+ segment_id_list_no ++;
+
+ if(reconcile_debug){
+ char ptxt[100];
+ int read_pos = iteration_context -> segment_texts[x1] - iteration_context -> read_text + LRMfind_leftS_size(iteration_context -> segment_cigars[x1][x2]);
+ if(iteration_context -> segment_best_masks[x1][x2] & LRMIS_NEGATIVE_STRAND) read_pos += iteration_context -> segment_lengths[x1];
+ LRMpos2txt(context, segment_linear_list[segment_id_list_no-1], ptxt);
+ LRMprintf("INIT_LOC of SEG %d, CAND %d: strand: %s read_pos: %ld + %d, chro_pos: %s\tV=%d\n",x1,x2,(iteration_context -> segment_best_masks[x1][x2] & LRMIS_NEGATIVE_STRAND)?"NEG":"POS" , iteration_context -> segment_texts[x1] - iteration_context -> read_text, LRMfind_leftS_size(iteration_context -> segment_cigars[x1][x2]) ,ptxt, seg_cand_votes);
+ }
+ }
+ }
+
+ void * vsort[4];
+ vsort[0] = segment_id_list;
+ vsort[1] = segment_cand_id_list;
+ vsort[2] = segment_vote_list;
+ vsort[3] = segment_linear_list;
+ merge_sort(vsort, segment_id_list_no, sort_segvote_compare, sort_segvote_exchange, sort_segvote_merge);
+ free(segment_vote_list);
+ segment_vote_list = NULL;
+
+ unsigned int topseg_id_list[LRM_MERGE_SEEDS];
+ unsigned int topseg_cand_id_list[LRM_MERGE_SEEDS];
+ unsigned int topseg_pos_list[LRM_MERGE_SEEDS];
+ int topseg_masks[LRM_MERGE_SEEDS];
+ int topseg_no = 0;
+
+ for(; topseg_no < LRM_MERGE_SEEDS; topseg_no++){
+ if ( topseg_no > segment_id_list_no - 1)break;
+ topseg_id_list[ topseg_no ] = segment_id_list[segment_id_list_no - 1 - topseg_no];
+ topseg_cand_id_list[ topseg_no ] = segment_cand_id_list[segment_id_list_no - 1 - topseg_no];
+ topseg_pos_list[topseg_no] = segment_linear_list[segment_id_list_no - 1 - topseg_no];
+ }
+
+ vsort[2] = segment_linear_list;
+ vsort[3] = NULL;
+ merge_sort(vsort, segment_id_list_no, sort_segvote_compare, sort_segvote_exchange, sort_segvote_merge);
+
+ if(reconcile_debug) for(x2 = 0; x2 < segment_id_list_no; x2++){
+ char ptxt[100];
+ LRMpos2txt(context, segment_linear_list[x2], ptxt);
+ LRMprintf("ALL_LOC: %s\tV=%d\n", ptxt, iteration_context -> segment_best_votes[segment_id_list[x2]][ segment_cand_id_list[x2]]);
+ }
+
+ int best_align_i;
+ int trying_aligns = max(2, context -> max_best_alignments);
+
+ HashTable * candarray_used_segment_id = HashTableCreate(100);
+ HashTable * candarray_used_cand_id = HashTableCreate(100);
+
+ for(best_align_i = 0; best_align_i<trying_aligns; best_align_i++){
+
+ ArrayList * candarray_going_large_segid[LRM_MERGE_SEEDS];
+ ArrayList * candarray_going_large_candid[LRM_MERGE_SEEDS];
+ ArrayList * candarray_going_small_segid[LRM_MERGE_SEEDS];
+ ArrayList * candarray_going_small_candid[LRM_MERGE_SEEDS];
+ int candarray_votes[LRM_MERGE_SEEDS];
+ int best_best_votes[trying_aligns] , best_best_chains[trying_aligns];
+ memset(best_best_votes,0,sizeof(int)*trying_aligns);
+
+ for(x1 = 0; x1 < topseg_no; x1++){
+ unsigned int left_max_pos = topseg_pos_list[x1] - max(500000, 2*iteration_context -> read_length);
+ if(topseg_pos_list[x1] < max(500000, 2*iteration_context -> read_length)) left_max_pos = 0;
+ unsigned int right_max_pos = topseg_pos_list[x1] + max(500000, 2*iteration_context -> read_length);
+
+ int left_index = binary_search_less_equal(segment_linear_list, segment_id_list_no, left_max_pos)-1;
+ left_index = max(0,left_index);
+ int right_index = binary_search_less_equal(segment_linear_list, segment_id_list_no, right_max_pos)+1;
+ right_index = min(segment_id_list_no, right_index);
+
+ if(left_index>0) assert(segment_linear_list[left_index] < topseg_pos_list[x1] );
+ if(right_index < segment_id_list_no){
+ if(segment_linear_list[right_index] < topseg_pos_list[x1]) LRMprintf("ERROR_RIGHT: rmax=%d, total=%d, %u < %u\n", right_index, segment_id_list_no, segment_linear_list[right_index] , topseg_pos_list[x1]);
+ assert( segment_linear_list[right_index] > topseg_pos_list[x1] );
+ }
+
+ int this_seed_cand_index = -1, going_large_cand_index;
+ for(x2 = left_index; x2 < right_index; x2++){
+ if(reconcile_debug){
+ char p1txt[100], p2txt[100];
+ LRMpos2txt(context, topseg_pos_list[x1],p1txt);
+ LRMpos2txt(context, segment_linear_list[x2],p2txt);
+ LRMprintf("SEED[%d] :%s\tTEST:%s\n", x1, p1txt, p2txt);
+ }
+ if(reconcile_debug) if(x2+1 < segment_id_list_no){
+ if(segment_linear_list[x2]>segment_linear_list[x2+1])LRMprintf("ERROR ORDER: x2=%d, %u > %u\n",x2, segment_linear_list[x2], segment_linear_list[x2+1]);
+ assert( segment_linear_list[x2] <= segment_linear_list[x2+1]);
+ }
+ if(segment_id_list[x2] == topseg_id_list[x1] && segment_cand_id_list[x2] == topseg_cand_id_list[x1])
+ this_seed_cand_index = x2;
+ }
+
+ assert(this_seed_cand_index >=0);
+
+ candarray_going_large_segid[x1] = ArrayListCreate(10);
+ candarray_going_large_candid[x1] = ArrayListCreate(10);
+ candarray_going_small_segid[x1] = ArrayListCreate(10);
+ candarray_going_small_candid[x1] = ArrayListCreate(10);
+
+ if(0){
+ char p1txt[100];
+ LRMpos2txt(context, topseg_pos_list[x1],p1txt);
+ if(HashTableGet(candarray_used_segment_id, NULL + 1 + segment_id_list[this_seed_cand_index]) && HashTableGet(candarray_used_cand_id, NULL + 1 + segment_cand_id_list[this_seed_cand_index]))
+ LRMprintf("IGNORE #%d : %s\n", x1, p1txt);
+ else
+ LRMprintf("ACCEPT #%d : %s\n", x1, p1txt);
+ }
+
+ if(HashTableGet(candarray_used_segment_id, NULL + 1 + segment_id_list[this_seed_cand_index]) && HashTableGet(candarray_used_cand_id, NULL + 1 + segment_cand_id_list[this_seed_cand_index]))
+ continue;
+
+ candarray_votes[x1]=iteration_context -> segment_best_votes[segment_id_list[this_seed_cand_index]][ segment_cand_id_list[this_seed_cand_index]];
+
+ for(going_large_cand_index=0; going_large_cand_index<2; going_large_cand_index++){
+ int current_cand_index = this_seed_cand_index;
+ int x2_delta = going_large_cand_index?1:-1;
+ if(this_seed_cand_index == left_index && (!going_large_cand_index))continue;
+ if(this_seed_cand_index == right_index && going_large_cand_index)continue;
+
+ for(x2 = this_seed_cand_index + x2_delta; x2!= left_index -1 && x2 != right_index ; x2 += x2_delta){
+ int test_cand_seg_id = segment_id_list[x2];
+ int test_cand_cand_id = segment_cand_id_list[x2];
+ char * test_cand_cigar = iteration_context -> segment_cigars[test_cand_seg_id][test_cand_cand_id];
+ int test_cand_read_pos = iteration_context -> segment_texts[test_cand_seg_id] - iteration_context -> read_text;
+ // LRMprintf("ROUND %d , GV: %s\n" , best_align_i, test_cand_cigar);
+ test_cand_read_pos += LRMfind_leftS_size(test_cand_cigar);
+ unsigned int test_cand_linear_location = segment_linear_list[x2];
+ int test_cand_masks = iteration_context -> segment_best_masks[test_cand_seg_id][test_cand_cand_id];
+
+ int current_cand_seg_id = segment_id_list[current_cand_index];
+ int current_cand_cand_id = segment_cand_id_list[current_cand_index];
+ int current_cand_read_pos = iteration_context -> segment_texts[current_cand_seg_id] - iteration_context -> read_text + LRMfind_leftS_size(iteration_context -> segment_cigars[current_cand_seg_id][current_cand_cand_id]);
+ unsigned int current_cand_read_chro = segment_linear_list[current_cand_index];
+ int current_cand_masks = iteration_context -> segment_best_masks[current_cand_seg_id][current_cand_cand_id];
+ if(current_cand_index == this_seed_cand_index) topseg_masks[x1] = current_cand_masks;
+
+ if( (test_cand_masks & LRMIS_NEGATIVE_STRAND) != (current_cand_masks & LRMIS_NEGATIVE_STRAND) ) continue;
+ if(current_cand_seg_id == test_cand_seg_id) continue;
+
+ ArrayList * candarray_going_segid = going_large_cand_index?candarray_going_large_segid[x1]:candarray_going_small_segid[x1];
+ ArrayList * candarray_going_candid = going_large_cand_index?candarray_going_large_candid[x1]:candarray_going_small_candid[x1];
+
+ if( going_large_cand_index && (current_cand_masks & LRMIS_NEGATIVE_STRAND) && test_cand_seg_id >= segment_id_list[this_seed_cand_index]) continue;
+ if( (!going_large_cand_index) && (current_cand_masks & LRMIS_NEGATIVE_STRAND) && test_cand_seg_id <= segment_id_list[this_seed_cand_index]) continue;
+ if( going_large_cand_index && 0==(current_cand_masks & LRMIS_NEGATIVE_STRAND) && test_cand_seg_id <= segment_id_list[this_seed_cand_index]) continue;
+ if( (!going_large_cand_index) && 0==(current_cand_masks & LRMIS_NEGATIVE_STRAND) && test_cand_seg_id >= segment_id_list[this_seed_cand_index]) continue;
+
+ int is_repeated = 0;
+ for(x3 = 0; x3 < candarray_going_segid -> numOfElements; x3++){
+ int cand_oldid = ArrayListGet(candarray_going_segid, x3) - NULL - 1;
+ if(cand_oldid == test_cand_seg_id) is_repeated = 1;
+ }
+ if(is_repeated) continue;
+
+ int move_on_read = test_cand_read_pos - current_cand_read_pos;
+ long long int move_on_chro = test_cand_linear_location;
+ move_on_chro -= current_cand_read_chro;
+
+ if(reconcile_debug){
+ char p1txt[100], p2txt[100];
+ LRMpos2txt(context, current_cand_read_chro , p1txt);
+ LRMpos2txt(context, test_cand_linear_location , p2txt);
+ if(this_seed_cand_index == 324)LRMprintf("Trying Curr=%s (seg %d), Test=%s (seg %d), MoveChr=%lld, MoveRead=%d, current_cand_masks=%d, going_large_cand_index=%d\n", p1txt, current_cand_seg_id, p2txt, x2, move_on_chro, move_on_read, current_cand_masks, going_large_cand_index);
+ }
+
+ if(abs(move_on_read)<3 || abs(move_on_chro)<3) continue;
+
+ if(1){
+ int segid_diff = test_cand_seg_id - current_cand_seg_id;
+ if( going_large_cand_index && (current_cand_masks & LRMIS_NEGATIVE_STRAND) && segid_diff >= 0) continue;
+ if( (!going_large_cand_index) && (current_cand_masks & LRMIS_NEGATIVE_STRAND) && segid_diff <= 0) continue;
+
+ if( going_large_cand_index && 0==(current_cand_masks & LRMIS_NEGATIVE_STRAND) && segid_diff <= 0) continue;
+ if( (!going_large_cand_index) && 0==(current_cand_masks & LRMIS_NEGATIVE_STRAND) && segid_diff >= 0) continue;
+ }
+
+ if( going_large_cand_index && (current_cand_masks & LRMIS_NEGATIVE_STRAND) && move_on_read >=0 ) continue;
+ if((! going_large_cand_index) && (current_cand_masks & LRMIS_NEGATIVE_STRAND) && move_on_read <=0 ) continue;
+
+ if( going_large_cand_index && 0==(current_cand_masks & LRMIS_NEGATIVE_STRAND) && move_on_read <=0 ) continue;
+ if((! going_large_cand_index) && 0==(current_cand_masks & LRMIS_NEGATIVE_STRAND) && move_on_read >=0 ) continue;
+
+ int allowed_differential = max(abs(move_on_read), abs(move_on_chro)) *4 / 5;
+ allowed_differential = max(160, allowed_differential);
+
+ if(abs(move_on_chro) > abs(move_on_read) + allowed_differential || abs(move_on_chro) < abs(move_on_read) - allowed_differential) continue;
+ if(abs(move_on_chro) > context -> result_merge_tolerance) continue;
+
+ ArrayListPush(candarray_going_segid, NULL+1+test_cand_seg_id);
+ ArrayListPush(candarray_going_candid, NULL+1+test_cand_cand_id);
+
+ HashTablePut(candarray_used_segment_id, NULL+1+test_cand_seg_id, NULL+1);
+ HashTablePut(candarray_used_cand_id, NULL+1+test_cand_cand_id, NULL+1);
+
+ if(reconcile_debug){
+ char ptxt[100];
+ LRMpos2txt(context, test_cand_linear_location, ptxt);
+ LRMprintf(" Added %s; new V = %d + %d\n", ptxt, iteration_context -> segment_best_votes[test_cand_seg_id][test_cand_cand_id], candarray_votes[x1]);
+ }
+ candarray_votes[x1] += iteration_context -> segment_best_votes[test_cand_seg_id][test_cand_cand_id];
+
+ int replace_index = -1, repeat_index = -1;
+ if(candarray_votes[x1] > best_best_votes[trying_aligns-1]){
+
+ for(x3 = 0 ; x3 < trying_aligns; x3++){
+ if(best_best_chains[x3] == x1){
+ repeat_index = x3;
+ break;
+ }
+ }
+
+ if(repeat_index >=0){
+ for(x3 = repeat_index; x3 < trying_aligns-1; x3++){
+ best_best_votes[x3] = best_best_votes[x3 +1];
+ best_best_chains[x3] = best_best_chains[x3 +1];
+ }
+
+ best_best_votes[trying_aligns-1] = 0;
+ best_best_chains[trying_aligns-1] = 0;
+ }
+
+ for(x3 = 0 ; x3 < trying_aligns; x3++){
+ if(best_best_votes[x3] < candarray_votes[x1]){
+ replace_index = x3;
+ break;
+ }
+ }
+ if(replace_index>=0){
+ for(x3 = trying_aligns-1; x3 > replace_index; x3--){
+ best_best_votes[x3] = best_best_votes[x3 -1];
+ best_best_chains[x3] = best_best_chains[x3 -1];
+ }
+
+ best_best_votes[replace_index] = candarray_votes[x1];
+ best_best_chains[replace_index] = x1;
+ }
+ }
+ current_cand_index = x2;
+ }
+ }
+ }
+
+ if(reconcile_debug){
+ LRMprintf("\nREAD LEN = %d\n", iteration_context -> read_length);
+ for(x1 = 0; x1 < topseg_no; x1++){
+ LRMprintf("CHAIN (%d) %s: votes=%d, segs=%ld + %ld + 1\n", x1,(topseg_masks[x1]&LRMIS_NEGATIVE_STRAND)?"NEG":"POS", candarray_votes[x1], candarray_going_large_segid[x1]->numOfElements , candarray_going_small_segid[x1]->numOfElements);
+ }
+ }
+
+ int best_best_chain = -1;
+ if(best_best_votes[best_align_i] >0)
+ best_best_chain = best_best_chains[best_align_i];
+
+ LRMread_final_candidate_t * best_cand_rec = NULL;
+ if(best_best_chain >= 0){
+ if(reconcile_debug)LRMprintf("==========================\nBEST_BEST chain #%d, seed index: %d\n", best_best_chain, topseg_id_list[best_best_chain]);
+
+ ArrayList * resarr_left_segid = (topseg_masks[best_best_chain] & LRMIS_NEGATIVE_STRAND) ?candarray_going_large_segid[best_best_chain]:candarray_going_small_segid[best_best_chain];
+ ArrayList * resarr_left_candid = (topseg_masks[best_best_chain] & LRMIS_NEGATIVE_STRAND) ?candarray_going_large_candid[best_best_chain]:candarray_going_small_candid[best_best_chain];
+
+ ArrayList * resarr_right_segid = (topseg_masks[best_best_chain] & LRMIS_NEGATIVE_STRAND) ?candarray_going_small_segid[best_best_chain]:candarray_going_large_segid[best_best_chain];
+ ArrayList * resarr_right_candid = (topseg_masks[best_best_chain] & LRMIS_NEGATIVE_STRAND) ?candarray_going_small_candid[best_best_chain]:candarray_going_large_candid[best_best_chain];
+
+ best_cand_rec = malloc(sizeof(LRMread_final_candidate_t));
+ for(x2 = 0 ; x2 < resarr_left_segid -> numOfElements + 1 + resarr_right_candid -> numOfElements; x2++){
+ ArrayList * mesegid = NULL, *mecandid = NULL;
+ int meindex = -1;
+ if(x2 < resarr_left_segid -> numOfElements){
+ mesegid = resarr_left_segid;
+ mecandid = resarr_left_candid;
+ meindex = resarr_left_segid -> numOfElements - 1 - x2;
+ }
+ if(x2 >= resarr_left_segid -> numOfElements + 1){
+ mesegid = resarr_right_segid;
+ mecandid = resarr_right_candid;
+ meindex = x2 - resarr_left_segid -> numOfElements - 1;
+ }
+
+ int seed_seg_id = topseg_id_list[best_best_chain];
+ int seed_cand_id = topseg_cand_id_list[best_best_chain];
+ if(meindex>=0){
+ seed_seg_id = ArrayListGet(mesegid, meindex)-NULL-1;
+ seed_cand_id = ArrayListGet(mecandid, meindex)-NULL-1;
+ }
+
+ if(reconcile_debug){
+ char ptxt[100];
+ LRMpos2txt(context, iteration_context -> segment_best_pos[seed_seg_id][seed_cand_id], ptxt);
+ LRMprintf("BEST_BEST [%d] is SEG %d: %s %u\n",x2, seed_seg_id, ptxt, iteration_context -> segment_best_pos[seed_seg_id][seed_cand_id]);
+ }
+
+ if(meindex == 0){
+ best_cand_rec -> read_head_pos = iteration_context -> segment_best_pos[seed_seg_id][seed_cand_id] - (iteration_context -> segment_texts [seed_seg_id] - iteration_context -> read_text);
+ best_cand_rec -> masks = iteration_context -> segment_best_masks[seed_seg_id][seed_cand_id];
+ }
+ best_cand_rec -> segment_id[x2] = seed_seg_id;
+ best_cand_rec -> realign_cand_id[x2] = seed_cand_id;
+ }
+
+ read_res -> final_pos = best_cand_rec -> read_head_pos;
+ read_res -> masks = best_cand_rec -> masks;
+ read_res -> votes[best_align_i] = best_best_votes[best_align_i];
+ best_cand_rec -> segment_number = resarr_left_segid -> numOfElements + 1 + resarr_right_candid -> numOfElements;
+
+ for(x1 = 0; x1 < topseg_no; x1++){
+ ArrayListDestroy(candarray_going_large_segid[x1]);
+ ArrayListDestroy(candarray_going_large_candid[x1]);
+ ArrayListDestroy(candarray_going_small_segid[x1]);
+ ArrayListDestroy(candarray_going_small_candid[x1]);
+ }
+ }
+
+ if(best_cand_rec){
+ long long final_mapping_pos = -1;
+ long long merged_chro_cursor = -1, merged_read_cursor = -1;
+ long long cigar_read_cursor = -1, cigar_chro_cursor = -1;
+ char * target_cigar = iteration_context -> merged_cigar[best_align_i], *old_cigar, ** old_txt_ptr = NULL;
+ int target_cigar_ptr = 0, last_seg_last_base_read = -1, cigar_opts = 0, last_written_cigar_read = 0, *old_seg_length, read_matched_bases = 0, read_mismatched_bases = 0;
+ long long last_seg_last_base_chro = -1, last_written_cigar_chro = 0;
+ unsigned int * old_best_pos;
+
+ old_cigar = malloc((LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1) * (LRMSEGMENT_CIGAR_SIZE + 1));
+ old_txt_ptr = malloc(sizeof(void *) *(LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1));
+ old_seg_length = malloc(sizeof(int) *(LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1));
+ old_best_pos = malloc(sizeof(int) *(LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1));
+
+ memset(target_cigar, 0, LRMMERGE_CIGAR_SIZE+1);
+ if(best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) LRMreverse_read(iteration_context -> read_text, iteration_context -> read_length);
+ // remove "S" sections from the middle parts
+
+ long long validateCigar_cursor = -1;
+ int written_read_cursor = 0, dynamic_done;
+ for(x1 = 0; x1 < best_cand_rec -> segment_number ; x1 ++){
+ int this_seg_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> segment_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> segment_id[x1];
+ int this_segcand_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> realign_cand_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> realign_cand_id[x1];
+
+ strcpy(old_cigar + this_seg_id * (LRMSEGMENT_CIGAR_SIZE + 1), iteration_context -> segment_cigars[this_seg_id][this_segcand_id]);
+ old_seg_length[this_seg_id] = iteration_context -> segment_lengths[this_seg_id] ;
+ old_txt_ptr[this_seg_id] = iteration_context -> segment_texts[this_seg_id];
+ old_best_pos[this_seg_id] = iteration_context -> segment_best_pos[this_seg_id][this_segcand_id];
+
+ if(1){
+ int cigar_max = strlen( iteration_context -> segment_cigars[this_seg_id][this_segcand_id]) + 1;
+ cigar_max = min( LRMMERGE_CIGAR_SIZE, cigar_max );
+ char * new_cigar = malloc(cigar_max+1);
+ new_cigar[0]=0;
+ int new_cigar_ptr = 0;
+ int cci = 0, tmpi = 0, nch, is_first_section = 1;
+
+ if(0){//&& FIXLENstrcmp("0a7d1c2c-1acd-4073-8497-e8766a77fff9_Basecall_1D_template" , iteration_context -> read_name)==0){
+ char och = iteration_context -> segment_texts[this_seg_id][ iteration_context -> segment_lengths[this_seg_id] ];
+ iteration_context -> segment_texts[this_seg_id][ iteration_context -> segment_lengths[this_seg_id] ] = 0;
+ LRMprintf("CIGAR OLD of SEG %d :%s, NEW: %p, LEN=%d NCH='%c'(%d) PTR SIZE=%d\nREAD = %s\n", this_seg_id, iteration_context -> segment_cigars[this_seg_id][this_segcand_id], new_cigar, tmpi, nch, nch, cigar_max - new_cigar_ptr, iteration_context -> segment_texts[this_seg_id]);
+ iteration_context -> segment_texts[this_seg_id][ iteration_context -> segment_lengths[this_seg_id] ] = och;
+ }
+
+ while(0!=(nch = iteration_context -> segment_cigars[this_seg_id][this_segcand_id] [cci++])){
+ if(nch >='0' && nch <='9')
+ tmpi = tmpi*10+(nch - '0');
+ else{
+ if(nch == 'S'){
+ if(is_first_section == ((best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND)?0:1))
+ iteration_context -> segment_texts[this_seg_id] += tmpi;
+ if(is_first_section)
+ iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] += tmpi;
+ iteration_context -> segment_lengths[this_seg_id] -= tmpi;
+ }else if(tmpi > 0){
+ int has_substantial = nch == 'M' || nch == 'I';
+ if(!has_substantial){
+ int ncci, nnch, ntmpi=0;
+ for(ncci = cci; 0!=(nnch = iteration_context -> segment_cigars[this_seg_id][this_segcand_id] [ncci]); ncci++){
+ if(nnch !='0' && isdigit(nnch)) ntmpi = 1;
+ if(( nnch == 'I' || nnch == 'M' ) && ntmpi){
+ has_substantial = 1;
+ break;
+ }
+ if(isalpha(nnch)) ntmpi=0;
+ }
+ }
+ if(has_substantial) new_cigar_ptr += snprintf(new_cigar+new_cigar_ptr, cigar_max - new_cigar_ptr, "%d%c", tmpi, nch);
+ }
+ is_first_section = 0;
+ tmpi=0;
+ }
+ }
+ strncpy(iteration_context -> segment_cigars[this_seg_id][this_segcand_id], new_cigar, LRMSEGMENT_CIGAR_SIZE);
+ free(new_cigar);
+ assert(0 < iteration_context -> segment_lengths[this_seg_id]);
+ }
+
+ dynamic_done = 0;
+ if(1){
+ int this_start_offset = iteration_context -> segment_texts[this_seg_id] - iteration_context -> read_text;
+
+ assert(iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] >=this_start_offset);
+
+ if(best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ){
+ int R_pos = iteration_context -> read_length - (iteration_context -> segment_texts[this_seg_id] - iteration_context -> read_text + iteration_context -> segment_lengths[this_seg_id]);
+ //assert(iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] >=R_pos);
+ if(iteration_context -> segment_best_pos[this_seg_id][this_segcand_id]< R_pos){
+ LRMprintf("REVERSED_RPOS: %s\n", iteration_context -> read_name);
+ }
+ this_start_offset = R_pos;
+ }
+
+ if(0){
+ char postxt[100];
+ LRMpos2txt(context, iteration_context -> segment_best_pos[this_seg_id][this_segcand_id], postxt);
+ LRMprintf("TRY SEGMENT: LEN %d, READ+%d, CHRO %s\n", iteration_context -> read_length, this_start_offset , postxt);
+ }
+
+ if(validateCigar_cursor<0) validateCigar_cursor = this_start_offset;
+
+ if(final_mapping_pos < 0){
+ final_mapping_pos = iteration_context -> segment_best_pos[this_seg_id][this_segcand_id];// - this_start_offset;
+
+ if(this_start_offset > 0 && this_start_offset < LRMDYNAMIC_MAXIMUM_GAP_LENGTH -1 ){
+ unsigned int total_mismatched_bases = 0;
+ int moves = LRMindel_dynamic_search_unknownregion(context, thread_context,0, final_mapping_pos, iteration_context -> read_text , 0, this_start_offset, &total_mismatched_bases, iteration_context -> read_length / LRMLONGREAD_DENOMINATOR, iteration_context -> read_name);
+ if(1 && moves >0){
+ int softclipping_bases = 0;
+ //LRMprintf("HEAD_MOVES=%d : %s\n", moves, thread_context -> dynamic_programming_indel_movement_buf);
+ long long old_cigar_chro_cursor = cigar_chro_cursor;
+ LRMmoves_to_cigar(context, thread_context, iteration_context, &cigar_chro_cursor, &cigar_read_cursor, &target_cigar_ptr, &last_written_cigar_chro, &correct_mapping, moves, &cigar_opts, &last_written_cigar_read, &softclipping_bases, &read_matched_bases, &read_mismatched_bases, best_align_i);
+ final_mapping_pos -= (cigar_chro_cursor - old_cigar_chro_cursor) - softclipping_bases;
+ dynamic_done = 1;
+ }
+ }
+ if(this_start_offset > 0 && !dynamic_done)
+ if(cigar_opts < context -> max_cigar_opts_in_read - 5 && target_cigar_ptr < LRMMERGE_CIGAR_SIZE - 70)
+ target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - 40 - target_cigar_ptr, "%dS", this_start_offset);
+
+ written_read_cursor = this_start_offset;
+ cigar_read_cursor = this_start_offset;
+ cigar_opts ++;
+ cigar_chro_cursor = iteration_context -> segment_best_pos[this_seg_id][this_segcand_id];
+ last_written_cigar_chro = cigar_chro_cursor;
+ LRMlocate_chro_length(context, final_mapping_pos, &this_chromosome_name ,&this_chromosome_end);
+ }
+ merged_chro_cursor = iteration_context -> segment_best_pos[this_seg_id][this_segcand_id];
+ merged_read_cursor = this_start_offset;
+
+ if(0){
+ char postxt[100];
+ LRMpos2txt(context, iteration_context -> segment_best_pos[this_seg_id][this_segcand_id], postxt );
+ LRMprintf("FINAL MERGING : %s [%d / %d] ; CHRO POS = %s (%s) ; READ POS = %d ; CIGAR = %s\n", iteration_context -> read_name, this_seg_id, iteration_context -> total_segments, postxt, (best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND)?"NEG":"POS", this_start_offset, iteration_context -> segment_cigars[this_seg_id][this_segcand_id] );
+ }
+
+ if(1){
+ long long chro_pos_from_new = LRMcalculate_written_chro_pos(context, thread_context, iteration_context, written_read_cursor, merged_read_cursor, iteration_context -> segment_cigars[this_seg_id][this_segcand_id], merged_chro_cursor);
+ if(chro_pos_from_new < 0){
+ continue;
+ }
+
+ long long delta = 0;
+ if(last_seg_last_base_chro>0)
+ delta =chro_pos_from_new - last_seg_last_base_chro;
+
+ long long deltaOFF = delta - (this_start_offset - last_seg_last_base_read);
+
+ if(0){
+ char postxt[100];
+ LRMpos2txt(context, last_seg_last_base_chro , postxt);
+ LRMprintf(" ========== DYNAMIC : %d ~ %d FROM %s ============= DELTA=%lld, %d > %lld\n", last_seg_last_base_read, this_start_offset , postxt , deltaOFF, LRMDYNAMIC_MAXIMUM_GAP_LENGTH -1 , this_start_offset - last_seg_last_base_read - min(0, deltaOFF));
+ }
+
+ if( this_start_offset > last_seg_last_base_read + 5 && delta > 0 && abs(deltaOFF) <= context -> max_dynamic_indel_length && LRMDYNAMIC_MAXIMUM_GAP_LENGTH -1 > this_start_offset - last_seg_last_base_read - min(0, deltaOFF)){
+ unsigned int total_mismatched_bases = 0;
+ int moves = LRMindel_dynamic_search(context, thread_context,-(int)deltaOFF, last_seg_last_base_chro, iteration_context -> read_text , last_seg_last_base_read, this_start_offset , &total_mismatched_bases, iteration_context -> read_length / LRMLONGREAD_DENOMINATOR , iteration_context -> read_name);
+ //LRMprintf("MIDDLE_MOVES=%d : %s\n", moves, thread_context -> dynamic_programming_indel_movement_buf);
+ LRMmoves_to_cigar(context, thread_context, iteration_context, &cigar_chro_cursor, &cigar_read_cursor, &target_cigar_ptr, &last_written_cigar_chro, &correct_mapping, moves, &cigar_opts, &last_written_cigar_read, NULL, &read_matched_bases, &read_mismatched_bases, best_align_i);
+ written_read_cursor = this_start_offset;
+ }else{
+ if(last_seg_last_base_chro>0 && this_start_offset > last_seg_last_base_read) {
+
+ cigar_chro_cursor += this_start_offset - last_seg_last_base_read;
+ cigar_read_cursor += this_start_offset - last_seg_last_base_read;
+
+ if(cigar_opts < context -> max_cigar_opts_in_read - 5 && target_cigar_ptr < LRMMERGE_CIGAR_SIZE - 70){
+ last_written_cigar_read = cigar_read_cursor;
+ last_written_cigar_chro = max(last_written_cigar_chro,cigar_chro_cursor);
+ target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - 40 - target_cigar_ptr, "%dM", this_start_offset - last_seg_last_base_read);
+ if(this_start_offset - last_seg_last_base_read > 0)correct_mapping = 1;
+ }
+ cigar_opts ++;
+
+ //LRMprintf("======== GAP MMM = %d\n", this_start_offset - last_seg_last_base_read);
+ delta -= this_start_offset - last_seg_last_base_read;
+ written_read_cursor = this_start_offset;
+ }
+
+ if(delta){
+
+ long long delta_written = abs(delta);
+
+ if(delta < 0) delta_written = min(delta_written, iteration_context -> read_length - cigar_read_cursor);
+
+ if(delta>0) cigar_chro_cursor += delta_written;
+ else cigar_read_cursor += delta_written;
+
+ if(cigar_opts < context -> max_cigar_opts_in_read - 5 && target_cigar_ptr < LRMMERGE_CIGAR_SIZE - 70){
+ target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - 40 - target_cigar_ptr, "%lld%c" , delta_written, delta > 0?'D':'I' );
+ last_written_cigar_read = cigar_read_cursor;
+ last_written_cigar_chro = max(last_written_cigar_chro,cigar_chro_cursor);
+ }
+ cigar_opts ++;
+ //LRMprintf("======== GAP NNN = %lld\n", delta);
+
+ if(delta < 0)
+ written_read_cursor += delta_written;
+ }
+ }
+ }
+
+ if(merged_chro_cursor >= 0){
+ int cci = 0, tmpi = 0, nch;
+ while(0!=(nch = iteration_context -> segment_cigars[this_seg_id][this_segcand_id] [cci++])){
+ if(nch >='0' && nch <='9')
+ tmpi = tmpi*10+(nch - '0');
+ else{
+ if(nch == 'M' || nch == 'S' || nch == 'D' || nch == 'N')merged_chro_cursor += tmpi;
+ if(nch == 'M' || nch == 'S' || nch == 'I') merged_read_cursor += tmpi;
+
+ if(written_read_cursor <=merged_read_cursor){
+ int writting_optlen = tmpi;
+ if(nch == 'M' || nch == 'S' || nch == 'I'){
+ writting_optlen = (merged_read_cursor - written_read_cursor);
+ cigar_read_cursor += writting_optlen;
+ }
+ if(nch == 'M' || nch == 'S' || nch == 'D' || nch == 'N')
+ cigar_chro_cursor += writting_optlen;
+
+ if(cigar_opts < context -> max_cigar_opts_in_read - 5 && target_cigar_ptr < LRMMERGE_CIGAR_SIZE - 70){
+ target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - 40 - target_cigar_ptr, "%d%c" , writting_optlen, nch);
+ last_written_cigar_read = cigar_read_cursor;
+ last_written_cigar_chro = max(last_written_cigar_chro,cigar_chro_cursor);
+
+ if(nch=='M' && writting_optlen > 0){
+ correct_mapping =1;
+ read_matched_bases += writting_optlen;
+ }
+ }
+ cigar_opts ++;
+ written_read_cursor = merged_read_cursor;
+ }
+ tmpi = 0;
+ }
+ }
+ last_seg_last_base_read = LRM_USE_CIGAR_RESCURE? cigar_read_cursor :merged_read_cursor;
+ last_seg_last_base_chro = LRM_USE_CIGAR_RESCURE? cigar_chro_cursor :merged_chro_cursor;
+ }
+ }
+
+
+ }
+
+ dynamic_done = 0;
+ if(last_written_cigar_read < iteration_context -> read_length && (iteration_context -> read_length - last_written_cigar_read) < LRMDYNAMIC_MAXIMUM_GAP_LENGTH -1){
+ unsigned int total_mismatched_bases = 0;
+ int moves = LRMindel_dynamic_search_unknownregion(context, thread_context,1, last_written_cigar_chro, iteration_context -> read_text , last_written_cigar_read, iteration_context -> read_length, &total_mismatched_bases, iteration_context -> read_length / LRMLONGREAD_DENOMINATOR, iteration_context -> read_name);
+ if(1 && moves >0){
+ //LRMprintf("TAIL_MOVES=%d : %s\n", moves, thread_context -> dynamic_programming_indel_movement_buf);
+ LRMmoves_to_cigar(context, thread_context, iteration_context, &cigar_chro_cursor, &cigar_read_cursor, &target_cigar_ptr, &last_written_cigar_chro, &correct_mapping, moves, &cigar_opts, &last_written_cigar_read, NULL, &read_matched_bases, &read_mismatched_bases, best_align_i);
+ dynamic_done = 1;
+ }
+ //if(target_cigar_ptr < LRMMERGE_CIGAR_SIZE)target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - target_cigar_ptr, "%dS", iteration_context -> read_length - last_written_cigar_read );
+ }
+
+ if(last_written_cigar_read < iteration_context -> read_length && !dynamic_done){
+ if(cigar_opts < context -> max_cigar_opts_in_read - 5 && target_cigar_ptr < LRMMERGE_CIGAR_SIZE - 70){
+ target_cigar_ptr += snprintf( target_cigar + target_cigar_ptr, LRMMERGE_CIGAR_SIZE - 40 - target_cigar_ptr, "%dS" , iteration_context -> read_length - last_written_cigar_read);
+ last_written_cigar_read = iteration_context -> read_length;
+ }
+ }
+
+ char * chro_name_end=NULL;
+ int chro_pos_end = 0;
+ LRMlocate_gene_position(context, last_written_cigar_chro, &chro_name_end, &chro_pos_end);
+
+ if( read_matched_bases < (read_matched_bases + read_mismatched_bases) * 75 / 100 || read_matched_bases < context -> min_matched_bases || last_written_cigar_read != iteration_context -> read_length || last_written_cigar_chro > this_chromosome_end || chro_name_end != this_chromosome_name ){
+ if(context -> show_read_validation)LRMprintf("Matched bases: %d in %d, last_written_cigar_read %d == %d, last_written_cigar_chro %s == %s\n",read_matched_bases, (read_matched_bases + read_mismatched_bases), last_written_cigar_read, iteration_context -> read_length, chro_name_end, this_chromosome_name );
+ correct_mapping = 0;
+ }
+
+ if(best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) LRMreverse_read(iteration_context -> read_text, iteration_context -> read_length);
+
+ iteration_context -> merged_position[best_align_i] = final_mapping_pos;
+ iteration_context -> merged_masks[best_align_i] = best_cand_rec -> masks;
+
+ if(context -> show_read_validation){// || FIXLENstrcmp("1e5f9005-2d77-464c-ba1f-2dda2559d782_Basecall_1D_template" , iteration_context -> read_name)==0){
+ if( correct_mapping ){
+ char postxt[100];
+ int mapped_length = 0;
+
+ LRMfix_cigar(context, thread_context, iteration_context, target_cigar);
+ LRMfix_cigar(context, thread_context, iteration_context, target_cigar);
+ LRMfix_cigar(context, thread_context, iteration_context, target_cigar);
+
+ LRMpos2txt(context , final_mapping_pos, postxt);
+ LRMprintf("\nFINAL READ %s to %s (%s) (%lld)\n", iteration_context -> read_name, postxt, best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND?"NEG":"POS", final_mapping_pos);
+ int matched_bases = LRMvalidate_mapping(context , iteration_context -> read_text, target_cigar, &context -> current_base_index, final_mapping_pos, best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND, &mapped_length , 1);
+ LRMprintf("Matched %d in %d : %s\n", matched_bases, mapped_length, target_cigar);
+ //LRMprintf("TWO CHROS: %s, %s ; MAPPED: %lld - %lld , END: %lld\n", chro_name_end, this_chromosome_name, final_mapping_pos, last_written_cigar_chro, this_chromosome_end);
+ LRMprintf("Segments mapped = %d, %d ~ %d, out of %d\n", best_cand_rec -> segment_number , best_cand_rec -> segment_id[0], best_cand_rec -> segment_id[best_cand_rec -> segment_number - 1], iteration_context -> total_segments);
+ LRMprintf("\n\n");
+ }
+ LRMprintf("Matched bases = %d ; Mismatched bases = %d\n", read_matched_bases, read_mismatched_bases);
+ if(!correct_mapping) LRMprintf("ERROR IN MAPPING !!!\n");
+ }
+ //if (!correct_mapping)LRMprintf("WARNING: unexpected mapping result of %s\n", iteration_context -> read_name);
+
+ for(x1 = 0; x1 < best_cand_rec -> segment_number ; x1 ++){
+ int this_seg_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> segment_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> segment_id[x1];
+ int this_segcand_id = ( best_cand_rec -> masks & LRMIS_NEGATIVE_STRAND ) ? best_cand_rec -> realign_cand_id[best_cand_rec -> segment_number - x1 - 1] : best_cand_rec -> realign_cand_id[x1];
+
+
+ //LRMprintf("SEG_ID=%d\n", this_seg_id);
+ strcpy(iteration_context -> segment_cigars[this_seg_id][this_segcand_id], old_cigar + this_seg_id * (LRMSEGMENT_CIGAR_SIZE + 1));
+ iteration_context -> segment_lengths[this_seg_id] = old_seg_length[this_seg_id];
+ iteration_context -> segment_texts[this_seg_id] = old_txt_ptr[this_seg_id];
+ iteration_context -> segment_best_pos[this_seg_id][this_segcand_id] = old_best_pos[this_seg_id];
+ }
+ iteration_context -> merged_matched_bases[best_align_i] = read_matched_bases;
+ free(best_cand_rec);
+
+ free(old_seg_length);
+ free(old_cigar);
+ free(old_txt_ptr);
+ free(old_best_pos);
+
+ if(0)if(1||cigar_opts >66000 || target_cigar_ptr > LRMMERGE_CIGAR_SIZE - 1000){
+ LRMprintf("CLOSE_TO_LIM: [%s , %d] %s, Ops=%d, Len=%d ~ %d\n", correct_mapping?"CORRECT": "ERROR", best_align_i, iteration_context -> read_name , cigar_opts, target_cigar_ptr, LRMMERGE_CIGAR_SIZE);
+ }
+
+ } // end : if the best candidate list is not NULL
+
+ if(0 &&FIXLENstrcmp("0a7d1c2c-1acd-4073-8497-e8766a77fff9_Basecall_1D_template" , iteration_context -> read_name)==0){
+ LRMprintf("TEST_M %s ; Has_M = %d ; Cigar = %s\n", iteration_context -> read_name, correct_mapping, iteration_context -> merged_cigar[best_align_i]);
+ }
+ if(!correct_mapping){
+ read_res -> votes[best_align_i] =0;
+ iteration_context -> merged_cigar[best_align_i][0]=0;
+ }
+
+ }
+
+ free(segment_id_list);
+ free(segment_cand_id_list);
+ free(segment_linear_list);
+ HashTableDestroy(candarray_used_segment_id);
+ HashTableDestroy(candarray_used_cand_id);
+
+ return 0;
+}
+
+
+void LRMdo_one_realign_read(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context){ // Realigns every voted candidate of each segment of the current read, merges the segment results into whole-read alignments, and passes the resulting record(s) to the output writer.
+ int flags=4; // handed to the writer; set below to 0 (forward), 16 (LRMIS_NEGATIVE_STRAND set) or 4 (nothing reportable) -- presumably SAM FLAG bits, TODO confirm
+ int map_quality = 0; // becomes 10 once a record with a locatable position is produced
+ int mis_matched = 0; // never modified in this function; always passed to the writer as 0
+ int seg_i;
+ // context: shared settings and read_mapping_results; thread_context: passed to the helpers, its mapped_reads counter is updated at the end; iteration_context: the current read plus its per-segment and merged-result buffers.
+ LRMread_mapping_result_t * read_res = context -> read_mapping_results + iteration_context -> read_no_in_chunk; // result record of this read, selected by its number within the chunk
+ memset(iteration_context -> segment_best_candidate_score, 0, sizeof(int) *(LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1) * LRMMERGING_MAX_CANDIDATES); // start from all-zero scores and votes; NOTE(review): segment_best_pos is not cleared here although the is_repeated test below reads it -- confirm it is reset elsewhere
+ memset(iteration_context -> segment_best_votes, 0, sizeof(int) *(LRMMAX_READ_LENGTH / LRMSEGMENT_MIN_DIST+1) * LRMMERGING_MAX_CANDIDATES);
+ // Step 1: for every segment, realign its candidates and keep up to LRMMERGING_MAX_CANDIDATES of them in descending score order.
+ for(seg_i = 0; seg_i < iteration_context -> total_segments ; seg_i++){
+ LRMrealign_context_t realign_context;
+ int * this_best_scores = iteration_context -> segment_best_candidate_score[seg_i];
+
+ int cand_i;
+ for(cand_i = 0; cand_i < LRMSEGMENT_MAX_CANDIDATES; cand_i ++){ // candidates are tried in index order; the first one without votes ends the scan (see the break below)
+ LRMsegment_mapping_candidate_t * cand_res = read_res -> segment_results[seg_i].candidates + cand_i;
+
+ if(0 && FIXLENstrcmp("0a7d1c2c-1acd-4073-8497-e8766a77fff9_Basecall_1D_template", iteration_context -> read_name) == 0){ // disabled debug trace (the condition starts with 0)
+ char postxt[100];
+ LRMpos2txt(context, cand_res -> first_base_position , postxt);
+ LRMprintf("TRY REALIGN READ %s [%d , %d] : V=%d ; POS=%s (%s)\n", iteration_context -> read_name, seg_i, cand_i, cand_res -> votes, postxt, (cand_res -> masks & LRMIS_NEGATIVE_STRAND)?"NEG":"POS");
+ }
+ if(cand_res -> votes < 1) break;
+
+ memset(&realign_context, 0, sizeof(LRMrealign_context_t)); // fresh realignment state for this (segment, candidate) pair
+ realign_context.current_segment_id = seg_i;
+ realign_context.current_candidate_id = cand_i;
+ LRMrealign_one_segment(context, thread_context, iteration_context, &realign_context);
+ if(realign_context.best_stack_score[0]+realign_context.best_stack_score[1] > this_best_scores[LRMMERGING_MAX_CANDIDATES-1]){ // only continue if the combined score beats the last slot of the list
+ int replace_i, replace_index = LRMMERGING_MAX_CANDIDATES-1, is_repeated = 0;
+ for(replace_i = LRMMERGING_MAX_CANDIDATES-1; replace_i >= 0; replace_i --){ // scan every slot from last to first: replace_index ends as the lowest slot whose score is beaten; is_repeated flags a slot already holding the same chromosome position
+ if(realign_context.best_stack_score[0]+realign_context.best_stack_score[1] > this_best_scores[replace_i])
+ replace_index = replace_i;
+ if(iteration_context -> segment_best_pos[seg_i][replace_i] == realign_context.best_chro_pos) is_repeated = 1;
+ }
+
+ if(0 == is_repeated){
+ for(replace_i = LRMMERGING_MAX_CANDIDATES - 2 ; replace_i >= replace_index; replace_i--){ // shift the slots from replace_index onwards down by one; the former last slot is overwritten
+ strncpy(iteration_context -> segment_cigars[seg_i][replace_i+1], iteration_context -> segment_cigars[seg_i][replace_i], LRMSEGMENT_CIGAR_SIZE);
+ iteration_context -> segment_best_pos[seg_i][replace_i+1] = iteration_context -> segment_best_pos[seg_i][replace_i];
+ iteration_context -> segment_best_masks[seg_i][replace_i+1] = iteration_context -> segment_best_masks[seg_i][replace_i];
+ iteration_context -> segment_best_candidate_score[seg_i][replace_i+1] = iteration_context -> segment_best_candidate_score[seg_i][replace_i];
+ iteration_context -> segment_best_votes[seg_i][replace_i+1] = iteration_context -> segment_best_votes[seg_i][replace_i];
+ }
+
+ strncpy(iteration_context -> segment_cigars[seg_i][replace_index], realign_context.best_cigar, LRMSEGMENT_CIGAR_SIZE); // NOTE(review): strncpy leaves the destination unterminated if the source has LRMSEGMENT_CIGAR_SIZE or more characters -- confirm the buffers are sized/terminated accordingly
+ iteration_context -> segment_best_pos[seg_i][replace_index] = realign_context.best_chro_pos;
+ iteration_context -> segment_best_masks[seg_i][replace_index] = cand_res -> masks; // masks and votes are copied from the voting candidate, not from the realignment
+ iteration_context -> segment_best_candidate_score[seg_i][replace_index] = realign_context.best_stack_score[0]+realign_context.best_stack_score[1] ;
+ iteration_context -> segment_best_votes[seg_i][replace_index] = cand_res -> votes;
+ }
+ }
+ }
+ }
+ // Step 2: reset every merged CIGAR to "*", then run the merge (presumably it fills merged_cigar / merged_position / merged_masks / merged_matched_bases, which are read below -- TODO confirm).
+ int best_align_i;
+ for(best_align_i = 0; best_align_i < LRMMAX_MULTI_BEST; best_align_i++)
+ strcpy(iteration_context -> merged_cigar[best_align_i], "*");
+ LRMread_final_result_merge(context, thread_context, iteration_context, read_res);
+
+ // Step 3: blank out ("*") merged results that repeat an earlier merged_position, have no votes, or cannot be located by LRMlocate_gene_position.
+ int best_matched_bases = 0;
+ for(best_align_i = 0; best_align_i < LRMMAX_MULTI_BEST; best_align_i++){
+ int x2, repeated = 0;
+ for(x2 = 0; x2 < best_align_i; x2++){
+ if(iteration_context -> merged_position[x2] == iteration_context -> merged_position[best_align_i]){
+ repeated=1;
+ break;
+ }
+ }
+ if(repeated) strcpy(iteration_context -> merged_cigar[best_align_i], "*");
+
+ char * chro_name="*";
+ int chro_pos = -1;
+ int not_found = LRMlocate_gene_position(context, iteration_context -> merged_position[best_align_i], &chro_name, & chro_pos); // a non-zero return is treated as "position not found"
+ if(read_res -> votes[best_align_i] < 1 ||not_found) strcpy(iteration_context -> merged_cigar[best_align_i], "*");
+ }
+ for(best_align_i = 0; best_align_i < LRMMAX_MULTI_BEST; best_align_i++){ // find the highest merged_matched_bases among the results that still have a real CIGAR
+ if(iteration_context -> merged_cigar[best_align_i][0]==0) continue;
+ if(iteration_context -> merged_cigar[best_align_i][0]=='*') continue;
+ if(0){
+ char p1txt[100];
+ LRMpos2txt(context, iteration_context -> merged_position[best_align_i], p1txt);
+ LRMprintf("TEQLL #%d, score=%d, pos=%s %u\n", best_align_i, iteration_context -> merged_matched_bases[best_align_i],p1txt, iteration_context -> merged_position[best_align_i]);
+ }
+ if(iteration_context -> merged_matched_bases[best_align_i] > best_matched_bases){
+ best_matched_bases = iteration_context -> merged_matched_bases[best_align_i];
+ }
+ }
+ // Step 4: keep only the results that reach best_matched_bases and count them.
+ int equally_best_results = 0;
+ for(best_align_i = 0; best_align_i < LRMMAX_MULTI_BEST; best_align_i++){
+ if(iteration_context -> merged_cigar[best_align_i][0]==0) continue;
+ if(iteration_context -> merged_cigar[best_align_i][0]=='*') continue;
+ if(iteration_context -> merged_matched_bases[best_align_i] < best_matched_bases){
+ strcpy(iteration_context -> merged_cigar[best_align_i], "*");
+ }else equally_best_results++;
+ }
+ // With unique_only set, a tie between several best results discards all of them.
+ if(context -> unique_only && equally_best_results>1){
+ for(best_align_i = 0; best_align_i < LRMMAX_MULTI_BEST; best_align_i++)
+ strcpy(iteration_context -> merged_cigar[best_align_i], "*");
+ equally_best_results = 0;
+ }
+
+ // Step 5: hand up to total_output_records records to the writer (at least one, even when nothing is reportable).
+ int this_mapping_loc = 1; // 1-based index of the next record, passed to the writer
+ int total_output_records = max(1, min(context -> max_best_alignments, equally_best_results));
+ //LRMprintf("EQLL %s #=%d, score=%d\tTOTAL_LOC=%d\tBB=%d\n", iteration_context -> read_name, equally_best_results, best_matched_bases, total_output_records, context -> max_best_alignments);
+ for(best_align_i = 0; best_align_i < LRMMAX_MULTI_BEST; best_align_i++){
+ char * chro_name="*";
+ int chro_pos = -1;
+ if(best_align_i>0 && equally_best_results < 1){ // with no reportable result only slot 0 gets past the filters, so exactly one record (flags 4) is still written
+ if(iteration_context -> merged_cigar[best_align_i][0]==0) continue;
+ if(iteration_context -> merged_cigar[best_align_i][0]=='*') continue;
+ }
+
+ if(equally_best_results >0){ // otherwise skip every slot that has no real CIGAR
+ if(iteration_context -> merged_cigar[best_align_i][0]==0) continue;
+ if(iteration_context -> merged_cigar[best_align_i][0]=='*') continue;
+ }
+
+ if(this_mapping_loc >=total_output_records+1)continue; // nothing more is written once total_output_records records have been emitted
+
+ if( iteration_context -> merged_cigar[best_align_i][0]!=0 && iteration_context -> merged_cigar[best_align_i][0]!='*' ){
+ int not_found = LRMlocate_gene_position(context, iteration_context -> merged_position[best_align_i], &chro_name, & chro_pos);
+ if(not_found){
+ flags = 4;
+ strcpy(iteration_context -> merged_cigar[best_align_i], "*");
+ chro_pos = -1;
+ chro_name = "*";
+ }else{
+ map_quality = 10;// read_res -> votes + 10;
+ if( iteration_context -> merged_masks[best_align_i] & LRMIS_NEGATIVE_STRAND){
+ flags=16;
+ LRMreverse_read_and_qual(context, thread_context, iteration_context); // NOTE(review): presumably reverses the read and quality text in place; nothing in this function undoes it, so any later record would see the already reversed read -- confirm this is intended
+ }else flags = 0;
+ }
+ }else flags = 4;
+
+ LRMfix_cigar(context, thread_context, iteration_context, iteration_context -> merged_cigar[best_align_i]); // LRMfix_cigar is applied three times in a row; the reason is not visible in this function
+ LRMfix_cigar(context, thread_context, iteration_context, iteration_context -> merged_cigar[best_align_i]);
+ LRMfix_cigar(context, thread_context, iteration_context, iteration_context -> merged_cigar[best_align_i]);
+
+ LRMwrite_chunk_add_buffered_output(context, thread_context, iteration_context, flags, chro_name, chro_pos, map_quality, iteration_context -> merged_cigar[best_align_i], mis_matched, total_output_records, this_mapping_loc++);
+ }
+ if( equally_best_results>0 ) thread_context -> mapped_reads ++; // the read counts as mapped if at least one result was reportable
+}
+
+int LRMchunk_read_iteration(LRMcontext_t * context, int thread_id, int task){ // Per-thread worker loop: fetches reads one by one, splits each into segments and runs the voting or realignment step selected by task.
+    LRMthread_context_t * thread_context = context -> thread_contexts + thread_id;
+    // Returns 0 after the fetch loop ends, or -1 if the iteration context cannot be allocated (nothing is processed in that case).
+    // The context is allocated on the heap and zero-filled; calloc replaces the former malloc + memset pair.
+    LRMread_iteration_context_t * iteration_context = calloc(1, sizeof *iteration_context);
+    if(iteration_context == NULL){ // previously the unchecked malloc result went straight into memset
+        LRMprintf("ERROR: unable to allocate the read iteration context for thread %d\n", thread_id);
+        return -1;
+    }
+    while(1){
+        int retv = LRMfetch_next_read(context, thread_context, &iteration_context-> read_length, iteration_context->read_name, iteration_context->read_text, iteration_context->qual_text, &iteration_context -> read_no_in_chunk);
+        if(retv) break; // a non-zero result ends the loop (presumably no more reads in the chunk -- TODO confirm)
+
+        LRMsplit_read_to_segments(context, iteration_context);
+        if(task==LRMRUNNING_STEP_VOTING)
+            LRMdo_one_voting_read(context, thread_context, iteration_context);
+        else if(task==LRMRUNNING_STEP_REALIGN)
+            LRMdo_one_realign_read(context, thread_context, iteration_context);
+        else assert(0); // any other task value is a programming error
+
+        if(iteration_context -> read_no_in_chunk % 2000 == 0) // progress message every 2000 reads
+            LRMprintf("Processing %d-th read for task %d; used %.1f minutes\n", context -> all_processed_reads + iteration_context -> read_no_in_chunk, task, (LRMmiltime() - context -> start_running_time)/60);
+    }
+    free(iteration_context);
+    return 0;
+}
+
+int LRMfinalise_chunk_reads(LRMcontext_t* context){ // Adds the number of reads processed in the current chunk to the running total; always returns 0.
+    context -> all_processed_reads = context -> all_processed_reads + context -> processed_reads_in_chunk;
+    return 0;
+}
+
+int FIXLENstrcmp(char * fixed_len, char * rname){ // Prefix test: returns 0 if rname begins with all of fixed_len, 1 otherwise (it does not order strings like strcmp).
+    const char * want = fixed_len, * have = rname;
+    while(*want){ // stops at the end of fixed_len; a shorter rname mismatches on its own terminator before it can be overrun
+        if(*have++ != *want++) return 1;
+    }
+    return 0;
+}
+
diff --git a/src/longread-mapping/seek-zlib.c b/src/longread-mapping/seek-zlib.c
new file mode 100644
index 0000000..09d06b7
--- /dev/null
+++ b/src/longread-mapping/seek-zlib.c
@@ -0,0 +1,363 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+#include <assert.h>
+#include "seek-zlib.h"
+
+#define SEEKGZ_INIT_TEXT_SIZE (1024*1024)
+#define SEEKGZ_BINBUFF_SIZE (1*1024*1024)
+
+/*
+ * Offset in the compressed file of the first byte that is still waiting in the
+ * input buffer: the stdio position of gz_fp minus the stem.avail_in bytes that
+ * were read from the file but have not been consumed yet.
+ */
+unsigned long long seekgz_ftello(seekable_zfile_t * fp){
+    unsigned long long stdio_pos = ftello(fp -> gz_fp);
+    return stdio_pos - fp -> stem.avail_in;
+}
+
+/*
+ * CRC-32 (zlib crc32) of the first len bytes of bin, starting from the initial
+ * value that crc32(0, NULL, 0) yields. In the visible code it is referenced only
+ * from commented-out debugging prints.
+ */
+unsigned int crc_pos(char * bin, int len){
+    unsigned int seed = crc32(0, NULL, 0);
+    return crc32(seed, (unsigned char *) bin, len);
+}
+
+/*
+ * Tops up the compressed-input buffer (current_chunk_bin) from gz_fp.
+ *
+ * Nothing happens once the file has hit EOF or while at least half of the
+ * buffer is still unconsumed. Otherwise the unconsumed bytes are moved to the
+ * front of the buffer, in_pointer is reset to 0, the free tail is filled with
+ * fread() and stem.next_in is pointed at the start of the buffer.
+ */
+void seekgz_binreadmore(seekable_zfile_t * fp){
+    if(feof(fp->gz_fp)) return;
+
+    assert(fp -> stem.avail_in >= 0);
+    if(fp -> stem.avail_in >= SEEKGZ_BINBUFF_SIZE / 2) return;
+
+    // Slide the pending bytes down to offset 0 (source and destination may overlap).
+    if(fp -> in_pointer > 0 && fp -> stem.avail_in > 0)
+        memmove(fp -> current_chunk_bin, fp -> current_chunk_bin + fp -> in_pointer, fp -> stem.avail_in);
+    fp -> in_pointer = 0;
+
+    char * fill_at = fp -> current_chunk_bin + fp -> stem.avail_in;
+    int got = fread(fill_at, 1 , SEEKGZ_BINBUFF_SIZE - fp -> stem.avail_in , fp -> gz_fp);
+    if(got > 0)
+        fp -> stem.avail_in += got;
+    fp -> stem.next_in = (unsigned char *)fp -> current_chunk_bin;
+}
+
+/*
+ * Takes one raw byte from the compressed input, refilling the buffer first when
+ * needed. Returns the byte as a value in 0..255, or -1 when no input is left.
+ * in_pointer, stem.next_in and stem.avail_in are advanced past the byte.
+ */
+int seekgz_bingetc(seekable_zfile_t * fp){
+    seekgz_binreadmore(fp);
+    if(fp -> stem.avail_in == 0) return -1;
+
+    // current_chunk_bin is a plain char buffer; go through unsigned char to get 0..255.
+    unsigned char byte = (unsigned char) fp -> current_chunk_bin [ fp -> in_pointer ];
+    fp -> in_pointer ++;
+    fp -> stem.avail_in --;
+    fp -> stem.next_in = (unsigned char *)(fp -> current_chunk_bin + fp -> in_pointer);
+    return byte;
+}
+
+/*
+ * Skips tail_size raw bytes (the caller passes 8 after Z_STREAM_END to jump over
+ * the trailer of the member that just ended) and then one gzip member header.
+ *
+ * Returns 0 when a header was skipped; the start of the following deflate data
+ * is then recorded in next_block_file_offset (and in block_start_in_file_offset
+ * if that was still unset), the dictionary window is emptied and
+ * is_the_last_chunk is set to 2. Returns 1 when the two magic bytes are not
+ * 31/139 or when the input ends inside the header; in that case none of those
+ * fields is touched.
+ */
+int seekgz_skip_header(seekable_zfile_t * fp, int tail_size){
+    int id1, id2;
+
+    for(id1 = 0; id1 < tail_size; id1++)
+        seekgz_bingetc(fp);
+
+    id1 = seekgz_bingetc(fp);
+    id2 = seekgz_bingetc(fp);
+
+    if(id1 != 31 || id2 != 139){
+        //SEEKZLIBprintf("header:%d,%d\n", id1, id2);
+        return 1;
+    }
+
+    seekgz_bingetc(fp); // CM
+    int FLG= seekgz_bingetc(fp); // FLG
+    seekgz_bingetc(fp); // MTIME, 4 bytes
+    seekgz_bingetc(fp);
+    seekgz_bingetc(fp);
+    seekgz_bingetc(fp);
+    seekgz_bingetc(fp); // XFL
+    seekgz_bingetc(fp); // OS
+
+    // seekgz_bingetc() returns -1 at end of input; without this check every
+    // flag bit would look set on a truncated header.
+    if(FLG < 0) return 1;
+
+    if(FLG & (1<<2)){ // FEXT
+        unsigned short XLEN=0;
+        XLEN = seekgz_bingetc(fp);
+        XLEN += seekgz_bingetc(fp)*256;
+        for(; XLEN>0; XLEN--)
+            if(seekgz_bingetc(fp) < 0) return 1;
+    }
+
+    for(id1 = 3; id1 <=4; id1++){
+        if(FLG & (1<<id1)){ // FNAME or FCOMMENT
+            while(1){
+                int namec = seekgz_bingetc(fp);
+                // End of input before the terminating NUL: stop instead of
+                // looping forever on the -1 that every further call returns.
+                if(namec < 0) return 1;
+                if(0==namec) break;
+            }
+        }
+    }
+    if(FLG & (1<<1)){ // FCRC
+        seekgz_bingetc(fp);
+        seekgz_bingetc(fp);
+    }
+
+    fp -> next_block_file_offset = seekgz_ftello(fp);
+    if(fp -> block_start_in_file_offset<1)
+        fp -> block_start_in_file_offset = fp -> next_block_file_offset;
+    fp -> next_block_file_bits = 0;
+    fp -> dict_window_used = 0;
+    fp -> dict_window_pointer = 0;
+
+    // 2 (rather than 1) makes seekgz_next_char() record an empty dictionary
+    // for the block that starts right after this header.
+    fp -> is_the_last_chunk = 2;
+    return 0;
+}
+
+int seekgz_decompress_next_chunk(seekable_zfile_t * fp);
+
+/*
+ * Opens fname for seekable gzip reading and initialises *fp from scratch.
+ *
+ * Returns 0 on success, -1 when fopen() fails, and 1 when the file does not
+ * start with a gzip header that seekgz_skip_header() accepts or when
+ * inflateInit2() fails. The stream is initialised with window bits -15,
+ * i.e. raw deflate, because the gzip header is skipped by this module itself.
+ *
+ * NOTE(review): the two buffers are not checked for allocation failure, and on
+ * the return-1 paths the FILE and both buffers stay allocated; whether callers
+ * are expected to call seekgz_close() in that case is not visible here.
+ */
+int seekgz_open(const char * fname, seekable_zfile_t * fp){
+    memset(fp, 0, sizeof(seekable_zfile_t));
+
+    fp -> gz_fp = fopen(fname, "rb");
+    if(!fp -> gz_fp) return -1;
+
+    fp -> current_chunk_txt_size = SEEKGZ_INIT_TEXT_SIZE;
+    fp -> current_chunk_txt = malloc(SEEKGZ_INIT_TEXT_SIZE);
+    fp -> current_chunk_bin = malloc(SEEKGZ_BINBUFF_SIZE);
+
+    // No compressed input is pending yet; zlib uses its default allocators.
+    fp -> stem.next_in = Z_NULL;
+    fp -> stem.avail_in = 0;
+    fp -> stem.zalloc = Z_NULL;
+    fp -> stem.zfree = Z_NULL;
+    fp -> stem.opaque = Z_NULL;
+
+    if(seekgz_skip_header(fp,0) != 0) return 1;
+    if(inflateInit2(&(fp -> stem), -15) != 0) return 1;
+    return 0;
+}
+
+/*
+ * Captures the current reading position into *pos: the file offset and bit
+ * offset at which the current block starts, a copy of the dictionary window
+ * saved for that block, and the number of characters already delivered from
+ * the block (in_block_offset).
+ */
+void seekgz_tell(seekable_zfile_t * fp, seekable_position_t * pos){
+    unsigned int window_bytes = fp -> block_dict_window_size;
+
+    pos -> block_gzfile_offset = fp -> block_start_in_file_offset;
+    pos -> block_gzfile_bits = fp -> block_start_in_file_bits;
+    pos -> in_block_text_offset = fp -> in_block_offset;
+    pos -> block_dict_window_size = window_bytes;
+    memcpy(pos -> dict_window, fp -> block_dict_window, window_bytes);
+}
+
+/*
+ * Repositions the reader at a position stored in *pos (the fields are the ones
+ * seekgz_tell() fills in).
+ *
+ * The file is rewound to the start of the recorded block, the inflate stream is
+ * reset and, when a dictionary was recorded, primed with the leftover bits and
+ * that dictionary. Text is then decompressed chunk by chunk until the recorded
+ * in-block text offset is reached. Failures of inflateReset() and
+ * inflateSetDictionary() are only reported through SEEKZLIBprintf; a failure
+ * while decompressing leaves fp -> internal_error set.
+ */
+void seekgz_seek(seekable_zfile_t * fp, seekable_position_t * pos){
+ //#warning "COMMENT THIS LINE !!!!!"
+ //fprintf(stderr, "SEEK => %llu[%d] + %u ; WIN=%d CRC=%u\n", pos -> block_gzfile_offset, pos -> block_gzfile_bits, pos -> in_block_text_offset, pos -> block_dict_window_size, crc_pos( pos -> dict_window, pos -> block_dict_window_size));
+ // When the block does not start on a byte boundary, seek one byte earlier so
+ // that the partly used byte can be read again below.
+ fseeko(fp->gz_fp, pos -> block_gzfile_offset - (pos -> block_gzfile_bits?1:0), SEEK_SET);
+
+ if(Z_OK!=inflateReset(&fp->stem))
+ SEEKZLIBprintf("FATAL: UNABLE TO INIT STREAM!\n\n\n");
+ if(pos -> block_dict_window_size>0){
+ if(pos -> block_gzfile_bits){
+ // Feed the upper block_gzfile_bits bits of the shared byte to zlib.
+ // NOTE(review): nch is a plain char, so the shift sign-extends where char is
+ // signed; presumably inflatePrime() only uses the low `bits` bits — confirm.
+ char nch = fgetc(fp->gz_fp);
+ //fprintf(stderr, "SEEK 2 FPPOS:%llu, NCH=%d\n", ftello(fp->gz_fp) , nch);
+ inflatePrime(&fp->stem, pos -> block_gzfile_bits, nch>>(8-pos -> block_gzfile_bits));
+ }
+ if(Z_OK != inflateSetDictionary(&fp->stem, (unsigned char *)pos -> dict_window, pos -> block_dict_window_size))
+ SEEKZLIBprintf("FATAL: UNABLE TO RESET STREAM!\n\n\n");
+ }
+
+ // Drop everything that was buffered for the old position and restore the
+ // per-block bookkeeping from *pos.
+ fp -> stem.avail_in = 0;
+ fp -> in_pointer = 0;
+ fp -> txt_buffer_used = 0;
+ fp -> in_chunk_offset = 0;
+ memcpy(fp -> block_dict_window, pos -> dict_window, pos -> block_dict_window_size);
+ memcpy(fp -> dict_window, pos -> dict_window, pos -> block_dict_window_size);
+ fp -> block_dict_window_size = fp -> dict_window_used = pos -> block_dict_window_size;
+ // A completely filled window continues writing at index 0, a partly filled one right after its data.
+ fp -> dict_window_pointer = (pos -> block_dict_window_size<SEEKGZ_ZLIB_WINDOW_SIZE)?pos -> block_dict_window_size:0;
+ fp -> in_block_offset = 0;
+ fp -> block_start_in_file_offset = pos -> block_gzfile_offset;
+ fp -> block_start_in_file_bits = pos -> block_gzfile_bits;
+
+ // Decompress forward until the chunk holding the wanted text offset is in the
+ // text buffer, then place in_chunk_offset on that character.
+ unsigned int chunk_end_block_offset=0;
+ while(1){
+ seekgz_decompress_next_chunk(fp);
+ if(fp -> internal_error) break;
+ chunk_end_block_offset += fp -> txt_buffer_used;
+
+ if(chunk_end_block_offset >= pos -> in_block_text_offset){
+ fp -> in_chunk_offset = fp -> txt_buffer_used - (chunk_end_block_offset - pos -> in_block_text_offset);
+ fp -> in_block_offset = pos -> in_block_text_offset;
+ break;
+ }
+ assert(chunk_end_block_offset < fp -> current_chunk_txt_size && !feof(fp->gz_fp));
+ // The chunk lies entirely before the target: discard it and load the next one.
+ fp -> txt_buffer_used=0;
+ }
+}
+
+
+
+/*
+ * Inflates more text into current_chunk_txt, appending at txt_buffer_used, and
+ * mirrors the produced bytes into the circular dictionary window.
+ *
+ * inflate() is called with Z_BLOCK repeatedly until the gzip member ends
+ * (Z_STREAM_END; the 8-byte trailer and the next member header are then
+ * skipped and the stream is reset), or until a deflate block boundary is
+ * reached or the input buffer is drained while at least 10 bytes of text are
+ * available.
+ *
+ * Returns 0 on success. On an inflate error or when the text buffer cannot be
+ * enlarged, sets fp -> internal_error and returns -1.
+ */
+int seekgz_decompress_next_chunk(seekable_zfile_t * fp){
+    while(1){
+        seekgz_binreadmore(fp);
+
+        // Enlarge the text buffer by half once it is 7/8 full. This check used to
+        // be the accidental body of a dangling `if(loaded_blocks > 0)` whose
+        // debug print had been commented out, so it was skipped on the first pass.
+        if(fp -> txt_buffer_used >= (unsigned long long)fp -> current_chunk_txt_size * 7 / 8){
+            //SEEKZLIBprintf("TRE ALLOC CHUNK_TXT: %d -> %d\n", fp -> current_chunk_txt_size, (int)(fp -> current_chunk_txt_size*1.5));
+            int grown_size = fp -> current_chunk_txt_size * 1.5;
+            assert(grown_size < 3*512*1024*1024);
+            char * grown_txt = realloc(fp -> current_chunk_txt, grown_size );
+            if(NULL == grown_txt){
+                // The old buffer is still valid and still owned by fp.
+                SEEKZLIBprintf("FATAL: UNABLE TO ENLARGE THE TEXT BUFFER TO %d BYTES\n", grown_size);
+                fp -> internal_error = 1;
+                return -1;
+            }
+            fp -> current_chunk_txt = grown_txt;
+            fp -> current_chunk_txt_size = grown_size;
+        }
+
+        fp -> stem.avail_out = fp -> current_chunk_txt_size - fp -> txt_buffer_used;
+        int out_start = fp -> txt_buffer_used;
+        fp -> stem.next_out = (unsigned char *)(fp -> current_chunk_txt + out_start);
+
+        int inlen = fp -> stem.avail_in ;
+        //fprintf(stderr,"INFLATING_0 : LEN=%u, CRC=%u\n", fp -> stem.avail_in , crc_pos( fp -> stem.next_in , fp -> stem.avail_in ));
+
+        int ret = inflate(&(fp -> stem), Z_BLOCK);
+        // Number of text bytes produced by this call.
+        int have = ( fp -> current_chunk_txt_size - fp -> txt_buffer_used) - fp -> stem.avail_out;
+        int is_chunk_end = 0;
+
+        //fprintf(stderr,"INFLATING: INLEN=%d , OLEN=%d, POS=%lld, RET=%d, TOOL=%s\n", inlen , have, seekgz_ftello(fp), ret, zlibVersion());
+        if(ret != Z_OK && ret != Z_STREAM_END){ //any error
+            SEEKZLIBprintf("FATAL: INFLATE-ERROR=%d POS=%llu\n", ret, seekgz_ftello(fp));
+            fp -> internal_error = 1;
+            return -1;
+        }
+
+        // Account for the compressed bytes that inflate() consumed.
+        fp -> in_pointer += inlen - fp -> stem.avail_in ;
+
+        if(have > 0){
+            fp -> txt_buffer_used += have;
+
+            // The new text is copied into the circular window in up to two
+            // pieces: "two" starts at the current write pointer, "one" is the
+            // part that wraps around to index 0.
+            int one_length = 0, one_src_start = 0, one_dst_start = 0;
+            int two_length = 0, two_src_start = 0, two_dst_start = 0;
+            int new_pntr = 0;
+            if(have <= SEEKGZ_ZLIB_WINDOW_SIZE - fp -> dict_window_pointer){
+                // Fits behind the write pointer without wrapping.
+                one_length = 0;
+                two_src_start = out_start;
+                two_dst_start = fp -> dict_window_pointer;
+                two_length = have;
+                new_pntr = two_dst_start + two_length;
+            }else if(have > SEEKGZ_ZLIB_WINDOW_SIZE - fp -> dict_window_pointer && have <= SEEKGZ_ZLIB_WINDOW_SIZE){
+                // Wraps once: fill up to the end of the window, put the rest at the front.
+                one_src_start = out_start + SEEKGZ_ZLIB_WINDOW_SIZE - fp -> dict_window_pointer;
+                one_dst_start = 0;
+                one_length = have - SEEKGZ_ZLIB_WINDOW_SIZE + fp -> dict_window_pointer;
+                two_src_start = out_start;
+                two_dst_start = fp -> dict_window_pointer;
+                two_length = SEEKGZ_ZLIB_WINDOW_SIZE - fp -> dict_window_pointer;
+                new_pntr = one_dst_start + one_length;
+            }else{
+                // More than a whole window was produced: keep only its last
+                // SEEKGZ_ZLIB_WINDOW_SIZE bytes; the write pointer stays where it is.
+                one_src_start = out_start + have - fp -> dict_window_pointer;
+                one_dst_start = 0;
+                one_length = fp -> dict_window_pointer;
+                two_src_start = out_start + have - SEEKGZ_ZLIB_WINDOW_SIZE;
+                two_dst_start = fp -> dict_window_pointer;
+                two_length = SEEKGZ_ZLIB_WINDOW_SIZE - fp -> dict_window_pointer;
+                new_pntr = fp -> dict_window_pointer;
+            }
+
+            if(one_length > 0)memcpy(fp -> dict_window + one_dst_start, fp -> current_chunk_txt + one_src_start, one_length);
+            //fprintf(stderr,"CPY: %d -> %d [%d] ; PNTR=%d, NEWPNTR=%d, have=%d\n", two_src_start, two_dst_start, two_length, fp -> dict_window_pointer, new_pntr, have);
+            memcpy(fp -> dict_window + two_dst_start, fp -> current_chunk_txt + two_src_start, two_length);
+            fp -> dict_window_pointer = new_pntr;
+            fp -> dict_window_used = SEEKZLIBmin(fp -> dict_window_used + have, SEEKGZ_ZLIB_WINDOW_SIZE);
+
+            // data_type bit 128 set and bit 64 clear is taken as "stopped at the
+            // end of a block that is not the final one"; the low three bits give
+            // the number of unused bits of the last input byte.
+            is_chunk_end = (fp -> stem.data_type & 128) && !(fp -> stem.data_type & 64);
+            if(is_chunk_end){
+                fp -> is_the_last_chunk = 1;
+                unsigned long long file_pos_after_avail = seekgz_ftello(fp);
+                fp -> next_block_file_offset = file_pos_after_avail;
+                fp -> next_block_file_bits = fp->stem.data_type & 7;
+            }
+        }
+
+        if(Z_STREAM_END == ret || ((is_chunk_end || 0 == fp -> stem.avail_in) && fp -> txt_buffer_used >=10)){
+            if(Z_STREAM_END == ret){
+                // Step over the 8 trailer bytes and the header of the next
+                // member (if there is one), then restart the inflate stream.
+                seekgz_skip_header(fp, 8);
+                inflateReset(&fp->stem);
+            }
+            break;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Returns the next decompressed byte as a value in 0..255, EOF when the input
+ * is exhausted, or -1 after a decompression error (internal_error set).
+ *
+ * The byte is passed through unsigned char: going through a plain char made
+ * bytes >= 0x80 negative where char is signed, so they could not be told apart
+ * from the EOF / -1 returns.
+ *
+ * When the returned byte is the last one of a chunk that ended on a block
+ * boundary, the bookkeeping for the block that starts next is prepared: its
+ * file offset/bits become the current block start, in_block_offset restarts at
+ * 0 and the dictionary window is saved in linear order.
+ */
+int seekgz_next_char(seekable_zfile_t * fp){
+    if(fp -> internal_error) return -1;
+    while(fp -> in_chunk_offset >= fp -> txt_buffer_used){
+        // NOTE(review): fewer than 10 pending compressed bytes at end of file are
+        // treated as "no more data" — presumably just the 8-byte gzip trailer; confirm.
+        if(feof(fp -> gz_fp) && fp -> stem.avail_in < 10 )
+            return EOF;
+
+        fp -> txt_buffer_used = 0;
+        fp -> in_chunk_offset = 0;
+        int decompress_ret = seekgz_decompress_next_chunk(fp);
+        if(decompress_ret) return -1;
+    }
+    fp -> in_block_offset ++;
+    unsigned char retc = (unsigned char) fp -> current_chunk_txt[fp -> in_chunk_offset++];
+
+    if(fp -> is_the_last_chunk && fp -> in_chunk_offset == fp -> txt_buffer_used){
+        fp -> in_block_offset = 0;
+        fp -> block_start_in_file_offset = fp -> next_block_file_offset;
+        fp -> block_start_in_file_bits = fp -> next_block_file_bits;
+
+        if(1 == fp -> is_the_last_chunk){
+            // Boundary between two deflate blocks: keep the window as the dictionary.
+            fp -> block_dict_window_size = fp -> dict_window_used;
+
+            if(fp -> dict_window_used < SEEKGZ_ZLIB_WINDOW_SIZE)
+                memcpy(fp -> block_dict_window , fp -> dict_window, fp -> dict_window_used);
+            else{
+                // Full circular window: the oldest byte sits at dict_window_pointer,
+                // so unroll it into oldest-to-newest order.
+                memcpy(fp -> block_dict_window , fp -> dict_window + fp -> dict_window_pointer, SEEKGZ_ZLIB_WINDOW_SIZE - fp -> dict_window_pointer);
+                memcpy(fp -> block_dict_window + SEEKGZ_ZLIB_WINDOW_SIZE - fp -> dict_window_pointer, fp -> dict_window, fp -> dict_window_pointer);
+            }
+        }else
+            // Value 2 is set after a gzip header was skipped: the block needs no dictionary.
+            fp -> block_dict_window_size = 0;
+
+        fp -> is_the_last_chunk = 0;
+    }
+
+    return retc;
+}
+
+/*
+ * Reads one line of decompressed text into buf (capacity buf_size bytes).
+ *
+ * Returns the number of characters stored, not counting the terminating NUL:
+ *   - a line ended by '\n', or by end of input after at least one character,
+ *     is stored with a trailing '\n' (one is appended in the end-of-input case);
+ *   - when buf_size - 1 characters were stored before a line end was seen, the
+ *     text is returned without a '\n';
+ *   - 0 when the input ends (or seekgz_next_char() reports an error) before any
+ *     character was read.
+ */
+int seekgz_gets(seekable_zfile_t * fp, char * buf, int buf_size){
+    int used = 0;
+    buf[0] = 0;
+
+    for(;;){
+        if(used >= buf_size - 1) break;
+
+        int ch = seekgz_next_char(fp);
+        if(ch >= 0 && ch != '\n'){
+            buf[used++] = ch;
+            continue;
+        }
+
+        // Line break, end of input or error.
+        if(ch < 0 && used < 1) return 0;
+        buf[used++] = '\n';
+        break;
+    }
+    buf[used] = 0;
+    return used;
+}
+
+/*
+ * Releases everything seekgz_open() acquired: the zlib inflate state, the
+ * FILE and the two buffers. fp itself is not freed.
+ *
+ * inflateEnd() was missing, so the state allocated by inflateInit2() leaked on
+ * every close. Its return value is ignored: for a stream whose inflateInit2()
+ * never ran, zlib documents that it reports an error and frees nothing.
+ */
+void seekgz_close(seekable_zfile_t * fp){
+    inflateEnd(&fp -> stem);
+    fclose(fp -> gz_fp);
+    free(fp -> current_chunk_txt);
+    free(fp -> current_chunk_bin);
+}
diff --git a/src/longread-mapping/seek-zlib.h b/src/longread-mapping/seek-zlib.h
new file mode 100644
index 0000000..5a1cf62
--- /dev/null
+++ b/src/longread-mapping/seek-zlib.h
@@ -0,0 +1,86 @@
+/***************************************************************
+
+ The Subread software package is free software package:
+ you can redistribute it and/or modify it under the terms
+ of the GNU General Public License as published by the
+ Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version.
+
+ Subread is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty
+ of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ See the GNU General Public License for more details.
+
+ Authors: Drs Yang Liao and Wei Shi
+
+ ***************************************************************/
+
+
+#ifndef __SEEK_ZLIB_H_
+#define __SEEK_ZLIB_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+
+
+#ifndef SEEKGZ_ZLIB_WINDOW_SIZE
+#define SEEKGZ_ZLIB_WINDOW_SIZE (32*1024)
+// State of one gzip file that is read sequentially and can be repositioned with seekgz_seek().
+typedef struct {
+ // The compressed file, opened by seekgz_open() in "rb" mode.
+ FILE * gz_fp;
+ // Buffer that inflate() writes decompressed text into; enlarged with realloc() when nearly full.
+ char * current_chunk_txt;
+ // Buffer holding raw compressed bytes read from gz_fp.
+ char * current_chunk_bin;
+ // The zlib inflate stream.
+ z_stream stem;
+ // Allocated size of current_chunk_txt in bytes.
+ int current_chunk_txt_size;
+ // Index in current_chunk_bin of the next compressed byte that has not been consumed.
+ unsigned int in_pointer;
+ // Index in current_chunk_txt of the next character seekgz_next_char() will return.
+ unsigned int in_chunk_offset;
+ // Number of characters returned since the start of the current block; stored by seekgz_tell().
+ unsigned int in_block_offset;
+ //unsigned int txt_buffer_size;
+ // Number of valid decompressed bytes in current_chunk_txt.
+ unsigned int txt_buffer_used;
+ // File offset and bit offset at which the current block starts.
+ unsigned long long block_start_in_file_offset;
+ unsigned int block_start_in_file_bits;
+
+ // File offset and bit offset of the block that follows, recorded when a block boundary or a gzip header is passed.
+ unsigned long long next_block_file_offset;
+ unsigned int next_block_file_bits;
+
+ // 0: no pending boundary; 1: the buffered text ends at a deflate block boundary; 2: a gzip header was just skipped.
+ int is_the_last_chunk;
+ // Set to 1 when inflate() fails; seekgz_next_char() then returns -1.
+ int internal_error;
+
+ // Write position in the circular dict_window.
+ unsigned int dict_window_pointer;
+ // Number of valid bytes in dict_window, capped at SEEKGZ_ZLIB_WINDOW_SIZE.
+ unsigned int dict_window_used;
+ // Circular buffer holding the most recently decompressed bytes.
+ char dict_window[SEEKGZ_ZLIB_WINDOW_SIZE];
+
+ // Linear (oldest byte first) copy of the window as it was at the start of the current block, and its length.
+ unsigned int block_dict_window_size;
+ char block_dict_window[SEEKGZ_ZLIB_WINDOW_SIZE];
+} seekable_zfile_t;
+
+// A reading position in a gzip file, filled by seekgz_tell() and consumed by seekgz_seek().
+typedef struct{
+ // Dictionary handed to inflateSetDictionary() when seeking; only the first block_dict_window_size bytes are used.
+ char dict_window[SEEKGZ_ZLIB_WINDOW_SIZE];
+ // File offset and bit offset at which the block containing the position starts.
+ unsigned long long block_gzfile_offset;
+ unsigned int block_gzfile_bits;
+ // Number of valid bytes in dict_window; 0 means seekgz_seek() sets no dictionary.
+ unsigned int block_dict_window_size;
+
+ // Number of decompressed characters to skip from the start of the block to reach the position.
+ unsigned int in_block_text_offset;
+} seekable_position_t;
+
+#endif
+
+#define SEEKZLIBprintf printf
+#define SEEKZLIBmin(a,b) ( (a)<(b)?(a):(b) )
+
+// returns 0 if OK; returns 1 if the file does not start with a gzip header (it is not indexable) or zlib cannot be initialised; returns -1 if the file cannot be opened.
+int seekgz_open(const char * fname, seekable_zfile_t * fp);
+
+// returns length in bytes if OK (length includes the line break at the end); returns 0 if EOF
+int seekgz_gets(seekable_zfile_t * fp, char * buf, int buf_size);
+
+void seekgz_tell(seekable_zfile_t * fp, seekable_position_t * pos);
+
+void seekgz_seek(seekable_zfile_t * fp, seekable_position_t * pos);
+
+int seekgz_next_char(seekable_zfile_t * fp);
+
+void seekgz_close(seekable_zfile_t * fp);
+#endif
diff --git a/src/makefile.version b/src/makefile.version
index dd7ddb4..83e1a64 100644
--- a/src/makefile.version
+++ b/src/makefile.version
@@ -1,4 +1,4 @@
-SUBREAD_VERSION_BASE=1.5.3
+SUBREAD_VERSION_BASE=1.6.0
SUBREAD_VERSION_DATE=$(SUBREAD_VERSION_BASE)-$(shell date +"%d%b%Y")
SUBREAD_VERSION="$(SUBREAD_VERSION_DATE)"
SUBREAD_VERSION="$(SUBREAD_VERSION_BASE)"
diff --git a/src/propmapped.c b/src/propmapped.c
index 6d87cf8..acb4d39 100644
--- a/src/propmapped.c
+++ b/src/propmapped.c
@@ -51,6 +51,7 @@ typedef struct {
int is_fragments_counted;
int is_proppair_needed;
int sort_buckets;
+ int verbose;
HashTable * split_fp_table;
@@ -167,7 +168,7 @@ int write_result(propMapped_context * context)
fclose(outfp);
}
char * objname = context -> is_fragments_counted? "fragment":"read";
- SUBREADprintf("Finished. All records: %llu; all %ss: %llu; mapped %ss: %llu; the mappability is %.2f%%\n", context->all_records, objname, context -> all_reads, objname, context -> mapped_reads, context -> mapped_reads*100./context -> all_reads);
+ if(context -> verbose) SUBREADprintf("Finished. All records: %llu; all %ss: %llu; mapped %ss: %llu; the mappability is %.2f%%\n", context->all_records, objname, context -> all_reads, objname, context -> mapped_reads, context -> mapped_reads*100./context -> all_reads);
return 0;
}
@@ -431,8 +432,7 @@ void ppm_warning_file_limit()
getrlimit(RLIMIT_NOFILE, & limit_st);
{
- if(min(limit_st.rlim_cur , limit_st.rlim_max) < 400)
- {
+ if(min(limit_st.rlim_cur , limit_st.rlim_max) < 400) {
SUBREADprintf("Your operation system does not allow a single process to open more then 400 files. You may need to change this setting by using a 'ulimit -n 500' command, or the program may crash.\n");
}
}
@@ -461,7 +461,7 @@ int propmapped(int argc, char ** argv)
context -> sort_buckets = 253;
- while((c = getopt_long (argc, argv, "i:o:bfph", propm_long_options, &option_index)) != -1)
+ while((c = getopt_long (argc, argv, "Vi:o:bfph", propm_long_options, &option_index)) != -1)
{
switch(c){
case 'i':
@@ -473,6 +473,8 @@ int propmapped(int argc, char ** argv)
case 'f':
context -> is_fragments_counted = 1;
break;
+ case 'V':
+ context -> verbose = 1;
case 'p':
context -> is_proppair_needed = 1;
break;
@@ -501,8 +503,7 @@ int propmapped(int argc, char ** argv)
ret = -1;
SUBREADprintf("Unable to open input file '%s' or the input file is empty!\n", context -> input_file_name);
}
-
- SUBREADprintf("The input file is opened as a %cAM file.\nThe %ss in the input file are being counted.\n", context -> is_BAM_input?'B':'S', context -> is_fragments_counted?"fragment":"read");
+ if(context -> verbose) SUBREADprintf("The input file is opened as a %cAM file.\nThe %ss in the input file are being counted.\n", context -> is_BAM_input?'B':'S', context -> is_fragments_counted?"fragment":"read");
ppm_warning_file_limit ();
ret = ret || init_PE_sambam(context);
diff --git a/src/readSummary.c b/src/readSummary.c
index 13432d5..c2b03a0 100644
--- a/src/readSummary.c
+++ b/src/readSummary.c
@@ -163,7 +163,7 @@ typedef struct {
unsigned int scoring_buff_numbers[MAX_HIT_NUMBER * 2];
unsigned int scoring_buff_flags[MAX_HIT_NUMBER * 2];
- unsigned short scoring_buff_overlappings[MAX_HIT_NUMBER * 2];
+ unsigned int scoring_buff_overlappings[MAX_HIT_NUMBER * 2];
long scoring_buff_exon_ids[MAX_HIT_NUMBER * 2];
char * chro_name_buff;
@@ -241,6 +241,7 @@ typedef struct {
int three_end_extension;
int fragment_minimum_overlapping;
float fractional_minimum_overlapping;
+ float fractional_minimum_feature_overlapping;
int use_overlapping_break_tie;
unsigned long long int all_reads;
@@ -266,6 +267,11 @@ typedef struct {
fasta_contigs_t * fasta_contigs;
HashTable * gene_name_table; // gene_name -> gene_number
HashTable * BAM_chros_to_anno_table; // name in annotation file -> alias name
+
+ char * RGnames_set;
+ int RGnames_capacity;
+ int RGnames_ptr;
+
char alias_file_name[300];
char input_file_name[300];
char * input_file_short_name;
@@ -299,7 +305,6 @@ typedef struct {
double start_time;
char * cmd_rebuilt;
-
char redo;
fc_read_counters read_counters;
@@ -697,7 +702,9 @@ int print_FC_configuration(fc_thread_global_context_t * global_context, char * a
print_in_box(80,0,0," Split alignments : %s", (1 == global_context -> is_split_or_exonic_only)?"only split alignments":"only exonic alignments");
print_in_box(80,0,0," Min overlapping bases : %d", global_context -> fragment_minimum_overlapping);
if(global_context -> fractional_minimum_overlapping > 0.000001)
- print_in_box(81,0,0," Min overlapping frac. : %0.1f%%%%", global_context -> fractional_minimum_overlapping*100);
+ print_in_box(81,0,0," Min overlapping frac. : %0.1f%%%% to reads", global_context -> fractional_minimum_overlapping*100);
+ if(global_context -> fractional_minimum_feature_overlapping > 0.000001)
+ print_in_box(81,0,0," Min overlapping frac. : %0.1f%%%% to features", global_context -> fractional_minimum_feature_overlapping*100);
if(global_context -> five_end_extension || global_context -> three_end_extension)
print_in_box(80,0,0," Read extensions : %d on 5' and %d on 3' ends", global_context -> five_end_extension , global_context -> three_end_extension);
if(global_context -> reduce_5_3_ends_to_one)
@@ -737,7 +744,7 @@ void print_FC_results(fc_thread_global_context_t * global_context, char * out)
print_in_box(89,0,1,"%c[36mRead assignment finished.%c[0m", CHAR_ESC, CHAR_ESC);
print_in_box(80,0,0,"");
#ifdef MAKE_STANDALONE
- print_in_box(80,0,PRINT_BOX_WRAPPED,"Summary of counting results can be found in file \"%s\"", out);
+ print_in_box(80,0,PRINT_BOX_WRAPPED,"Summary of counting results can be found in file \"%s.summary\"", out);
print_in_box(80,0,0,"");
#endif
print_in_box(80,2,1,"http://subread.sourceforge.net/");
@@ -1562,6 +1569,7 @@ void process_pairer_reset(void * pairer_vp){
if(global_context -> sambam_chro_table) free(global_context -> sambam_chro_table);
global_context -> sambam_chro_table = NULL;
global_context -> sambam_chro_table_items = 0;
+ if(global_context -> assign_reads_to_RG) free(global_context -> RGnames_set);
int xk1, xk2;
for(xk1=0; xk1<global_context-> thread_number; xk1++)
@@ -1631,6 +1639,7 @@ int is_value_contig_name(char * n, int l){
}
+void ** get_RG_tables(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * rg_name);
int compress_read_detail_BAM(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, int write_start, int write_end, char * bam_buf);
int process_pairer_header (void * pairer_vp, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len){
@@ -1660,7 +1669,70 @@ int process_pairer_header (void * pairer_vp, int thread_no, int is_text, unsigne
}else if( global_context -> is_read_details_out == FILE_TYPE_SAM && is_text ){
fwrite( bin, 1, bin_len, global_context -> read_details_out_FP);
}
- if(!is_text ){
+ if(is_text ){
+ if( global_context -> assign_reads_to_RG ){
+ global_context->RGnames_capacity = 10000;
+ global_context->RGnames_ptr = 0;
+ global_context->RGnames_set = malloc( global_context->RGnames_capacity );
+
+ int rcursor=0;
+ for(;rcursor<bin_len; rcursor++){
+ assert(bin[rcursor] == '@'&& bin[rcursor+3] == '\t');
+ if(bin[rcursor+1]=='R' && bin[rcursor+2]=='G'){
+ int id_start = -1, id_end;
+ for(; rcursor < bin_len; rcursor++){
+ if(bin[rcursor]=='I' && bin[rcursor+1]=='D'){
+ id_start = rcursor + 3;
+ id_end = 0;
+ }
+ for(; rcursor < bin_len; rcursor++){
+ if(bin[rcursor]=='\t' || bin[rcursor]=='\n'){
+ if(id_end < 1)id_end = rcursor;
+ break;
+ }
+ }
+ if(bin[rcursor]=='\n') break;
+ }
+
+ if(id_start > 0){
+ int id_len = id_end - id_start;
+ if(global_context->RGnames_capacity < global_context->RGnames_ptr + id_len + 3){
+ global_context->RGnames_capacity = global_context->RGnames_capacity * 17 / 10;
+ global_context->RGnames_set = realloc( global_context->RGnames_set , global_context->RGnames_capacity );
+ }
+ memcpy(global_context->RGnames_set + global_context->RGnames_ptr, bin + id_start, id_len);
+ global_context->RGnames_set[global_context->RGnames_ptr+id_len]='\t';
+ global_context->RGnames_ptr += id_len+1;
+ }
+ }
+ for( ;rcursor<bin_len; rcursor++ ) if(bin[rcursor] == '\n')break;
+ }
+ if(global_context->RGnames_ptr>0){
+ global_context->RGnames_set[global_context->RGnames_ptr-1]=0;
+ global_context->RGnames_ptr--;
+ }
+ //SUBREADprintf("RGList: %s\n", global_context->RGnames_set);
+
+ int thread_no;
+ for(thread_no = 0; thread_no < global_context -> thread_number; thread_no ++){
+ fc_thread_thread_context_t * RGthread_context = global_context -> thread_contexts + thread_no;
+ int RGcursor = 0;
+ char *lastRGptr = global_context->RGnames_set;
+ for(; RGcursor < global_context->RGnames_ptr+1; RGcursor++){
+ if(global_context->RGnames_set[ RGcursor ] == '\t' || global_context->RGnames_set[ RGcursor ] == 0){
+ global_context->RGnames_set[ RGcursor ] = 0;
+ if(strlen(lastRGptr)>0){
+ // SUBREADprintf("PUT 4Tab:'%s'\n", lastRGptr);
+ get_RG_tables(global_context, RGthread_context, lastRGptr);
+ lastRGptr = global_context->RGnames_set + RGcursor +1;
+ if(RGcursor < global_context->RGnames_ptr)
+ global_context->RGnames_set[ RGcursor ] = '\t';
+ }
+ }
+ }
+ }
+ }
+ }else{
if(global_context -> sambam_chro_table)
global_context -> sambam_chro_table = delay_realloc(global_context -> sambam_chro_table, global_context -> sambam_chro_table_items * sizeof(SamBam_Reference_Info), (items + global_context -> sambam_chro_table_items) * sizeof(SamBam_Reference_Info));
else global_context -> sambam_chro_table = malloc(items * sizeof(SamBam_Reference_Info));
@@ -2171,6 +2243,7 @@ void parse_bin(SamBam_Reference_Info * sambam_chro_table, char * bin, char * bin
if(assign_reads_to_RG){
char RG_type = 0;
SAM_pairer_iterate_tags((unsigned char *)bin+bin_ptr, block_len + 4 - bin_ptr, "RG", &RG_type, RG_ptr);
+ //SUBREADprintf("RG_TEST: PTR=%p, VAL=`%s`, TY=%c\n", *RG_ptr, *RG_ptr, RG_type);
if(RG_type != 'Z') (*RG_ptr) = NULL;
}
//SUBREADprintf("FOUND=%d, NH=%d, TAG=%.*s\n", found_NH, *(NH_value), 3 , bin+bin_ptr);
@@ -2620,11 +2693,8 @@ void process_line_buffer(fc_thread_global_context_t * global_context, fc_thread_
parse_bin(global_context -> sambam_chro_table, is_second_read?bin2:bin1, is_second_read?bin1:bin2 , &read_name, &alignment_masks , &read_chr, &read_pos, &mapping_qual, &mate_chr, &mate_pos, &fragment_length, &is_junction_read, &cigar_sections, Starting_Chro_Points, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, &NH_value, global_context -> max_M , global_context -> need_calculate_overlap_len?(is_second_read?CIGAR_intervals_R2:CIGAR_intervals_R1):NULL, is_s [...]
if(global_context -> assign_reads_to_RG && NULL == RG_ptr)return;
- // SUBREADprintf(" RNAME=%s\n", read_name);
+ //SUBREADprintf("TEST_RG: '%s'\n", RG_ptr);
- //#warning "==================== REMOVE WHEN RELEASE ========================"
- //if(global_context -> read_details_out_FP)
- // fprintf(global_context -> read_details_out_FP, "SAMDEBUG: %s\t\t%s, %ld\n", read_name, read_chr, read_pos);
if(is_second_read == 0)
{
//skip the read if unmapped (its mate will be skipped as well if paired-end)
@@ -3114,7 +3184,7 @@ void overlap_exchange(void * arr, int L, int R){
pos[R*2+1] = tt;
}
-unsigned short calc_score_overlaps(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char ** chros, unsigned int * start_poses, unsigned short * lens, int sections){
+unsigned int calc_score_overlaps(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char ** chros, unsigned int * start_poses, unsigned short * lens, int sections){
unsigned int in_intervals[ 2*sections ];
unsigned int out_intervals[ 2*sections ], x1;
char used_interval[ sections ];
@@ -3156,6 +3226,8 @@ void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_t
if(global_context -> need_calculate_overlap_len == 0 && nhits2+nhits1==1) {
long hit_exon_id = nhits2?hits_indices2[0]:hits_indices1[0];
+ //SUBREADprintf("V_AND_A: '%p'\n", RG_name);
+
if(RG_name){
void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
fc_read_counters * sumtab = tab4s[1];
@@ -3216,20 +3288,88 @@ void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_t
unsigned int * scoring_numbers = thread_context -> scoring_buff_numbers; // size is : MAX_HIT_NUMBER *2
unsigned int * scoring_flags = thread_context -> scoring_buff_flags; // size is : MAX_HIT_NUMBER *2
- unsigned short * scoring_overlappings = thread_context -> scoring_buff_overlappings; // size is : MAX_HIT_NUMBER *2
+ unsigned int * scoring_overlappings = thread_context -> scoring_buff_overlappings; // size is : MAX_HIT_NUMBER *2
long * scoring_exon_ids = thread_context -> scoring_buff_exon_ids; // size is : MAX_HIT_NUMBER *2
int scoring_count = 0, score_x1;
if( global_context -> need_calculate_overlap_len ){
-
+ int end1, end2, hit_x1, hit_x2;
char ** scoring_gap_chros = thread_context -> scoring_buff_gap_chros;
unsigned int * scoring_gap_starts = thread_context -> scoring_buff_gap_starts; // size is : MAX_HIT_NUMBER *2;
unsigned short * scoring_gap_lengths = thread_context -> scoring_buff_gap_lengths; // size is : MAX_HIT_NUMBER *2* global_context -> max_M*2
- int end1, end2, hit_x1, hit_x2;
char used_hit1 [nhits1];
char used_hit2 [nhits2];
+
+ if( global_context -> fractional_minimum_feature_overlapping > 1E-10 ){
+ memset(used_hit1 , 0 , nhits1);
+ memset(used_hit2 , 0 , nhits2);
+ for(end1 = 0; end1 < global_context -> is_paired_end_mode_assign + 1 ; end1++){
+ int allhits = end1?nhits2:nhits1;
+ long * hits_indices_X1 = end1?hits_indices2:hits_indices1;
+ char * used_hit_X1 = end1?used_hit2:used_hit1;
+
+ //#warning "DEBUG OUT 0"
+ if(0 && FIXLENstrcmp("V0112_0155:7:1102:12486:34235", read_name) == 0)
+ SUBREADprintf("OVERLAP START %d hits\n", allhits);
+ for(hit_x1 = 0; hit_x1 < allhits; hit_x1++){
+ if(used_hit_X1[hit_x1])continue;
+
+ long tested_exon_id = hits_indices_X1[hit_x1];
+ long exon_span = global_context -> exontable_stop[tested_exon_id] +1;
+ exon_span -= global_context -> exontable_start[tested_exon_id];
+
+ long applied_overlapping_threshold = (int)(exon_span * global_context -> fractional_minimum_feature_overlapping+0.5);
+
+ scoring_gap_chros[0 ] = (end1?hits_chro2:hits_chro1)[hit_x1];
+ scoring_gap_starts[0 ] = (end1?hits_start_pos2:hits_start_pos1)[hit_x1];
+ scoring_gap_lengths[0 ] = (end1?hits_length2:hits_length1)[hit_x1];
+ int gaps=1;
+
+ for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
+ int allhits2 = end2?nhits2:nhits1;
+ char * used_hit_X2 = end2?used_hit2:used_hit1;
+ long * hits_indices_X2 = end2?hits_indices2:hits_indices1;
+
+
+ for(hit_x2 = 0; hit_x2 < allhits2; hit_x2++){
+ if(used_hit_X2[hit_x2]) continue;
+ long other_exon_id = hits_indices_X2[hit_x2];
+ if(other_exon_id == tested_exon_id){
+ used_hit_X2[ hit_x2 ]=1;
+ scoring_gap_chros[ gaps ] = (end2?hits_chro2:hits_chro1)[hit_x2];
+ scoring_gap_starts[ gaps ] = (end2?hits_start_pos2:hits_start_pos1)[hit_x2];
+ scoring_gap_lengths[ gaps ] = (end2?hits_length2:hits_length1)[hit_x2];
+ gaps ++;
+ }
+ }
+ }
+
+
+ unsigned int tested_exon_overlap_any_read = calc_score_overlaps(global_context, thread_context, scoring_gap_chros, scoring_gap_starts, scoring_gap_lengths, gaps);
+ if(applied_overlapping_threshold > tested_exon_overlap_any_read){
+ // remove this exon from lists
+
+ for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
+ int allhits2 = end2?nhits2:nhits1;
+ long * hits_indices_X2 = end2?hits_indices2:hits_indices1;
+
+ for(hit_x2 = 0; hit_x2 < allhits2; hit_x2++){
+ long other_exon_id = hits_indices_X2[hit_x2];
+ if(other_exon_id == tested_exon_id){
+ hits_indices_X2[hit_x2] = -1;
+ }
+ }
+ }
+ }
+ //#warning "DEBUG OUT 1"
+ if(0 && FIXLENstrcmp("V0112_0155:7:1102:12486:34235", read_name) == 0)
+ SUBREADprintf("OVERLAP TO %ld : %u >= %u ; EXON_SPAN=%ld ( %ld ~ %ld)\n", tested_exon_id, tested_exon_overlap_any_read, applied_overlapping_threshold, exon_span, global_context -> exontable_start[tested_exon_id], global_context -> exontable_stop[tested_exon_id]);
+ }
+ }
+ }
+
memset(used_hit1 , 0 , nhits1);
memset(used_hit2 , 0 , nhits2);
@@ -3243,6 +3383,7 @@ void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_t
int gaps = 0;
long tmp_exon_id = hits_indices_X1[hit_x1];
+ if(tmp_exon_id < 0) continue;
long score_merge_key;
if (global_context -> is_gene_level )
score_merge_key = global_context -> exontable_geneid[tmp_exon_id];
@@ -3268,6 +3409,7 @@ void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_t
for( hit_x2 = 0 ; hit_x2 < nhit_X2; hit_x2 ++ ){
if(used_hit_X2[hit_x2])continue;
+ if(hits_indices_X2[hit_x2] < 0) continue;
long X2_merge_key;
if (global_context -> is_gene_level )
@@ -3359,7 +3501,8 @@ void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_t
}else thread_context->read_counters.unassigned_nofeatures ++;
}else{
for(score_x1 = 0; score_x1 < scoring_count ; score_x1++){
- if(0 && FIXLENstrcmp("V0112_0155:7:1101:5387:6362", read_name)==0) SUBREADprintf("Scoring Overlap %s = %d >=%d, score=%d, exonid=%ld\n", read_name, scoring_overlappings[score_x1], applied_fragment_minimum_overlapping, scoring_numbers[score_x1], scoring_exon_ids[score_x1]);
+ //#warning "DEBUG OUT 2"
+ if(0 && FIXLENstrcmp("V0112_0155:7:1102:12486:34235", read_name)==0) SUBREADprintf("Scoring Overlap %s = %d >=%d, score=%d, exonid=%ld\n", read_name, scoring_overlappings[score_x1], applied_fragment_minimum_overlapping, scoring_numbers[score_x1], scoring_exon_ids[score_x1]);
//SUBREADprintf("RLTEST: %s %d\n", read_name, scoring_overlappings[score_x1]);
if( applied_fragment_minimum_overlapping > 1 )
if( applied_fragment_minimum_overlapping > scoring_overlappings[score_x1] ){
@@ -3731,7 +3874,7 @@ void fc_thread_init_input_files(fc_thread_global_context_t * global_context, cha
}
-void fc_thread_init_global_context(fc_thread_global_context_t * global_context, unsigned int buffer_size, unsigned short threads, int line_length , int is_PE_data, int min_pe_dist, int max_pe_dist, int is_gene_level, int is_overlap_allowed, int is_strand_checked, char * output_fname, int is_sam_out, int is_both_end_required, int is_chimertc_disallowed, int is_PE_distance_checked, char *feature_name_column, char * gene_id_column, int min_map_qual_score, int is_multi_mapping_allowed, int i [...]
+void fc_thread_init_global_context(fc_thread_global_context_t * global_context, unsigned int buffer_size, unsigned short threads, int line_length , int is_PE_data, int min_pe_dist, int max_pe_dist, int is_gene_level, int is_overlap_allowed, int is_strand_checked, char * output_fname, int is_sam_out, int is_both_end_required, int is_chimertc_disallowed, int is_PE_distance_checked, char *feature_name_column, char * gene_id_column, int min_map_qual_score, int is_multi_mapping_allowed, int i [...]
{
int x1;
@@ -3779,9 +3922,10 @@ void fc_thread_init_global_context(fc_thread_global_context_t * global_context,
global_context -> three_end_extension = threeEndExtension;
global_context -> fragment_minimum_overlapping = minFragmentOverlap;
global_context -> fractional_minimum_overlapping = fracOverlap;
+ global_context -> fractional_minimum_feature_overlapping = frac_feature_overlap;
global_context -> use_overlapping_break_tie = useOverlappingBreakTie;
- global_context -> need_calculate_fragment_len = ( global_context -> fractional_minimum_overlapping > 1E-10 );
- global_context -> need_calculate_overlap_len = global_context -> fractional_minimum_overlapping > 1E-10 || (global_context -> fragment_minimum_overlapping > 1) || global_context -> use_overlapping_break_tie;
+ global_context -> need_calculate_fragment_len = ( global_context -> fractional_minimum_overlapping > 1E-10 ) || (global_context -> fractional_minimum_feature_overlapping > 1E-10);
+ global_context -> need_calculate_overlap_len = (global_context -> fractional_minimum_overlapping > 1E-10) || (global_context -> fragment_minimum_overlapping > 1) || global_context -> use_overlapping_break_tie || (global_context -> fractional_minimum_feature_overlapping > 1E-10);
global_context -> debug_command = debug_command;
global_context -> max_M = max_M;
global_context -> max_BAM_header_size = buffer_size;
@@ -4354,6 +4498,7 @@ static struct option long_options[] =
{"read2pos", required_argument, 0, 0},
{"minOverlap", required_argument, 0, 0},
{"fracOverlap", required_argument, 0, 0},
+ {"fracOverlapFeature", required_argument, 0, 0},
{"splitOnly", no_argument, 0, 0},
{"nonSplitOnly", no_argument, 0, 0},
{"debugCommand", required_argument, 0, 0},
@@ -4443,6 +4588,10 @@ void print_usage()
SUBREADputs(" and '--minOverlap' option need to be satisfied for read");
SUBREADputs(" assignment.");
SUBREADputs("");
+ SUBREADputs(" --fracOverlapFeature <float> Minimum fraction of bases included in a feature");
+ SUBREADputs(" that is required for overlapping with a read or a read-");
+ SUBREADputs(" pair. Value should be within range [0,1]. 0 by default.");
+ SUBREADputs("");
SUBREADputs(" --largestOverlap Assign reads to a meta-feature/feature that has the ");
SUBREADputs(" largest number of overlapping bases.");
SUBREADputs("");
@@ -5005,10 +5154,11 @@ int readSummary(int argc,char *argv[]){
43: as.numeric(assign_reads_to_RG) # 1: reads with "RG" tags will be assigned to read groups' 0: default setting
44: as.numeric(long_read_minimum_length) # Reads longer than this will be assigned as long reads (no multi-threading)
45: as.numeric(is_verbose) # 1: show the mismatched chromosome names on screet; 0: don't do so
+ 46: as.numeric(frac_feature_overlap) # fraction of the feature to be overlapped with a read
*/
int isStrandChecked, isCVersion, isChimericDisallowed, isPEDistChecked, minMappingQualityScore=0, isInputFileResortNeeded, feature_block_size = 20, reduce_5_3_ends_to_one, useStdinFile, assignReadsToRG, long_read_minimum_length, is_verbose;
- float fracOverlap;
+ float fracOverlap, fracOverlapFeature;
char **chr;
long *start, *stop;
int *geneid;
@@ -5094,7 +5244,7 @@ int readSummary(int argc,char *argv[]){
isInputFileResortNeeded = atoi(argv[23]);
else isInputFileResortNeeded = 0;
if(thread_number<1) thread_number=1;
- if(thread_number>16)thread_number=16;
+ if(thread_number>FC_MAX_THREADS)thread_number=FC_MAX_THREADS;
int Param_fiveEndExtension, Param_threeEndExtension;
if(argc>25)
@@ -5200,11 +5350,15 @@ int readSummary(int argc,char *argv[]){
is_verbose = (argv[45][0]=='1');
else is_verbose = 0;
+ if(argc>46)
+ fracOverlapFeature = atof(argv[46]);
+ else fracOverlapFeature = 0.0;
+
if(SAM_pairer_warning_file_open_limit()) return -1;
fc_thread_global_context_t global_context;
- fc_thread_init_global_context(& global_context, FEATURECOUNTS_BUFFER_SIZE, thread_number, MAX_LINE_LENGTH, isPE, minPEDistance, maxPEDistance,isGeneLevel, isMultiOverlapAllowed, isStrandChecked, (char *)argv[3] , isReadSummaryReport, isBothEndRequired, isChimericDisallowed, isPEDistChecked, nameFeatureTypeColumn, nameGeneIDColumn, minMappingQualityScore,isMultiMappingAllowed, 0, alias_file_name, cmd_rebuilt, isInputFileResortNeeded, feature_block_size, isCVersion, fiveEndExtension, thre [...]
+ fc_thread_init_global_context(& global_context, FEATURECOUNTS_BUFFER_SIZE, thread_number, MAX_LINE_LENGTH, isPE, minPEDistance, maxPEDistance,isGeneLevel, isMultiOverlapAllowed, isStrandChecked, (char *)argv[3] , isReadSummaryReport, isBothEndRequired, isChimericDisallowed, isPEDistChecked, nameFeatureTypeColumn, nameGeneIDColumn, minMappingQualityScore,isMultiMappingAllowed, 0, alias_file_name, cmd_rebuilt, isInputFileResortNeeded, feature_block_size, isCVersion, fiveEndExtension, thre [...]
fc_thread_init_input_files( & global_context, argv[2], &file_name_ptr );
@@ -5385,36 +5539,38 @@ int readSummary(int argc,char *argv[]){
}
if(global_context.assign_reads_to_RG){
- int buck_i;
- for(buck_i = 0; buck_i < merged_RG_table -> numOfBuckets; buck_i++){
- KeyValuePair * cursor = merged_RG_table -> bucketArray[buck_i];
- while(cursor){
- char * rg_name = (char*) cursor -> key;
- void ** tab4 = cursor -> value;
+ int rgcur;
+ char * rg_name = global_context.RGnames_set;
+ for(rgcur = 0; rgcur < global_context.RGnames_ptr+1; rgcur ++){
+ if(global_context.RGnames_set[rgcur] == '\t'||global_context.RGnames_set[rgcur] == '\0'){
+ global_context.RGnames_set[rgcur] = 0;
int rg_name_len = strlen(rg_name);
- int file_len = strlen(mem_file_name);
-
- char * rg_file_name = malloc(rg_name_len + 3 + file_len);
- sprintf(rg_file_name, "%s:%s", mem_file_name, rg_name);
- free(rg_name);
-
- ArrayListPush(table_column_names, rg_file_name);
- ArrayListPush(table_columns, tab4[0]);
- ArrayListPush(read_counters, tab4[1]);
- if(global_context.do_junction_counting){
- ArrayListPush(junction_global_table_list,tab4[2]);
- ArrayListPush(splicing_global_table_list,tab4[3]);
+ if(rg_name_len > 0){
+ // SUBREADprintf("GET 4Tab:'%s'\n", rg_name);
+ void ** tab4 = HashTableGet(merged_RG_table, rg_name);
+ int file_len = strlen(mem_file_name);
+
+ char * rg_file_name = malloc(rg_name_len + 3 + file_len);
+ sprintf(rg_file_name, "%s:%s", mem_file_name, rg_name);
+
+ ArrayListPush(table_column_names, rg_file_name);
+ ArrayListPush(table_columns, tab4[0]);
+ ArrayListPush(read_counters, tab4[1]);
+ if(global_context.do_junction_counting){
+ ArrayListPush(junction_global_table_list,tab4[2]);
+ ArrayListPush(splicing_global_table_list,tab4[3]);
+ }
+ rg_name = global_context.RGnames_set + rgcur + 1;
}
- cursor = cursor->next;
}
}
-
free(mem_file_name);
}
total_written_coulmns ++;
}
global_context.is_paired_end_mode_assign = orininal_isPE;
next_fn = strtok_r(NULL, ";", &tmp_pntr);
+ if(global_context.assign_reads_to_RG) free(global_context.RGnames_set);
if(merged_RG_table) HashTableDestroy(merged_RG_table);
}
@@ -5479,6 +5635,7 @@ int readSummary(int argc,char *argv[]){
HashTableDestroy(global_context.junction_features_table);
}
+
free(global_context.unistr_buffer_space);
free(loaded_features);
free(geneid);
@@ -5632,7 +5789,7 @@ int main(int argc, char ** argv)
int feature_count_main(int argc, char ** argv)
#endif
{
- char * Rargv[46];
+ char * Rargv[47];
char annot_name[300];
char temp_dir[300];
char * out_name = malloc(300);
@@ -5652,6 +5809,7 @@ int feature_count_main(int argc, char ** argv)
char min_qual_score_str[11];
char feature_block_size_str[11];
char Strand_Sensitive_Str[11];
+ char strFeatureFracOverlap[11];
char Pair_Orientations[3];
char * very_long_file_names;
int is_Input_Need_Reorder = 0;
@@ -5682,7 +5840,7 @@ int feature_count_main(int argc, char ** argv)
int c;
int very_long_file_names_size = 200;
int fiveEndExtension = 0, threeEndExtension = 0, minFragmentOverlap = 1;
- float fracOverlap = 0.0;
+ float fracOverlap = 0.0, fracOverlapFeature = 0.0;
int std_input_output_mode = 0, long_read_mode = 0, is_verbose = 0;
char strFiveEndExtension[11], strThreeEndExtension[11], strMinFragmentOverlap[11], fracOverlapStr[20], std_input_output_mode_str[11], long_read_mode_str[11];
very_long_file_names = malloc(very_long_file_names_size);
@@ -5762,7 +5920,7 @@ int feature_count_main(int argc, char ** argv)
strcpy(nameGeneIDColumn, optarg);
break;
case 'T':
- if(!is_valid_digit_range(optarg, "T", 1, 64))
+ if(!is_valid_digit_range(optarg, "T", 1, FC_MAX_THREADS))
STANDALONE_exit(-1);
threads = atoi(optarg);
@@ -5865,6 +6023,15 @@ int feature_count_main(int argc, char ** argv)
fracOverlap = atof(optarg);
}
+
+ if(strcmp("fracOverlapFeature", long_options[option_index].name)==0)
+ {
+ if(!is_valid_float(optarg, "fracOverlapFeature"))
+ STANDALONE_exit(-1);
+ fracOverlapFeature = atof(optarg);
+ }
+
+
if(strcmp("minOverlap", long_options[option_index].name)==0)
{
if(!is_valid_digit(optarg, "minOverlap"))
@@ -5989,6 +6156,7 @@ int feature_count_main(int argc, char ** argv)
sprintf(fracOverlapStr, "%g", fracOverlap);
sprintf(std_input_output_mode_str,"%d",std_input_output_mode);
sprintf(long_read_mode_str, "%d", long_read_mode);
+ sprintf(strFeatureFracOverlap, "%g", fracOverlapFeature);
Rargv[0] = "CreadSummary";
Rargv[1] = annot_name;
@@ -6036,10 +6204,11 @@ int feature_count_main(int argc, char ** argv)
Rargv[43] = assign_reads_to_RG?"1":"0";
Rargv[44] = long_read_mode_str;
Rargv[45] = is_verbose?"1":"0";
+ Rargv[46] = strFeatureFracOverlap;
int retvalue = -1;
if(is_ReadSummary_Report && (std_input_output_mode & 1)==1) SUBREADprintf("ERROR: no detailed assignment results can be written when the input is from STDIN. Please remove the '-R' option.\n");
- else retvalue = readSummary(46, Rargv);
+ else retvalue = readSummary(47, Rargv);
free(very_long_file_names);
free(out_name);
diff --git a/src/subread.h b/src/subread.h
index e024432..0b97098 100644
--- a/src/subread.h
+++ b/src/subread.h
@@ -54,6 +54,7 @@
#define MAX_THREADS 40
+#define FC_MAX_THREADS 64
#define MAX_EVENTS_IN_READ 8
//#warning "============== REMOVE '* 15' FROM THE NEXT LINE ================"
@@ -243,16 +244,6 @@ typedef struct {
} gene_t;
-typedef struct{
- unsigned int memory_block_size;
- unsigned int start_base_offset;
- unsigned int start_point;
- unsigned int length;
- unsigned char * values;
- unsigned int values_bytes;
-} gene_value_index_t;
-
-
struct gehash_bucket {
int current_items;
@@ -276,6 +267,18 @@ typedef struct {
} gehash_t;
+typedef struct{
+ unsigned int memory_block_size;
+ unsigned int start_base_offset;
+ unsigned int start_point;
+ unsigned int length;
+ unsigned char * values;
+ unsigned int values_bytes;
+ void * appendix1;
+ void * appendix2;
+} gene_value_index_t;
+
+
typedef struct {
gene_vote_number_t max_vote;
gehash_data_t max_position;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/subread.git
More information about the debian-med-commit
mailing list