[med-svn] [sniffles] 01/07: New upstream version 1.0.2+ds
Andreas Tille
tille at debian.org
Fri Jan 13 14:39:33 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository sniffles.
commit b4da08b633cd6529b757cc77d635978dc93d7b4c
Author: Andreas Tille <tille at debian.org>
Date: Fri Jan 13 15:00:51 2017 +0100
New upstream version 1.0.2+ds
---
CMakeLists.txt | 4 +-
README.md | 90 ++--------
src/Alignment.cpp | 367 ++++++++++++++++++++++++++-------------
src/Alignment.h | 9 +-
src/BamParser.cpp | 1 +
src/CMakeLists.txt | 2 +
src/Genotyper/Genotyper.cpp | 50 ++++--
src/Genotyper/Genotyper.h | 2 +-
src/Paramer.h | 8 +-
src/Sniffles.cpp | 240 ++++++++++++++++++--------
src/Version.h | 6 +-
src/cluster/Cluster_SVs.cpp | 53 +++---
src/cluster/Cluster_SVs.h | 8 +-
src/print/BedpePrinter.cpp | 2 +-
src/print/IPrinter.cpp | 151 +++++++++++++++-
src/print/IPrinter.h | 15 +-
src/print/VCFPrinter.cpp | 171 +++++++++++-------
src/sub/Breakpoint.cpp | 381 +++++++++++++++++++++++++---------------
src/sub/Breakpoint.h | 22 ++-
src/sub/Detect_Breakpoints.cpp | 384 ++++++++++++++++++++++++++++-------------
src/sub/Detect_Breakpoints.h | 14 +-
src/test | 0
src/tree/IntervallContainer.h | 27 +++
src/tree/IntervallList.cpp | 43 +++++
src/tree/IntervallList.h | 32 ++++
src/tree/IntervallTree.cpp | 66 +++++--
src/tree/IntervallTree.h | 10 +-
src/tree/TNode.h | 9 -
28 files changed, 1483 insertions(+), 684 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d20302..cdf545c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,9 +5,9 @@ set( SNIF_VERSION_MAJOR 1 )
set( SNIF_VERSION_MINOR 0 )
IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Building in debug mode!")
- set( SNIF_VERSION_BUILD 0-debug )
+ set( SNIF_VERSION_BUILD 2-debug )
else()
- set( SNIF_VERSION_BUILD 0 )
+ set( SNIF_VERSION_BUILD 2 )
ENDIF()
diff --git a/README.md b/README.md
index 9630427..27c253d 100644
--- a/README.md
+++ b/README.md
@@ -1,88 +1,24 @@
# Sniffles
-Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires output from BWA-MEM with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck at gmail.com
+Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter) or NGM-LR with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck at gmail.com
-**************************************
-
-INSTALL:
-Download Sniffles:
-```
-git clone https://github.com/fritzsedlazeck/Sniffles
-```
+Please see our github wiki for more information (https://github.com/fritzsedlazeck/Sniffles/wiki)
- cd Sniffles
-
- mkdir build
-
- cd build
-
- cmake ..
-
- make
-
- cd ../bin/
-
-
**************************************
-USAGE:
-```
-./sniffles -m reads.bam -v calls.vcf
-```
-
-Options:
-
- ./sniffles -m <string> [-s <int>] [--max_num_splits <int>] [-q <int>]
- [-l <int>] [-v <string>] [--bede <string>] [-c <int>] [-t
- <int>] [-d <int>] [-n <int>] [--use_MD_Cigar] [--]
- [--version] [-h]
-
-
-Where:
-
- -m <string>, --mapped_reads <string>
- (required) Bam File
+## NextGenMap-LR: (NGM-LR)
- -s <int>, --min_support <int>
- Minimum number of reads that support a SV. Default: 10
- --max_num_splits <int>
- Maximum number of splits per read to be still taken into account.
- Default: 4
-
- -q <int>, --minmapping_qual <int>
- Minimum Mapping Quality. Default: 20
-
- -l <int>, --min_length <int>
- Minimum length of SV to be reported. Default:0
-
- -v <string>, --vcf <string>
- VCF output file name
-
- --bede <string>
- Simplified format of bede Format.
-
- -c <int>, --min_cigar_event <int>
- Minimum Cigar Event (e.g. Insertion, deletion) to take into account.
- Default:50
-
- -t <int>, --threads <int>
- Number of threads to use. Default: 3
-
- -d <int>, --max_distance <int>
- Maximum distance to group SV together. Default: 1kb
-
- -n <int>, --num_reads_report <int>
- Report up to N reads that support the SV. Default: 0
-
- --, --ignore_rest
- Ignores the rest of the labeled arguments following this flag.
-
- --version
- Displays version information and exits.
+Sniffles performs best with the mappings of NGM-LR our novel long read mapping method.
+Please see:
+https://github.com/philres/nextgenmap-lr
+
+**************************************
+## Poster & Talks:
- -h, --help
- Displays usage information and exits.
+[Accurate and fast detection of complex and nested structural variations using long read technologies](http://schatzlab.cshl.edu/presentations/2016/2016.10.28.BIODATA.PacBioSV.pdf)
+Biological Data Science, Cold Spring Harbor Laboratory, Cold Spring Harbor, NY, 26 - 29.10.2016
+[NextGenMap-LR: Highly accurate read mapping of third generation sequencing reads for improved structural variation analysis](http://www.cibiv.at/~philipp_/files/gi2016_poster_phr.pdf)
+Genome Informatics 2016, Wellcome Genome Campus Conference Centre, Hinxton, Cambridge, UK, 19.09.-2.09.2016
- Sniffles version 1.0.0
diff --git a/src/Alignment.cpp b/src/Alignment.cpp
index 68d1239..46e202d 100644
--- a/src/Alignment.cpp
+++ b/src/Alignment.cpp
@@ -88,7 +88,8 @@ vector<differences_str> Alignment::summarizeAlignment() {
pos += al->CigarData[i].Length;
} else if (al->CigarData[i].Type == 'N') {
pos += al->CigarData[i].Length;
- } else if (al->CigarData[i].Type == 'S' && al->CigarData[i].Length > 4000) {
+ }
+ /*else if (al->CigarData[i].Type == 'S' && al->CigarData[i].Length > Parameter::Instance()->huge_ins) { /// Used for reads ranging into an inser
string sa;
al->GetTag("SA", sa);
if (sa.empty()) { // == no split reads!
@@ -103,7 +104,7 @@ vector<differences_str> Alignment::summarizeAlignment() {
ev.type = Parameter::Instance()->huge_ins * -1; //insertion: WE have to fix the length since we cannot estimate it!]
events.push_back(ev);
}
- }
+ }*/
}
if (flag) {
@@ -357,7 +358,15 @@ int Alignment::getAlignmentFlag() {
return al->AlignmentFlag;
}
string Alignment::getQueryBases() {
- return al->QueryBases;
+ if (al != NULL) {
+ return al->QueryBases;
+ } else {
+ return "";
+ }
+}
+void Alignment::clear_QueryBases() {
+ al->QueryBases.clear();
+ al->QueryBases = "";
}
string Alignment::getQualities() {
return al->Qualities;
@@ -453,95 +462,216 @@ void Alignment::get_coords(aln_str tmp, int & start, int &stop) {
}
stop = get_readlen(tmp.cigar) + start;
}
-
void Alignment::check_entries(vector<aln_str> &entries) {
-//Parameter::Instance()->read_name="0097b24b-1052-4c76-8a41-b66980e076f7_Basecall_Alignment_template";
+
bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
-//Given that we start outside of the INV:
if (flag) {
- cout << "Check:" << endl;
+ std::cout << "Nested? " << std::endl;
for (size_t i = 0; i < entries.size(); i++) {
- cout << "ENT: " << entries[i].pos << " " << entries[i].pos + entries[i].length << " Read: " << entries[i].read_pos_start << " " << entries[i].read_pos_stop << " ";
+ std::cout << entries[i].pos << "-" << entries[i].pos + entries[i].length << "(" << entries[i].read_pos_start << "-" << entries[i].read_pos_stop << ")";
if (entries[i].strand) {
- cout << "+" << endl;
+ std::cout << "+ ";
} else {
- cout << "-" << endl;
+ std::cout << "- ";
+ }
+ }
+ std::cout << std::endl;
+ }
+ int chr = entries[0].RefID;
+ bool strand = entries[0].strand;
+ int strands = 1;
+ int valid = 1;
+ for (size_t i = 1; i < entries.size(); i++) {
+ if (entries[i].read_pos_stop - entries[i].read_pos_start > 200) {
+ valid++;
+ if (chr != entries[i].RefID) {
+ return;
+ }
+ if (strand != entries[i].strand) {
+ strands++;
+ strand = entries[i].strand;
}
}
}
- bool left_of = true;
- vector<aln_str> new_entries = entries;
+ if (flag) {
+ std::cout << "summary: " << strands << " " << valid << std::endl;
+ }
+ if (strands < 3 || valid < 3) {
+ return;
+ }
+
for (size_t i = 1; i < entries.size(); i++) {
- if (entries[i].strand != entries[i - 1].strand && entries[i].RefID == entries[i - 1].RefID) {
- int ref_dist = 0;
- int read_dist = 0;
+ int ref_dist = 0;
+ int read_dist = 0;
+ if (entries[i - 1].strand) {
+ ref_dist = abs((entries[i - 1].pos + entries[i - 1].length) - entries[i].pos);
+ read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
+ } else {
+ ref_dist = abs((entries[i - 1].pos) - (entries[i].pos + entries[i].length));
+ read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
+ }
+
+ if (abs(entries[i - 1].pos - entries[i].pos) < 100) { //inv dup:
+ aln_str tmp;
+ tmp.RefID = entries[i].RefID;
+ tmp.strand = !entries[i].strand;
+ tmp.mq = 60;
+ tmp.length = 1;
+ tmp.pos = entries[i].pos + entries[i].length;
+ tmp.read_pos_start = entries[i].read_pos_stop; //fake...
+
if (entries[0].strand) {
- ref_dist = abs((entries[i - 1].pos + entries[i - 1].length) - entries[i].pos);
- read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
+ tmp.pos = entries[i - 1].pos + entries[i - 1].length;
+ tmp.read_pos_start = entries[i - 1].read_pos_stop; //fake...
+ tmp.strand = !tmp.strand;
} else {
- ref_dist = abs((entries[i - 1].pos) - (entries[i].pos + entries[i].length));
- read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
-
+ tmp.pos = entries[i].pos + entries[i].length;
+ tmp.read_pos_start = entries[i].read_pos_stop; //fake...
}
- if (flag) {
- cout << "ref dist: " << ref_dist << " Read: " << read_dist<<" DIFF: "<<abs(ref_dist - read_dist) << endl;
- }
- if (abs(ref_dist - read_dist) > Parameter::Instance()->min_length) {
- //ref_dist > 30 &&
- if (read_dist < Parameter::Instance()->min_length && ref_dist / (read_dist + 1) > 3) { //+1 because otherwise there could be a division by 0!
- if (flag) {
- cout << "DEL? " << ref_dist << " " << read_dist << endl;
- }
- aln_str tmp;
- tmp.RefID = entries[i].RefID;
- if (left_of) {
- tmp.strand = entries[i - 1].strand;
- if (tmp.strand) {
- tmp.pos = entries[i].pos - 1;
- } else {
- tmp.pos = entries[i].pos + entries[i].length - 1;
- }
- left_of = false;
- } else {
- tmp.strand = entries[i].strand;
- if (tmp.strand) {
- tmp.pos = entries[i - 1].pos + entries[i - 1].length - 1;
- } else {
- tmp.pos = entries[i - 1].pos - 1;
- }
- left_of = true;
- }
- tmp.length = 1;
- tmp.read_pos_start = entries[i].read_pos_start - 1;
- tmp.read_pos_stop = tmp.read_pos_start + 1;
- tmp.mq = 60;
- sort_insert(tmp, new_entries);
- } else if (read_dist > Parameter::Instance()->min_length && ref_dist < 10) {
- //cout << "INS? " << this->getName() << endl;
- }
+ tmp.read_pos_stop = tmp.read_pos_start + 1;
+ entries.insert(entries.begin() + (i), tmp);
+ break;
+ }
+
+ if (abs(ref_dist - read_dist) > Parameter::Instance()->min_length) { //distances between the inversion and the other split reads!
+ aln_str tmp;
+ tmp.RefID = entries[i].RefID;
+ tmp.strand = !entries[i].strand;
+ tmp.length = 1;
+ tmp.mq = 60;
+
+ //before the current element:
+
+ tmp.pos = entries[i].pos - 1;
+ tmp.read_pos_start = entries[i].read_pos_start - 1;
+ tmp.read_pos_stop = tmp.read_pos_start + 1;
+
+ //sort_insert(tmp, new_entries); //read_pos_start
+ aln_str tmp2;
+ tmp2 = tmp;
+ //after the current element:
+ tmp2.pos = entries[i].pos + entries[i].length;
+ tmp2.read_pos_start = entries[i].read_pos_stop; //fake...
+ tmp2.read_pos_stop = tmp2.read_pos_start + 1;
+ //sort_insert(tmp, new_entries);
+ if (entries[i - 1].strand) {
+ entries.insert(entries.begin() + (i + 1), tmp2);
+ entries.insert(entries.begin() + (i), tmp);
+ } else {
+ int start = tmp.read_pos_start;
+ tmp.read_pos_start = tmp2.read_pos_start;
+ tmp2.read_pos_start = start;
+ tmp2.read_pos_stop = tmp2.read_pos_start + 1;
+ tmp.read_pos_stop = tmp.read_pos_start + 1;
+ entries.insert(entries.begin() + (i + 1), tmp);
+ entries.insert(entries.begin() + (i), tmp2);
}
- } else {
- left_of = true; //??
+ break;
}
+
}
- if (entries.size() < new_entries.size()) {
- entries = new_entries;
+ if (flag) {
+ for (size_t i = 0; i < entries.size(); i++) {
+ std::cout << entries[i].pos << "-" << entries[i].pos + entries[i].length << "(" << entries[i].read_pos_start << "-" << entries[i].read_pos_stop << ")";
+ if (entries[i].strand) {
+ std::cout << "+ ";
+ } else {
+ std::cout << "- ";
+ }
+ }
+ std::cout << std::endl;
}
}
+/*void Alignment::check_entries(vector<aln_str> &entries) { //something is not working!
+ bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
+
+ //Given that we start outside of the INV:
+ if (flag) {
+ cout << "Check:" << endl;
+ for (size_t i = 0; i < entries.size(); i++) {
+ cout << "ENT: " << entries[i].pos << " " << entries[i].pos + entries[i].length << " Read: " << entries[i].read_pos_start << " " << entries[i].read_pos_stop << " ";
+ if (entries[i].strand) {
+ cout << "+" << endl;
+ } else {
+ cout << "-" << endl;
+ }
+ }
+ check_entries2(entries);
+ }
+ bool left_of = true;
+ vector<aln_str> new_entries = entries;
+ for (size_t i = 1; i < entries.size(); i++) {
+ if (entries[i].strand != entries[i - 1].strand && entries[i].RefID == entries[i - 1].RefID) {
+ int ref_dist = 0;
+ int read_dist = 0;
+ //all comp with abs:
+ //maybe try without abs!:
+ if (entries[0].strand) {
+ ref_dist = abs((entries[i - 1].pos + entries[i - 1].length) - entries[i].pos);
+ read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
+ } else {
+ ref_dist = abs((entries[i - 1].pos) - (entries[i].pos + entries[i].length));
+ read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
+ }
+ if (flag) {
+ cout << "ref dist: " << ref_dist << " Read: " << read_dist << " DIFF: " << abs(ref_dist - read_dist) << endl;
+ }
+
+ if (abs(ref_dist - read_dist) > Parameter::Instance()->min_length) {
+ //ref_dist > 30 &&
+ if (read_dist < Parameter::Instance()->min_length && ref_dist / (read_dist + 1) > 3) { //+1 because otherwise there could be a division by 0!
+ if (flag) {
+ cout << "DEL? " << ref_dist << " " << read_dist << endl;
+ }
+ aln_str tmp;
+ tmp.RefID = entries[i].RefID;
+ if (left_of) {
+ tmp.strand = entries[i - 1].strand;
+ if (tmp.strand) {
+ tmp.pos = entries[i].pos - 1;
+ } else {
+ tmp.pos = entries[i].pos + entries[i].length - 1;
+ }
+ left_of = false;
+ } else {
+ tmp.strand = entries[i].strand;
+ if (tmp.strand) {
+ tmp.pos = entries[i - 1].pos + entries[i - 1].length - 1;
+ } else {
+ tmp.pos = entries[i - 1].pos - 1;
+ }
+ left_of = true;
+ }
+ tmp.length = 1;
+ tmp.read_pos_start = entries[i].read_pos_start - 1;
+ tmp.read_pos_stop = tmp.read_pos_start + 1;
+ tmp.mq = 60;
+ sort_insert(tmp, new_entries);
+ } else if (read_dist > Parameter::Instance()->min_length && ref_dist < 10) {
+ //cout << "INS? " << this->getName() << endl;
+ }
+ }
+ } else {
+ left_of = true; //??
+ }
+ }
+ if (entries.size() < new_entries.size()) {
+ entries = new_entries;
+ }
+ }*/
void Alignment::sort_insert(aln_str tmp, vector<aln_str> &entries) {
- //TODO detect abnormal distances + directions -> introduce a pseudo base to detect these things later?
for (vector<aln_str>::iterator i = entries.begin(); i != entries.end(); i++) {
- if ((tmp.read_pos_start == (*i).read_pos_start) && (tmp.pos == (*i).pos) && (tmp.strand == (*i).strand)) { //is the same! should not happen
- return;
- }
- if (!tmp.cigar.empty() && ((*i).read_pos_start <= tmp.read_pos_start && (*i).read_pos_stop >= tmp.read_pos_stop)) { //check for the additional introducded entries
- return;
- }
- if (!tmp.cigar.empty() && ((*i).read_pos_start >= tmp.read_pos_start && (*i).read_pos_stop <= tmp.read_pos_stop)) { //check for the additional introducded entries
- (*i) = tmp;
- return;
- }
+ /* if ((tmp.read_pos_start == (*i).read_pos_start) && (tmp.pos == (*i).pos) && (tmp.strand == (*i).strand)) { //is the same! should not happen
+ return;
+ }
+ if (!tmp.cigar.empty() && ((*i).read_pos_start <= tmp.read_pos_start && (*i).read_pos_stop >= tmp.read_pos_stop)) { //check for the additional introducded entries
+ return;
+ }
+ if (!tmp.cigar.empty() && ((*i).read_pos_start >= tmp.read_pos_start && (*i).read_pos_stop <= tmp.read_pos_stop)) { //check for the additional introducded entries
+ (*i) = tmp;
+ return;
+ }*/
if ((tmp.read_pos_start < (*i).read_pos_start)) { //insert before
entries.insert(i, tmp);
return;
@@ -549,6 +679,17 @@ void Alignment::sort_insert(aln_str tmp, vector<aln_str> &entries) {
}
entries.push_back(tmp);
}
+
+bool Alignment::overlapping_segments(vector<aln_str> entries) {
+ bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
+ if (flag) {
+ std::cout << "HO: " << entries.size() << std::endl;
+ for (size_t i = 0; i < entries.size(); i++) {
+ std::cout << "Seg: " << i << " " << entries[i].pos << " " << entries[i].length << std::endl;
+ }
+ }
+ return (entries.size() == 2 && abs(entries[0].pos - entries[1].pos) < 100);
+}
vector<aln_str> Alignment::getSA(RefVector ref) {
string sa;
vector<aln_str> entries;
@@ -631,7 +772,7 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
}
i++;
}
- if (nested && entries.size() > 2) {
+ if (nested && entries.size() > 2 || overlapping_segments(entries)) {
check_entries(entries);
}
if (flag) {
@@ -721,7 +862,7 @@ vector<str_event> Alignment::get_events_CIGAR() {
if (al->CigarData[i].Type == 'H' || (al->CigarData[i].Type == 'S' || al->CigarData[i].Type == 'M')) {
read_pos += al->CigarData[i].Length;
}
- if (al->CigarData[i].Type == 'D' && al->CigarData[i].Length > Parameter::Instance()->min_cigar_event) {
+ if (al->CigarData[i].Type == 'D' && al->CigarData[i].Length > Parameter::Instance()->min_length) {
str_event ev;
ev.read_pos = read_pos;
ev.length = al->CigarData[i].Length; //deletion
@@ -729,7 +870,7 @@ vector<str_event> Alignment::get_events_CIGAR() {
includes_SV = true;
events.push_back(ev);
}
- if (al->CigarData[i].Type == 'I' && al->CigarData[i].Length > Parameter::Instance()->min_cigar_event) {
+ if (al->CigarData[i].Type == 'I' && al->CigarData[i].Length > Parameter::Instance()->min_length) {
// std::cout<<"CIGAR: "<<al->CigarData[i].Length<<" "<<this->getName()<<std::endl;
str_event ev;
ev.length = al->CigarData[i].Length * -1; //insertion;
@@ -907,15 +1048,19 @@ vector<str_event> Alignment::get_events_MD(int min_mis) {
vector<int> Alignment::get_avg_diff(double & dist) {
- //computeAlignment();
- //cout<<alignment.first<<endl;
- //cout<<alignment.second<<endl;
-
- vector<differences_str> event_aln = summarizeAlignment();
+//computeAlignment();
+//cout<<alignment.first<<endl;
+//cout<<alignment.second<<endl;
vector<int> mis_per_window;
+ vector<differences_str> event_aln = summarizeAlignment();
+ if (event_aln.empty()) {
+ dist = 0;
+ return mis_per_window;
+ }
+
PlaneSweep_slim * plane = new PlaneSweep_slim();
int min_tresh = 5; //reflects a 10% error rate.
- //compute the profile of differences:
+//compute the profile of differences:
for (size_t i = 0; i < event_aln.size(); i++) {
if (i != 0) {
dist += event_aln[i].position - event_aln[i - 1].position;
@@ -938,19 +1083,19 @@ vector<int> Alignment::get_avg_diff(double & dist) {
vector<str_event> Alignment::get_events_Aln() {
- //clock_t comp_aln = clock();
+//clock_t comp_aln = clock();
vector<differences_str> event_aln = summarizeAlignment();
- //double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: ");
+//double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: ");
vector<str_event> events;
PlaneSweep_slim * plane = new PlaneSweep_slim();
vector<pair_str> profile;
// comp_aln = clock();
- //Parameter::Instance()->read_name = "21_30705246";
+
bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
- //cout<<" IDENT: "<<(double)event_aln.size()/(double)this->al->Length << " "<<this->getName().c_str()<<endl;
- //compute the profile of differences:
+ int noise_events = 0;
+//compute the profile of differences:
for (size_t i = 0; i < event_aln.size(); i++) {
pair_str tmp;
tmp.position = -1;
@@ -961,28 +1106,18 @@ vector<str_event> Alignment::get_events_Aln() {
}
if (tmp.position != -1 && (profile.empty() || (tmp.position - profile[profile.size() - 1].position) > 100)) { //for noisy events;
profile.push_back(tmp);
- if (flag) {
- cout << "HIT: " << event_aln[i].position << " " << tmp.coverage << endl;
- }
} else if (abs(event_aln[i].type) > Parameter::Instance()->min_length) { //for single events like NGM-LR would produce them.
tmp.position = event_aln[i].position;
+ if (flag) {
+ std::cout << "HIT" << std::endl;
+ }
profile.push_back(tmp);
}
}
- //Parameter::Instance()->meassure_time(comp_aln, "\tcompProfile: ");
- /*if (strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0) {
- int prev = 0;
- prev = getPosition();
- for (size_t i = 0; i < event_aln.size(); i++) {
- cout << event_aln[i].position - prev << " " << event_aln[i].type << endl;
-
- }
- }*/
- //comp_aln = clock();
+//comp_aln = clock();
size_t stop = 0;
size_t start = 0;
- int tot_len = 0;
for (size_t i = 0; i < profile.size(); i++) {
if (profile[i].position > event_aln[stop].position) {
//find the postion:
@@ -1038,10 +1173,12 @@ vector<str_event> Alignment::get_events_Aln() {
double insert = 0;
double del = 0;
double mismatch = 0;
-
- for (size_t k = start; k < stop; k++) {
+ if (flag) {
+ cout << start << " " << stop << endl;
+ }
+ for (size_t k = start; k <= stop; k++) {
if (flag) {
- // cout << event_aln[k].position << " " << event_aln[k].type << endl;
+ cout << event_aln[k].position << " " << event_aln[k].type << endl;
}
if (event_aln[k].type == 0) {
mismatch++;
@@ -1069,7 +1206,6 @@ vector<str_event> Alignment::get_events_Aln() {
tmp.length = (tmp.length - event_aln[start].position);
tmp.type = 0;
- //TODO constance used!
if (insert_max > Parameter::Instance()->min_length && insert > (del + del)) { //we have an insertion! //todo check || vs. &&
if (flag) {
cout << "store INS" << endl;
@@ -1077,19 +1213,24 @@ vector<str_event> Alignment::get_events_Aln() {
tmp.length = insert_max; //TODO not sure!
tmp.pos = insert_max_pos;
tmp.type |= INS;
- } else if (del_max > Parameter::Instance()->min_length && (mismatch < 2 && (insert + insert) < del)) { //deletion
+ tmp.is_noise = false;
+ } else if (del_max > Parameter::Instance()->min_length && (insert + insert) < del) { //deletion
if (flag) {
- cout << "store DEL" << endl;
+ cout << "store DEL " << this->getName() << endl;
}
+ tmp.length = del_max;
tmp.type |= DEL;
- } else if (mismatch > Parameter::Instance()->min_length) { //TODO
+ tmp.is_noise = false;
+ } else if ((mismatch+del+insert)/2 > Parameter::Instance()->min_length) { //TODO
if (flag) {
cout << "store Noise" << endl;
}
+ noise_events++;
tmp.type |= DEL;
tmp.type |= INV;
- }else{
- // std::cout<<"ERROR we did not set the type!"<<std::endl;
+ tmp.is_noise = true;
+ } else {
+ // std::cout<<"ERROR we did not set the type!"<<std::endl;
}
if (flag) {
@@ -1097,17 +1238,11 @@ vector<str_event> Alignment::get_events_Aln() {
cout << "INS max " << insert_max << " del_max " << del_max << std::endl;
cout << "INS:" << insert << " DEL: " << del << " MIS: " << mismatch << endl;
cout << event_aln[start].position << " " << event_aln[stop].position << endl;
- cout << tot_len << " " << this->getRefLength() << endl;
cout << "store: " << tmp.pos << " " << tmp.pos + abs(tmp.length) << " " << tmp.length << endl;
cout << endl;
}
- tot_len += tmp.length;
-
- if (tot_len > this->getRefLength() * 0.77) { // if the read is just noisy as hell:
- events.clear();
- return events;
- } else if(tmp.type!=0) {
+ if (tmp.type != 0) {
if (flag) {
cout << "STORE" << endl;
}
@@ -1116,7 +1251,7 @@ vector<str_event> Alignment::get_events_Aln() {
}
}
// Parameter::Instance()->meassure_time(comp_aln, "\tcompPosition: ");
- if (events.size() > 4) { //TODO very arbitrary! test?
+ if (noise_events > 4) {
events.clear();
}
return events;
diff --git a/src/Alignment.h b/src/Alignment.h
index 8d4e534..117cad4 100644
--- a/src/Alignment.h
+++ b/src/Alignment.h
@@ -22,8 +22,8 @@ const unsigned char DUP = 0x02; // hex for 0000 0010
const unsigned char INS = 0x04; // hex for 0000 0100
const unsigned char INV = 0x08; // hex for 0000 1000
const unsigned char TRA = 0x10; // hex for 0001 0000
-
-
+const unsigned char NEST =0x20; // hex for 0010 0000
+const unsigned char NA = 0x80; // hex for 1000 0000
using namespace BamTools;
using namespace std;
@@ -40,6 +40,7 @@ struct str_event{
int pos;
int read_pos;
char type;
+ bool is_noise;
};
struct aln_str{
int RefID;
@@ -69,8 +70,10 @@ private:
vector<differences_str> summarizeAlignment();
void sort_insert(aln_str tmp, vector<aln_str> &entries);
void check_entries(vector<aln_str> &entries);
+ bool overlapping_segments(vector<aln_str> entries);
public:
Alignment(){
+ al=NULL;
ref_len=0;
stop=0;
orig_length=0;
@@ -86,7 +89,7 @@ public:
void setAlignment(BamAlignment * al);
void setRef(string sequence);
void computeAlignment();
-
+ void clear_QueryBases();
pair<string,string> getSequence();
int32_t getPosition();
int32_t getRefID();
diff --git a/src/BamParser.cpp b/src/BamParser.cpp
index 8ed431d..92b92d6 100644
--- a/src/BamParser.cpp
+++ b/src/BamParser.cpp
@@ -39,6 +39,7 @@ void BamParser::parseReadFast(uint16_t mappingQv,Alignment*& align){
// getSequence().first
// align->initSequence();
align->getQueryBases().clear();
+ align->clear_QueryBases();
while(reader.GetNextAlignmentCore(al[0])){
if( al->IsMapped() && al->MapQuality > mappingQv){
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3f470b3..2ff0828 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -17,6 +17,7 @@ add_executable(sniffles
sub/Detect_Breakpoints.cpp
sub/Breakpoint.cpp
tree/IntervallTree.cpp
+ tree/IntervallList.cpp
realign/SWCPU.cpp
realign/Realign.cpp
print/VCFPrinter.cpp
@@ -40,6 +41,7 @@ add_executable(sniffles-debug
Sniffles.cpp
Ignore_Regions.cpp
tree/Intervall_bed.cpp
+ tree/IntervallList.cpp
sub/Detect_Breakpoints.cpp
sub/Breakpoint.cpp
tree/IntervallTree.cpp
diff --git a/src/Genotyper/Genotyper.cpp b/src/Genotyper/Genotyper.cpp
index ead049d..cd5bb93 100644
--- a/src/Genotyper/Genotyper.cpp
+++ b/src/Genotyper/Genotyper.cpp
@@ -57,7 +57,27 @@ std::string Genotyper::mod_breakpoint_bedpe(char *buffer, int ref) {
return entry;
}
+void Genotyper::parse_pos(char * buffer, int & pos, std::string & chr) {
+ chr="";
+ pos=-1;
+ size_t i = 0;
+ int count = 0;
+ while (buffer[i] != '\t') {
+ if (count == 1 && ((buffer[i] != '[' || buffer[i] != ']') && buffer[i] != ':')) {
+ chr += buffer[i];
+ }
+ if (count == 2 && buffer[i - 1] == ':') {
+ pos = atoi(&buffer[i]);
+ }
+ if ((buffer[i] == ']' || buffer[i] == '[') || buffer[i] == ':') {
+ count++;
+ }
+ i++;
+ }
+}
+
variant_str Genotyper::get_breakpoint_vcf(char *buffer) {
+ //TODO extend for TRA!
size_t i = 0;
int count = 0;
@@ -70,6 +90,9 @@ variant_str Genotyper::get_breakpoint_vcf(char *buffer) {
if (count == 1 && buffer[i - 1] == '\t') {
tmp.pos = atoi(&buffer[i]);
}
+ if (tmp.pos2 == -1 && (count == 4 && (buffer[i - 1] == '[' || buffer[i - 1] == ']'))) {
+ parse_pos(&buffer[i - 1],tmp.pos2,tmp.chr2);
+ }
if (count > 6 && strncmp(";CHR2=", &buffer[i], 6) == 0) {
i += 6;
@@ -124,19 +147,17 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
std::ifstream myfile;
bool is_vcf = !Parameter::Instance()->output_vcf.empty();
-
string file_name;
if (!Parameter::Instance()->output_vcf.empty()) {
- file_name=Parameter::Instance()->output_vcf;
+ file_name = Parameter::Instance()->output_vcf;
myfile.open(Parameter::Instance()->output_vcf.c_str(), std::ifstream::in);
} else if (!Parameter::Instance()->output_bedpe.empty()) {
- file_name=Parameter::Instance()->output_bedpe;
+ file_name = Parameter::Instance()->output_bedpe;
myfile.open(Parameter::Instance()->output_bedpe.c_str(), std::ifstream::in);
}
FILE*file = fopen(Parameter::Instance()->tmp_file.c_str(), "w");
-
if (!myfile.good()) {
std::cout << "SVParse: could not open file: " << std::endl;
exit(0);
@@ -157,9 +178,9 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
}
int ref = max(tree.get_ref(node, tmp.chr, tmp.pos), tree.get_ref(node, tmp.chr2, tmp.pos2));
if (is_vcf) {
- to_print=mod_breakpoint_vcf(buffer,ref);
- }else{
- to_print=mod_breakpoint_bedpe(buffer,ref);
+ to_print = mod_breakpoint_vcf(buffer, ref);
+ } else {
+ to_print = mod_breakpoint_bedpe(buffer, ref);
}
fprintf(file, "%s", to_print.c_str());
} else {
@@ -171,10 +192,10 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
myfile.close();
fclose(file);
- string move="mv ";
- move+=Parameter::Instance()->tmp_file;
- move+=" ";
- move+=file_name;
+ string move = "mv ";
+ move += Parameter::Instance()->tmp_file;
+ move += " ";
+ move += file_name;
system(move.c_str());
}
@@ -218,9 +239,12 @@ void Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_node *& node) {
//tree.inorder(node);
}
void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node) {
- FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_file.c_str(), "r");
+ std::string output = Parameter::Instance()->tmp_file.c_str();
+ output += "ref_allele";
+
+ FILE * ref_allel_reads = fopen(output.c_str(), "r");
if (ref_allel_reads == NULL) {
- std::cerr << "CovParse: could not open file: " << Parameter::Instance()->tmp_file.c_str() << std::endl;
+ std::cerr << "CovParse: could not open file: " << output.c_str() << std::endl;
}
//check if we want to compute the full coverage!
diff --git a/src/Genotyper/Genotyper.h b/src/Genotyper/Genotyper.h
index 2aca323..f0aef9f 100644
--- a/src/Genotyper/Genotyper.h
+++ b/src/Genotyper/Genotyper.h
@@ -27,7 +27,7 @@ private:
variant_str get_breakpoint_bedpe(char *buffer);
std::string mod_breakpoint_vcf(char *buffer, int ref);
std::string mod_breakpoint_bedpe(char *buffer, int ref);
-
+ void parse_pos(char * buffer, int & pos, std::string & chr);
public:
Genotyper(){
node=NULL;
diff --git a/src/Paramer.h b/src/Paramer.h
index e68a654..e75411f 100644
--- a/src/Paramer.h
+++ b/src/Paramer.h
@@ -24,7 +24,8 @@ class Parameter {
private:
Parameter() {
window_thresh=10;//TODO check!
- version="1.0.1";
+ version="1.0.2";
+ huge_ins = 2000;//TODO check??
}
~Parameter() {
@@ -42,7 +43,6 @@ public:
std::vector<std::string> bam_files;
short min_mq;
- short min_cigar_event;
short report_n_reads;
short corridor;
@@ -62,9 +62,7 @@ public:
int huge_ins;
int max_dist_alns;
- bool realign;
- bool splitthreader_output;
- bool useMD_CIGAR;
+// bool splitthreader_output;
bool debug;
bool genotype;
bool phase;
diff --git a/src/Sniffles.cpp b/src/Sniffles.cpp
index 4c57278..b73d52f 100644
--- a/src/Sniffles.cpp
+++ b/src/Sniffles.cpp
@@ -22,65 +22,40 @@
#include "plane-sweep/PlaneSweep_slim.h"
#include "print/BedpePrinter.h"
-Parameter* Parameter::m_pInstance = NULL;
//cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
-//TODO: Think of ways to make it faster!
-//TODO: WHY? 1 119401196 189700 N <TRA> . PASS IMPRECISE;SVMETHOD=Snifflesv0.0.1;CHR2=15;END=51189238;RNAMES=m141229_044222_00118_c100750472550000001823151707081504_s1_p0/2205/0_8445;m141231_161924_00118_c1007507325500000018231517
-//TODO: 1 119400525 189701 N <TRA> . PASS IMPRECISE;SVMETHOD=Snifflesv0.0.1;CHR2=15;END=51189312;RNAMES=m141229_044222_00118_c100750472550000001823151707081504_s1_p0/2205/0_8445;m141231_161924_00118_c1007507325500000018231517
-//TODO: for read names stored for each event store the number of possible events they support.-> If number==1 do not print them in tmp file.
-//TODO: write comparison script taking bed or a vcf file!
-//TODO: make score threshold only on events on reads and not on split reads!
-// Think of method to filter out strange SV.
-// Think of multiple bam files -> setting genotypes
-// Think about overlapping SV, maybe flag to report if they share the same read -> phasing info?
-// Regular scan through the SV and move those where the end point lies far behind the current pos or reads. Eg. 1MB?
+//TODO:
+// Make hist of position%10. Take highest and if second highest that is at least Xbp away is over Parameter -> Split
+//Upload new version. Take care about version naming!
+
+Parameter* Parameter::m_pInstance = NULL;
void read_parameters(int argc, char *argv[]) {
- std::string lable="Sniffles version ";
- lable+=Parameter::Instance()->version;
- TCLAP::CmdLine cmd("Sniffles version 1.0.0", ' ', Parameter::Instance()->version);
- TCLAP::ValueArg<std::string> arg_bamfile("m", "mapped_reads", "Bam File", true, "", "string");
+ TCLAP::CmdLine cmd("Sniffles version ", ' ', Parameter::Instance()->version);
+ TCLAP::ValueArg<std::string> arg_bamfile("m", "mapped_reads", "Sorted bam File", true, "", "string");
TCLAP::ValueArg<std::string> arg_vcf("v", "vcf", "VCF output file name", false, "", "string");
- //TCLAP::ValueArg<std::string> arg_bede("b", "bede", "Bede output file name", false, "", "string");
TCLAP::ValueArg<std::string> arg_bedpe("b", "bedpe", " bedpe output file name", false, "", "string");
- //TCLAP::ValueArg<std::string> arg_noregions("", "bed", "Ignore SV overlapping with those regions.", false, "", "string");
-// TCLAP::ValueArg<std::string> arg_ref("r", "reference", "Reference fasta sequence. Activates realignment step", false, "", "string");
- //TCLAP::ValueArg<std::string> arg_region("", "regions", "List of regions CHR:start-stop; to check", false, "", "string");
TCLAP::ValueArg<int> arg_support("s", "min_support", "Minimum number of reads that support a SV. Default: 10", false, 10, "int");
TCLAP::ValueArg<int> arg_splits("", "max_num_splits", "Maximum number of splits per read to be still taken into account. Default: 7", false, 7, "int");
- TCLAP::ValueArg<int> arg_dist("d", "max_distance", "Maximum distance to group SV together. Default: 1kb", false, 500, "int");
+ TCLAP::ValueArg<int> arg_dist("d", "max_distance", "Maximum distance to group SV together. Default: 1kb", false, 1000, "int");
TCLAP::ValueArg<int> arg_threads("t", "threads", "Number of threads to use. Default: 3", false, 3, "int");
- //TCLAP::ValueArg<int> arg_corridor("", "corridor", "Maximum size of corridor for realignment. Default: 2000", false, 2000, "int");
TCLAP::ValueArg<int> arg_minlength("l", "min_length", "Minimum length of SV to be reported. Default: 30", false, 30, "int");
TCLAP::ValueArg<int> arg_mq("q", "minmapping_qual", "Minimum Mapping Quality. Default: 20", false, 20, "int");
- TCLAP::ValueArg<int> arg_cigar("c", "min_cigar_event", "Minimum Cigar Event (e.g. Insertion, deletion) to take into account. Default:50 ", false, 50, "int");
- TCLAP::ValueArg<int> arg_numreads("n", "num_reads_report", "Report up to N reads that support the SV. Default: 0", false, 0, "int");
-// TCLAP::ValueArg<int> arg_phase_minreads("", "min_reads_phase", "Minimum reads overlapping two SV to phase them together. Default: 1", false, 1, "int");
- TCLAP::ValueArg<std::string> arg_tmp_file("", "tmp_file", "patht to temporary file otherwise Sniffles will use the current directory.", false, "", "string");
-// TCLAP::SwitchArg arg_realign("", "re-align", "Enables the realignment of reads at predicted SV sites. Leads to more accurate breakpoint predictions.", cmd, false);
- TCLAP::SwitchArg arg_MD_cigar("", "use_MD_Cigar", "Enables Sniffles to use the alignment information to screen for suspicious regions.", cmd, false);
- //TCLAP::SwitchArg arg_Splitthreader("", "Splitthreader_output", "Enables Sniffles to compute also the full coverage required by Splitthreader.", cmd, false);
+ TCLAP::ValueArg<int> arg_numreads("n", "num_reads_report", "Report up to N reads that support the SV in the vcf file. Default: 0", false, 0, "int");
+ TCLAP::ValueArg<std::string> arg_tmp_file("", "tmp_file", "path to temporary file otherwise Sniffles will use the current directory.", false, "", "string");
TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false);
- TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occure on the same reads", cmd, false);
+ TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false);
cmd.add(arg_numreads);
cmd.add(arg_tmp_file);
cmd.add(arg_dist);
- //cmd.add(arg_bede);
-// cmd.add(arg_noregions);
cmd.add(arg_threads);
- cmd.add(arg_cigar);
cmd.add(arg_bedpe);
cmd.add(arg_vcf);
-// cmd.add(arg_ref);
- //cmd.add(arg_corridor);
- //cmd.add(arg_region);
cmd.add(arg_minlength);
- //cmd.add(arg_phase_minreads);
cmd.add(arg_mq);
cmd.add(arg_splits);
cmd.add(arg_support);
@@ -90,35 +65,25 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->debug = true;
Parameter::Instance()->score_treshold = 10;
-
- Parameter::Instance()->read_name = " ";//0076373e-d278-4316-9967-9b4c0b74df57_Basecall_Alignment_template";//21_30705246"; ;//just for debuging reasons!
+ Parameter::Instance()->read_name = " "; //just for debuging reasons!
Parameter::Instance()->bam_files.push_back(arg_bamfile.getValue());
Parameter::Instance()->min_mq = arg_mq.getValue();
Parameter::Instance()->output_vcf = arg_vcf.getValue();
- Parameter::Instance()->min_cigar_event = arg_cigar.getValue();
Parameter::Instance()->report_n_reads = arg_numreads.getValue();
Parameter::Instance()->min_support = arg_support.getValue();
Parameter::Instance()->max_splits = arg_splits.getValue();
Parameter::Instance()->max_dist = arg_dist.getValue();
- //Parameter::Instance()->ref_seq = arg_ref.getValue();
- Parameter::Instance()->splitthreader_output = true; //arg_Splitthreader.getValue();
- Parameter::Instance()->realign = false; //arg_realign.getValue();
- //Parameter::Instance()->corridor = arg_corridor.getValue();
-// Parameter::Instance()->output_bede = arg_bede.getValue();
Parameter::Instance()->min_length = arg_minlength.getValue();
- //Parameter::Instance()->min_reads_phase = arg_phase_minreads.getValue();
- Parameter::Instance()->useMD_CIGAR = arg_MD_cigar.getValue();
Parameter::Instance()->genotype = arg_genotype.getValue();
Parameter::Instance()->phase = arg_cluster.getValue();
Parameter::Instance()->num_threads = arg_threads.getValue();
Parameter::Instance()->output_bedpe = arg_bedpe.getValue();
- //Parameter::Instance()->ignore_regions_bed = arg_noregions.getValue();
- Parameter::Instance()->tmp_file = "test.tmp"; //arg_tmp_file.getValue();
+ Parameter::Instance()->tmp_file = arg_tmp_file.getValue();
Parameter::Instance()->min_grouping_support = 1;
- Parameter::Instance()->huge_ins = 4000;//TODO check??
if (Parameter::Instance()->tmp_file.empty()) {
std::stringstream ss;
+ srand(time(NULL));
//ss<<"."; //TODO: User does not need to see this!
ss << rand();
ss << "_tmp";
@@ -150,9 +115,154 @@ void parse_binary() {
fclose(alt_allel_reads);
}
+double comp_std(std::vector<int> pos, int start) {
+ double count = 0;
+ double std_start = 0;
+
+ for (size_t i = 0; i < pos.size(); i++) {
+ count++;
+ if (pos[i] != -1) {
+ long diff = (start - pos[i]);
+ // std::cout << "DIFF Start: " << diff << std::endl;
+ std_start += std::pow((double) diff, 2.0);
+ }
+ }
+ return std::sqrt(std_start / count);
+}
+
+void test_sort_insert(int pos, std::vector<int> & positions) {
+
+ size_t i = 0;
+ while (i < positions.size() && positions[i] < pos) {
+ i++;
+ }
+ positions.insert(positions.begin() + i, pos);
+
+}
+
+double test_comp_std_quantile(std::vector<int> positions, int position) {
+ double count = 0;
+ std::vector<int> std_start_dists;
+ double std_start = 0;
+
+ for (std::vector<int>::iterator i = positions.begin(); i != positions.end(); i++) {
+
+ long diff = (position - (*i));
+ // std::cout << "DIFF Start: " << diff << std::endl;
+ test_sort_insert(std::pow((double) diff, 2.0), std_start_dists);
+ //std_start += std::pow((double) diff, 2.0);
+
+ }
+
+ count = 0;
+ for (size_t i = 0; i < std_start_dists.size() / 2; i++) {
+ std_start += std_start_dists[i];
+ count++;
+ }
+
+ return std::sqrt(std_start / count);
+
+}
+
+void test_std() {
+ srand(time(NULL));
+ int start = rand() % 100000; /// sqrt(1/12) for ins. Plot TRA std vs. cov/support.
+ std::vector<int> positions;
+ double avg=0;
+ double num=0;
+ for (int t = 0; t < 1000; t++) {
+ for (int border = 100; border < 90001; border = border * 5) {
+ //for (int cov = 1; cov < 800; cov += 10) {
+ int cov = 100;
+ for (size_t i = 0; i < cov; i++) {
+ int pos = (rand() % border) + (start - (border / 2));
+ positions.push_back(pos);
+ }
+ avg+=comp_std(positions, start) / test_comp_std_quantile(positions, start);
+ std::cout << "Cov: " << cov << " border: " << border << " STD: " << comp_std(positions, start) / test_comp_std_quantile(positions, start) << std::endl;
+ positions.clear();
+ num++;
+ //}
+ }
+ }
+ std::cout<<"AVG: "<<avg/num<<std::endl;
+}
+
+void get_rand(int mean, int num, vector<int> & positions, int interval) {
+//std::cout << "sim " << num << std::endl;
+ for (size_t i = 0; i < num; i++) {
+ int pos = (rand() % interval) + (mean - (interval / 2));
+ positions.push_back(pos);
+ }
+}
+#include <stdlib.h>
+std::vector<int> sort_distance(std::vector<int> positions, int mean) {
+ std::vector<int> distances;
+ for (size_t i = 0; i < positions.size(); i++) {
+ int dist = std::abs(mean - positions[i]);
+ size_t j = 0;
+ while (j < distances.size()) {
+ if (std::abs(mean - distances[j]) < dist) {
+ distances.insert(distances.begin() + j, positions[i]);
+ break;
+ }
+ j++;
+ }
+ if (j == distances.size()) {
+ distances.push_back(positions[i]);
+ }
+ }
+ return distances;
+}
+void test_slimming() {
+ double fract = 0.2;
+ srand(time(NULL));
+ int mean = rand() % 100000; /// sqrt(1/12) for ins. Plot TRA std vs. cov/support.
+ int intervall = 1000;
+
+ std::vector<std::vector<double> > stds;
+ int key = 0;
+ int cov = 100;
+ for (double fract = 0.1; fract < 1; fract += 0.1) {
+
+ //std::cout<<fract<<std::endl;
+ std::vector<int> positions;
+ get_rand(mean, round(cov * fract), positions, intervall); //random process
+ get_rand(mean, round(cov * (1 - fract)), positions, 10); //focused calls
+ // std::cout << "Cov: " << cov << " border: " << intervall << " STD: " << comp_std(positions, mean) << std::endl;
+ std::vector<int> dists;
+ dists = sort_distance(positions, mean);
+
+ /* for (size_t i = 0; i < dists.size(); i++) {
+ std::cout << abs(mean - dists[i]) << std::endl;
+ }
+ */
+ std::vector<double> std_tmp;
+ for (size_t i = 0; i < dists.size(); i++) {
+ std::vector<int> tmp;
+ tmp.assign(dists.rbegin(), dists.rend() - i);
+ double std = comp_std(tmp, mean);
+ //std::cout << "Points: " << tmp.size() << " STD: " << std << std::endl;
+ std_tmp.push_back(std);
+ }
+ stds.push_back(std_tmp);
+ }
+
+ for (size_t i = 0; i < stds.size(); i++) {
+ for (size_t j = 0; j < stds[i].size(); j++) {
+ std::cout << stds[i][j] << "\t";
+ }
+ std::cout << std::endl;
+ }
+}
int main(int argc, char *argv[]) {
try {
+ //test_slimming();
+ // test_std();
+ // exit(0);
+
+
//init parameter and reads user defined parameter from command line.
read_parameters(argc, argv);
@@ -160,6 +270,10 @@ int main(int argc, char *argv[]) {
omp_set_dynamic(0);
omp_set_num_threads(Parameter::Instance()->num_threads);
+ if ((!Parameter::Instance()->output_vcf.empty()) && (!Parameter::Instance()->output_bedpe.empty())) {
+ std::cerr << "Please select only vcf OR bedpe output format!" << std::endl;
+ exit(1);
+ }
//init printer:
IPrinter * printer;
if (!Parameter::Instance()->ref_seq.empty()) {
@@ -174,7 +288,6 @@ int main(int argc, char *argv[]) {
}
printer->init();
-
detect_breakpoints(Parameter::Instance()->bam_files[0], printer); //we could write out all read names for each sVs
printer->close_file();
@@ -192,15 +305,6 @@ int main(int argc, char *argv[]) {
go->update_SVs();
}
- //realignment: Using NGM???
- if (!Parameter::Instance()->ref_seq.empty()) {
- std::cout << "Realignment step activated:" << std::endl;
- //1. parse in output file(s)
- //2. extract reads from bam files (read groups?) //shellscript: picard/samtools??
- //3. realign
- //4. call Sniffles
- }
-
cout << "Cleaning tmp files" << endl;
string del = "rm ";
del += Parameter::Instance()->tmp_file;
@@ -208,23 +312,7 @@ int main(int argc, char *argv[]) {
} catch (TCLAP::ArgException &e) // catch any exceptions
{
- std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl;
+ std::cerr << "Sniffles error: " << e.error() << " for arg " << e.argId() << std::endl;
}
return 0;
}
-/*
- * Input:
- * Regions to check
- *
- * Parameters
- *
- *
- */
-/*
- * 1. Detect strange regions
- * Using MD, Cigar, Split reads
- *
- * 2. Extract reads from region (?) The main read holds the whole sequence for BWA-MEM
- *
- * 3. Realign regions (?)
- */
diff --git a/src/Version.h b/src/Version.h
index 09ef3c8..d389390 100644
--- a/src/Version.h
+++ b/src/Version.h
@@ -1,9 +1,9 @@
#ifndef VERSION_H
#define VERSION_H
-#define VERSION_MAJOR ""
-#define VERSION_MINOR ""
-#define VERSION_BUILD ""
+#define VERSION_MAJOR "1"
+#define VERSION_MINOR "0"
+#define VERSION_BUILD "2"
#endif // VERSION_H
diff --git a/src/cluster/Cluster_SVs.cpp b/src/cluster/Cluster_SVs.cpp
index 45700f9..1ea3025 100644
--- a/src/cluster/Cluster_SVs.cpp
+++ b/src/cluster/Cluster_SVs.cpp
@@ -20,14 +20,7 @@ std::map<long, std::vector<int> > Cluster_SVS::parse_names_ids(int & max_ID) {
name_str tmp;
size_t nbytes = fread(&tmp, sizeof(struct name_str), 1, alt_allel_reads);
while (nbytes != 0) {
- max_ID = std::max(max_ID, tmp.svs_id);
- //if(strcmp("22_19990256",tmp.read_name.c_str())==0){
- // std::cout<<"Cluster: "<<tmp.svs_id<<std::endl;
- //}
- /*if (tmp.svs_id == 34 || tmp.svs_id == 35) {
- std::cout << "Cluster: " << tmp.svs_id << " " << tmp.read_name << std::endl;
- }*/
-
+ max_ID = std::max(max_ID, tmp.svs_id); //needs to be a long as we need to know the size prior to storing!
names[tmp.read_name].push_back(tmp.svs_id);
nbytes = fread(&tmp, sizeof(struct name_str), 1, alt_allel_reads);
}
@@ -68,7 +61,7 @@ void Cluster_SVS::update_SVs(std::vector<combine_str> & ids) {
if (count == col) { //if colum of id:
if (buffer[i - 1] == '\t') {
int id = atoi(&buffer[i]);
- fprintf(file, "%i", find_id(id, ids));
+ fprintf(file, "%s", find_id(id, ids).c_str());
fprintf(file, "%c", '\t');
}
} else {
@@ -93,28 +86,39 @@ void Cluster_SVS::update_SVs(std::vector<combine_str> & ids) {
move += filename;
system(move.c_str());
}
-void Cluster_SVS::add_id(int curr_id, int new_id, std::vector<combine_str> & ids) {
- for (size_t i = 0; i < ids.size(); i++) {
+void Cluster_SVS::add_id(int curr_id, int new_id, std::vector<combine_str> & ids,int subkey) {
+
+ for (size_t i = 0; i < ids.size(); i++) {//check if already in the new array
if (ids[i].curr_id == curr_id && ids[i].alt_id == new_id) {
ids[i].support++;
+ ids[i].hit=subkey;
return;
}
}
+
+ //make new entry:
combine_str tmp;
tmp.curr_id = curr_id;
- tmp.alt_id = new_id;
+ tmp.alt_id = new_id; //smallest ID of SVs
tmp.support = 1;
+ tmp.hit=subkey;
ids.push_back(tmp);
}
-int Cluster_SVS::find_id(int curr_id, std::vector<combine_str> & ids) {
+std::string Cluster_SVS::find_id(int curr_id, std::vector<combine_str> & ids) {
+ std::stringstream ss ;
for (size_t i = 0; i < ids.size(); i++) {
if (ids[i].support > Parameter::Instance()->min_grouping_support) {
if (ids[i].curr_id == curr_id) {
- return ids[i].alt_id;
+
+ ss<<ids[i].alt_id;
+ ss<<'_';
+ ss<<ids[i].hit;
+ return ss.str();
}
}
}
- return curr_id;
+ ss<<curr_id;
+ return ss.str();
}
void Cluster_SVS::update_SVs() {
//1: read in names + IDs -> store in map!
@@ -123,27 +127,28 @@ void Cluster_SVS::update_SVs() {
//TODO: restructure!
//id=svs_id;
- std::map<long, std::vector<int> > names = parse_names_ids(max_ID);
+ std::map<long, std::vector<int> > names = parse_names_ids(max_ID); //key = read_id values: SVs id's
//2: make array with ID as entry and value is the smalles ID in the colum of all storred readnames.
std::vector<combine_str> ids;
for (std::map<long, std::vector<int> >::iterator i = names.begin(); i != names.end(); i++) {
if ((*i).second.size() > 1) {
int min_id = max_ID + 1;
- for (size_t j = 0; j < (*i).second.size(); j++) {
+ for (size_t j = 0; j < (*i).second.size(); j++) { // get the smallest ID of SVs associated with the read id.
min_id = std::min(min_id, (*i).second[j]);
}
- for (size_t j = 0; j < (*i).second.size(); j++) {
- if ((*i).second[j] != min_id) {
- add_id((*i).second[j], min_id, ids);
- }
+ //min_id is now the smallest SVs id that this read is associated with.
+ int subkey=0;
+ for (size_t j = 0; j < (*i).second.size(); j++) { // update the other SVs IDs
+ //if ((*i).second[j] != min_id) {
+ add_id((*i).second[j], min_id, ids,subkey);
+ subkey++;
+ //}
}
}
}
names.clear();
-/* for (size_t i = 0; i < ids.size(); i++) {
- std::cout << ids[i].curr_id << " " << ids[i].alt_id << " " << ids[i].support << std::endl;
- }*/
+
//3: Update the IDS in the VCF/Bedpe files.
update_SVs(ids);
}
diff --git a/src/cluster/Cluster_SVs.h b/src/cluster/Cluster_SVs.h
index 8c0fc58..312dc96 100644
--- a/src/cluster/Cluster_SVs.h
+++ b/src/cluster/Cluster_SVs.h
@@ -12,23 +12,25 @@
#include <string.h>
#include <vector>
#include <map>
+#include <sstream>
#include "../Paramer.h"
struct __attribute__((packed)) name_str{
- long read_name;
+ long read_name; //needs to be a number to store in binary! (bit reservation!)
int svs_id;
};
struct combine_str{
int curr_id;
int alt_id;
int support;
+ short hit;
};
class Cluster_SVS{
private:
std::map<long, std::vector<int> > parse_names_ids(int & max_ID) ;
void update_SVs( std::vector<combine_str> & ids); //just because the pass is more efficient
- void add_id(int curr_id,int new_id, std::vector<combine_str> & ids);
- int find_id(int curr_id, std::vector<combine_str> & ids);
+ void add_id(int curr_id,int new_id, std::vector<combine_str> & ids,int subkey);
+ std::string find_id(int curr_id, std::vector<combine_str> & ids);
public:
Cluster_SVS(){
}
diff --git a/src/print/BedpePrinter.cpp b/src/print/BedpePrinter.cpp
index 746a928..0b4821d 100644
--- a/src/print/BedpePrinter.cpp
+++ b/src/print/BedpePrinter.cpp
@@ -8,7 +8,7 @@
#include "BedpePrinter.h"
void BedpePrinter::print_header() {
- fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\tbest_chrom\tbest_pos\tbest_chrom2\tbest_pos2\tpredicted_length\n");
+ fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\n");
}
void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) {
diff --git a/src/print/IPrinter.cpp b/src/print/IPrinter.cpp
index 8352356..8b72316 100644
--- a/src/print/IPrinter.cpp
+++ b/src/print/IPrinter.cpp
@@ -7,6 +7,36 @@
#include "IPrinter.h"
+bool IPrinter::to_print(Breakpoint * &SV, double & std_start, double & std_stop) {
+
+ std_start = 0;
+ std_stop = 0;
+ //comp_std(SV, std_start, std_stop);
+ comp_std_quantile(SV, std_start, std_stop);
+ bool to_print = true;
+
+ if(((SV->get_SVtype() & INS) && SV->get_coordinates().stop.most_support- SV->get_coordinates().start.most_support == Parameter::Instance()->huge_ins) && (SV->get_types().is_ALN)){
+ return (!SV->get_types().is_SR && (std_start<5 && std_stop<5));
+ }
+
+ if ((SV->get_SVtype() & INS) || (SV->get_SVtype() & DEL)) { //for insertions + deletions:
+ double dist = (double) (SV->get_coordinates().stop.most_support - SV->get_coordinates().start.most_support);
+ //dist = (double)std::min(((int)dist * 4), Parameter::Instance()->max_dist);
+ dist = dist * 4.0 * (uniform_variance / 2); //because we test against corrected value!
+ //std::cout<<"DIST: "<<(SV->get_coordinates().stop.most_support - SV->get_coordinates().start.most_support)<<" "<<dist<<" STD: "<<std_start<<" "<<std_stop<<std::endl;
+ return ((std_start < dist && std_stop < dist)); //0.2886751
+ }
+
+ //TODO test!
+ if (SV->get_SVtype() & NEST) {
+ return true;
+ }
+ double max_allowed=4*Parameter::Instance()->max_dist*(uniform_variance / 2);
+
+ return (std_start < max_allowed && std_stop < max_allowed);
+
+}
+
std::string IPrinter::get_chr(long pos, RefVector ref) {
// std::cout << "pos: " << pos << std::endl;
size_t id = 0;
@@ -19,7 +49,7 @@ std::string IPrinter::get_chr(long pos, RefVector ref) {
return ref[id - 1].RefName;
}
-void IPrinter::store_readnames(std::vector<int> names, int id) {
+void IPrinter::store_readnames(std::vector<long> names, int id) {
name_str tmp;
tmp.svs_id = id; //stays the same
for (size_t i = 0; i < names.size(); i++) {
@@ -67,10 +97,15 @@ std::string IPrinter::get_type(char type) {
if (!tmp.empty()) {
tmp += '/';
}
- tmp += "BND";
- //tmp += "TRA";
+ //tmp += "BND";
+ tmp += "TRA";
+ }
+ if (type & NEST) {
+ if (!tmp.empty()) {
+ tmp += '/';
+ }
+ tmp += "INVDUP";
}
-
return tmp; // should not occur!
}
// Get current date/time, format is YYYY-MM-DD.HH:mm:ss
@@ -84,3 +119,111 @@ const std::string IPrinter::currentDateTime() {
strftime(buf, sizeof(buf), "%Y%m%d", &tstruct);
return buf;
}
+
+void IPrinter::sort_insert(int pos, std::vector<int> & positions) {
+
+ size_t i = 0;
+ while (i < positions.size() && positions[i] < pos) {
+ i++;
+ }
+ positions.insert(positions.begin() + i, pos);
+
+}
+void IPrinter::comp_std_med(Breakpoint * &SV, double & std_start, double & std_stop) {
+ std::vector<int> std_start_dists;
+ std::vector<int> std_stop_dists;
+
+ std::map<std::string, read_str> support = SV->get_coordinates().support;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ if ((*i).second.SV & SV->get_SVtype()) {
+ if ((*i).second.coordinates.first != -1) {
+ long diff = (SV->get_coordinates().start.most_support - (*i).second.coordinates.first);
+ // std::cout << "DIFF Start: " << diff << std::endl;
+ sort_insert(std::pow((double) diff, 2.0), std_start_dists);
+ //std_start += std::pow((double) diff, 2.0);
+ }
+ if ((*i).second.coordinates.second != -1) {
+ long diff = (SV->get_coordinates().stop.most_support - (*i).second.coordinates.second);
+ // std::cout << "DIFF Stop: " << diff << std::endl;
+ sort_insert(std::pow((double) diff, 2.0), std_stop_dists);
+ //std_stop += std::pow((double) diff, 2.0);
+ }
+ }
+ }
+ int median = std_stop_dists.size() / 2;
+ std_start = std::sqrt(std_start_dists[median]);
+ std_stop = std::sqrt(std_stop_dists[median]);
+}
+
+void IPrinter::comp_std_quantile(Breakpoint * &SV, double & std_start, double & std_stop) {
+ double count = 0;
+ std::vector<int> std_start_dists;
+ std::vector<int> std_stop_dists;
+
+ std::stringstream ss;
+
+ std::map<std::string, read_str> support = SV->get_coordinates().support;
+
+ fprintf(distances, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
+ fprintf(distances, "%c", '\t');
+ fprintf(distances, "%i", SV->get_coordinates().stop.most_support-SV->get_coordinates().start.most_support);
+ fprintf(distances, "%c", '\t');
+ fprintf(distances, "%i", support.size());
+ fprintf(distances, "%c", '\t');
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ if ((*i).second.SV & SV->get_SVtype()) {
+ if ((*i).second.coordinates.first != -1) {
+ long diff = (SV->get_coordinates().start.most_support - (*i).second.coordinates.first);
+ ss << '\t';
+ ss << diff;
+ sort_insert(std::pow((double) diff, 2.0), std_start_dists);
+ }
+ if ((*i).second.coordinates.second != -1) {
+ long diff = (SV->get_coordinates().stop.most_support - (*i).second.coordinates.second);
+ ss << '\t';
+ ss << diff;
+
+ sort_insert(std::pow((double) diff, 2.0), std_stop_dists);
+ }
+ }
+ }
+
+ count = 0;
+ for (int i = 0; i < std::max((int)std_stop_dists.size() / 2,8); i++) {
+ std_start += std_start_dists[i];
+ std_stop += std_stop_dists[i];
+ count++;
+ }
+
+ std_start = std::sqrt(std_start / count);
+ std_stop = std::sqrt(std_stop / count);
+ fprintf(distances, "%f", std_start);
+ fprintf(distances, "%s",ss.str().c_str());
+ fprintf(distances, "%c", '\n');
+
+
+}
+void IPrinter::comp_std(Breakpoint * &SV, double & std_start, double & std_stop) {
+ double count = 0;
+ std_start = 0;
+ std_stop = 0;
+ std::map<std::string, read_str> support = SV->get_coordinates().support;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ if ((*i).second.SV & SV->get_SVtype()) {
+ count++;
+ if ((*i).second.coordinates.first != -1) {
+ long diff = (SV->get_coordinates().start.most_support - (*i).second.coordinates.first);
+ // std::cout << "DIFF Start: " << diff << std::endl;
+ std_start += std::pow((double) diff, 2.0);
+ }
+ if ((*i).second.coordinates.second != -1) {
+ long diff = (SV->get_coordinates().stop.most_support - (*i).second.coordinates.second);
+ // std::cout << "DIFF Stop: " << diff << std::endl;
+ std_stop += std::pow((double) diff, 2.0);
+ }
+
+ }
+ }
+ std_start = std::sqrt(std_start / count);
+ std_stop = std::sqrt(std_stop / count);
+}
diff --git a/src/print/IPrinter.h b/src/print/IPrinter.h
index ed929e0..d6ce613 100644
--- a/src/print/IPrinter.h
+++ b/src/print/IPrinter.h
@@ -15,10 +15,15 @@
#include "../Ignore_Regions.h"
#include "../sub/Breakpoint.h"
#include "../cluster/Cluster_SVs.h"
+#include <math.h>
+
+double const uniform_variance = 0.2886751; //sqrt(1/12) see variance of uniform distribution -> std
+
class IPrinter {
protected:
FILE *file;
+ FILE *distances;
FILE *tmp_file;
uint id;
RefVector ref;
@@ -31,8 +36,9 @@ protected:
long calc_pos(long pos, RefVector ref, std::string &chr);
std::string get_chr(long pos, RefVector ref);
std::string get_type(char type);
-
+ void sort_insert(int pos,std::vector<int> & positons);
public:
+
IPrinter() {
id = 0;
root = NULL;
@@ -47,6 +53,7 @@ public:
print_body(SV, ref);
}
void init() {
+ distances=fopen("distances.txt", "w");
if(!Parameter::Instance()->output_vcf.empty()){
file = fopen(Parameter::Instance()->output_vcf.c_str(), "w");
}else if(!Parameter::Instance()->output_bedpe.empty()){
@@ -64,10 +71,14 @@ public:
tmp_name_file+="Names";
tmp_file=fopen(tmp_name_file.c_str(), "wb");
}
- void store_readnames(std::vector<int> names, int id);
+ bool to_print(Breakpoint * &SV,double & std_start,double &std_stop);
+ void store_readnames(std::vector<long> names, int id);
void close_file(){
fclose(this->file);
}
+ void comp_std(Breakpoint * &SV,double & std_start, double & std_stop);
+ void comp_std_med(Breakpoint * &SV, double & std_start, double & std_stop);
+ void comp_std_quantile(Breakpoint * &SV, double & std_start, double & std_stop);
const std::string currentDateTime();
};
diff --git a/src/print/VCFPrinter.cpp b/src/print/VCFPrinter.cpp
index fafe8fe..1515355 100644
--- a/src/print/VCFPrinter.cpp
+++ b/src/print/VCFPrinter.cpp
@@ -18,6 +18,7 @@ void VCFPrinter::print_header() {
fprintf(file, "%s", "##ALT=<ID=DEL,Description=\"Deletion\">\n");
fprintf(file, "%s", "##ALT=<ID=DUP,Description=\"Duplication\">\n");
fprintf(file, "%s", "##ALT=<ID=INV,Description=\"Inversion\">\n");
+ fprintf(file, "%s", "##ALT=<ID=INVDUP,Description=\"InvertedDUP with unknown boundaries\">\n");
fprintf(file, "%s", "##ALT=<ID=TRA,Description=\"Translocation\">\n");
fprintf(file, "%s", "##ALT=<ID=INS,Description=\"Insertion\">\n");
fprintf(file, "%s", "##INFO=<ID=CHR2,Number=1,Type=String,Description=\"Chromosome for END coordinate in case of a translocation\">\n");
@@ -43,86 +44,132 @@ void VCFPrinter::print_header() {
void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) {
//temp. store read names supporting this SVs to later group the SVs together.
+ //double std_start = 0;
+ //double std_stop = 0;
+ // double std_medstart = 0;
+ //double std_medstop = 0;
+ double std_quant_start = 0;
+ double std_quant_stop = 0;
+ //comp_std(SV, std_start, std_stop);
+ //comp_std_med(SV, std_medstart, std_medstop);
- if (Parameter::Instance()->phase) {
- store_readnames(SV->get_read_ids(), id);
- }
- std::string chr;
- int pos = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
- fprintf(file, "%s", chr.c_str());
- fprintf(file, "%c", '\t');
- fprintf(file, "%i", pos);
- fprintf(file, "%c", '\t');
- fprintf(file, "%i", id);
- id++;
- pos = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr);
- std::string strands = SV->get_strand(1);
- fprintf(file, "%s", "\tN\t");
- if (SV->get_SVtype() & TRA) { //TODO check for INV!
- //N[22:36765684[ +-
- //]21:10540232]N -+
- if (strands[0] == '+') {
- fprintf(file, "%s", "N[");
- fprintf(file, "%s", chr.c_str());
- fprintf(file, "%c", ':');
- fprintf(file, "%i", pos);
- } else {
- fprintf(file, "%c", ']');
- fprintf(file, "%s", chr.c_str());
- fprintf(file, "%c", ':');
- fprintf(file, "%i", pos);
- }
- if (strands[1] == '+') {
- fprintf(file, "%c", '[');
- } else {
- fprintf(file, "%c", ']');
- }
- if (strands[0] == '-') {
- fprintf(file, "%c", 'N');
+ /* if ((SV->get_SVtype() & NEST) || ((std_start < SV->get_length() * 2 && std_stop < SV->get_length() * 2) && (std_start<400 && std_stop<400))) {
+ */
+ if (to_print(SV,std_quant_start,std_quant_stop)) {
+ if (Parameter::Instance()->phase) {
+ store_readnames(SV->get_read_ids(), id);
}
- fprintf(file, "%s", "\t.\tPASS\tIMPRECISE;SVMETHOD=Snifflesv");
- fprintf(file, "%s", Parameter::Instance()->version.c_str());
+ std::string chr;
+ int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", start);
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", id);
+ id++;
+
+ int end = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr);
+ std::string strands = SV->get_strand(1);
+ fprintf(file, "%s", "\tN\t");
+ /*if (SV->get_SVtype() & TRA) { //TODO check for INV!
+ //N[22:36765684[ +-
+ //]21:10540232]N -+
+ if (strands[0] == '+') {
+ fprintf(file, "%s", "N[");
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", ':');
+ fprintf(file, "%i", pos);
+
+ } else {
+ fprintf(file, "%c", ']');
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", ':');
+ fprintf(file, "%i", pos);
+ }
+ if (strands[1] == '+') {
+ fprintf(file, "%c", '[');
+ } else {
+ fprintf(file, "%c", ']');
+ }
+ if (strands[0] == '-') {
+ fprintf(file, "%c", 'N');
+ }
+ fprintf(file, "%s", "\t.\tPASS\tIMPRECISE;SVMETHOD=Snifflesv");
+ fprintf(file, "%s", Parameter::Instance()->version.c_str());
+
+ } else {*/
- } else {
fprintf(file, "%c", '<');
fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
fprintf(file, "%c", '>');
- fprintf(file, "%s", "\t.\tPASS\tIMPRECISE;SVMETHOD=Snifflesv");
+
+ fprintf(file, "%s", "\t.\tPASS\t");
+ if(std_quant_start<10 && std_quant_stop<10){
+ fprintf(file, "%s", "PRECISE");
+ }else{
+ fprintf(file, "%s", "IMPRECISE");
+ }
+
+ fprintf(file, "%s", ";SVMETHOD=Snifflesv");
fprintf(file, "%s", Parameter::Instance()->version.c_str());
fprintf(file, "%s", ";CHR2=");
fprintf(file, "%s", chr.c_str());
fprintf(file, "%s", ";END=");
if (SV->get_SVtype() & INS) {
- fprintf(file, "%i", pos + SV->get_length());
+ fprintf(file, "%i", std::max((int) (end - SV->get_length()), start));
+ // } else if (SV->get_SVtype() & NEST) {
+ // fprintf(file, "%i", start);
} else {
- fprintf(file, "%i", pos);
+ fprintf(file, "%i", end);
}
- }
- fprintf(file, "%s", ";SVTYPE=");
- fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
- if (Parameter::Instance()->report_n_reads > 0) {
- std::vector<std::string> names = SV->get_read_names(Parameter::Instance()->report_n_reads);
- fprintf(file, "%s", ";RNAMES=");
- for (size_t i = 0; i < names.size() && i < Parameter::Instance()->report_n_reads; i++) {
- fprintf(file, "%s", names[i].c_str());
+
+ /*fprintf(file, "%s", ";STD_start=");
+ fprintf(file, "%f", std_start);
+ fprintf(file, "%s", ";STD_med_start=");
+ fprintf(file, "%f", std_medstart);*/
+ fprintf(file, "%s", ";STD_quant_start=");
+ fprintf(file, "%f", std_quant_start);
+ /*fprintf(file, "%s", ";STD_end=");
+ fprintf(file, "%f", std_stop);
+ fprintf(file, "%s", ";STD_med_stop=");
+ fprintf(file, "%f", std_medstop);*/
+ fprintf(file, "%s", ";STD_quant_stop=");
+ fprintf(file, "%f", std_quant_stop);
+ // }
+ fprintf(file, "%s", ";SVTYPE=");
+ fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
+ if (Parameter::Instance()->report_n_reads > 0) {
+ std::vector<std::string> names = SV->get_read_names(Parameter::Instance()->report_n_reads);
+ fprintf(file, "%s", ";RNAMES=");
+ for (size_t i = 0; i < names.size() && i < Parameter::Instance()->report_n_reads; i++) {
+ fprintf(file, "%s", names[i].c_str());
+ fprintf(file, "%s", ";");
+ }
+ } else {
fprintf(file, "%s", ";");
}
- } else {
- fprintf(file, "%s", ";");
+ fprintf(file, "%s", "SUPTYPE=");
+ fprintf(file, "%s", SV->get_supporting_types().c_str());
+ fprintf(file, "%s", ";SVLEN=");
+ // if (SV->get_SVtype() & NEST) {
+ // fprintf(file, "%i", -1);
+ // } else {
+ if(((SV->get_SVtype() & INS) && SV->get_length()== Parameter::Instance()->huge_ins) && !SV->get_types().is_SR ){
+ fprintf(file, "%s", "NA");
+ }else{
+ fprintf(file, "%i", SV->get_length());
+ }
+ // }
+ fprintf(file, "%s", ";STRANDS=");
+ fprintf(file, "%s", strands.c_str());
+ fprintf(file, "%s", ";RE=");
+ fprintf(file, "%i", SV->get_support());
+ fprintf(file, "%s", "\tGT:DR:DV\t./.:.:");
+ fprintf(file, "%i", SV->get_support());
+ fprintf(file, "%c", '\n');
}
- fprintf(file, "%s", "SUPTYPE=");
- fprintf(file, "%s", SV->get_supporting_types().c_str());
- fprintf(file, "%s", ";SVLEN=");
- fprintf(file, "%i", SV->get_length());
- fprintf(file, "%s", ";STRANDS=");
- fprintf(file, "%s", strands.c_str());
- fprintf(file, "%s", ";RE=");
- fprintf(file, "%i", SV->get_support());
- fprintf(file, "%s", "\tGT:DR:DV\t./.:.:");
- fprintf(file, "%i", SV->get_support());
- fprintf(file, "%c", '\n');
}
}
diff --git a/src/sub/Breakpoint.cpp b/src/sub/Breakpoint.cpp
index 50cec3a..416cca4 100644
--- a/src/sub/Breakpoint.cpp
+++ b/src/sub/Breakpoint.cpp
@@ -15,26 +15,28 @@
#include "Breakpoint.h"
///////////////////////////////// MERGING////////////////////////////////////////////
-std::string print_type(char SV1){
- std::string type="";
- if((SV1 & INS)){
- type+= "INS";
+std::string print_type(char SV1) {
+ std::string type = "";
+ if ((SV1 & INS)) {
+ type += "INS";
}
-
- if((SV1 & TRA)){
- type+= "TRA";
+ if ((SV1 & TRA)) {
+ type += "TRA";
}
-
- if((SV1 & INV)){
- type+= "INV";
+ if ((SV1 & INV)) {
+ type += "INV";
}
-
- if((SV1 & DEL)){
- type+= "DEL";
+ if ((SV1 & DEL)) {
+ type += "DEL";
}
-
- if((SV1 & DUP)){
- type+= "DUP";
+ if ((SV1 & DUP)) {
+ type += "DUP";
+ }
+ if ((SV1 & NEST)) {
+ type += "INVDUP";
+ }
+ if ((SV1 & NEST)) {
+ type += "INS_open";
}
return type;
}
@@ -53,136 +55,164 @@ bool Breakpoint::check_SVtype(Breakpoint * break1, Breakpoint * break2) { //todo
return true;
} else if ((SV1 & DEL) && (SV2 & DEL)) {
return true;
- } //else if (((SV1 & DUP) && (SV2 & INS)) || ((SV1 & INS) && (SV2 & DUP))) {
+ } else if (((SV1 & NEST) && (SV2 & NEST)) || (((SV1 & NEST) && (SV2 & INV)) || ((SV1 & INV) && (SV2 & NEST)))) {
+ return true;
+ }
+ //else if (((SV1 & DUP) && (SV2 & INS)) || ((SV1 & INS) && (SV2 & DUP))) {
// return true;
//}
//std::cout<<"S1: "<<print_type(SV1)<<" S2 "<<print_type(SV2)<<" "<<(*break2->get_coordinates().support.begin()).second.type<<std::endl;
- return false;
+ return (SV1 == SV2);
}
bool Breakpoint::is_same_strand(Breakpoint * tmp) {
//todo check for same SVtype? except for INV+DEL
if (check_SVtype(tmp, this)) {
- if ((*tmp->get_coordinates().support.begin()).second.SV & TRA) { //only for tra since we get otherwise a problem with the cigar events
- return ((*tmp->get_coordinates().support.begin()).second.strand.first == (*this->get_coordinates().support.begin()).second.strand.first && (*tmp->get_coordinates().support.begin()).second.strand.second == (*this->get_coordinates().support.begin()).second.strand.second);
- }
+
+ //if ((*tmp->get_coordinates().support.begin()).second.SV & TRA) { //only for tra since we get otherwise a problem with the cigar events
+ // //std::string readname= (*tmp->get_coordinates().support.begin()).first;
+ // return ((*tmp->get_coordinates().support.begin()).second.strand.first == (*this->get_coordinates().support.begin()).second.strand.first && (*tmp->get_coordinates().support.begin()).second.strand.second == (*this->get_coordinates().support.begin()).second.strand.second);
+ //}
return true;
}
return false;
}
int get_dist(Breakpoint * tmp) {
-
position_str pos = tmp->get_coordinates();
+
//return Parameter::Instance()->max_dist;
- if ((*tmp->get_coordinates().support.begin()).second.SV & TRA || (*tmp->get_coordinates().support.begin()).second.SV & INS) {
+
+ if ((*tmp->get_coordinates().support.begin()).second.SV & TRA) {
return Parameter::Instance()->max_dist; //TODO: change!
- } else {
- int max_val = Parameter::Instance()->max_dist; // * 0.5; //TOOD maybe 0.5?
- return std::max(std::min((int) (pos.stop.max_pos - pos.start.min_pos) / 80, Parameter::Instance()->max_dist), max_val); //20% of the lenght of the SV
}
+ long dist = (pos.stop.max_pos - pos.start.min_pos);
+
+ if ((*pos.support.begin()).second.length != 1) {
+ dist = (*pos.support.begin()).second.length;
+ }
+
+ /* if(dist<10){
+ std::cout<<"DIST <10 ! "<<dist <<std::endl;
+ }*/
+ dist = std::max((long) Parameter::Instance()->min_length * 2, dist);
+
+ /*if(dist <10){//TODO
+ dist=(long)Parameter::Instance()->max_dist;
+ std::cout<<"LEN SMALLER! "<<(*pos.support.begin()).first<<" "<<Parameter::Instance()->max_dist<<" "<<(pos.stop.max_pos - pos.start.min_pos)<< std::endl;
+ }*/
+ return std::min((int) (dist * 4), Parameter::Instance()->max_dist);
}
long Breakpoint::overlap(Breakpoint * tmp) {
bool flag = false;
-
- int max_dist = Parameter::Instance()->max_dist; // get_dist(tmp);
+//flag = ((*tmp->get_coordinates().support.begin()).second.SV & DEL);
+ int max_dist = std::min(get_dist(tmp), get_dist(this)); // Parameter::Instance()->max_dist
if (flag) {
- std::cout << "\t Overlap: " << max_dist << " start: " << abs(tmp->get_coordinates().start.min_pos - positions.stop.min_pos) << " stop :" << abs(tmp->get_coordinates().stop.max_pos - positions.start.max_pos);
- if(is_same_strand(tmp) ){
- std::cout<<"same strand";
+ std::cout << "\t Overlap: " << max_dist << " start: " << tmp->get_coordinates().start.min_pos << " " << positions.start.min_pos << " stop :" << tmp->get_coordinates().stop.max_pos << " " << positions.stop.max_pos;
+ if ((*positions.support.begin()).second.SV & DEL) {
+ std::cout << " Is DEL";
+ } else if ((*positions.support.begin()).second.SV & INS) {
+ std::cout << " Is Ins ";
+ } else if ((*positions.support.begin()).second.SV & DUP) {
+ std::cout << " Is Dup ";
+ } else if ((*positions.support.begin()).second.SV & INV) {
+ std::cout << " Is Inv ";
}
+ std::cout << " Support: " << positions.support.size();
+ std::cout << std::endl;
+
}
- //check type. ALN could be part of the SPlit read event! Not merge two split reads!
+//merging two robust calls:
+
if (is_same_strand(tmp) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < max_dist && abs(tmp->get_coordinates().stop.max_pos - positions.stop.max_pos) < max_dist)) {
if (flag) {
- std::cout << "\t hit!" << std::endl;
-
+ cout << "\tHIT" << endl;
}
return 0;
}
- //TODO:: if one of the two SVs is based on ALN it might be good to join them: to merge noisy flanking regions
- if (((tmp->get_types().is_ALN || this->get_types().is_ALN) && !(tmp->get_types().is_ALN && this->get_types().is_ALN)) && (abs(tmp->get_coordinates().start.min_pos - positions.stop.min_pos) < max_dist / 2 || abs(tmp->get_coordinates().stop.max_pos - positions.start.max_pos) < max_dist / 2)) { //TODO maybe add SV type check!
- if (flag) {
- std::cout << "\t hit!" << std::endl;
- }
+ if ((is_NEST(tmp, this) && is_same_strand(tmp)) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < Parameter::Instance()->max_dist || abs(tmp->get_coordinates().start.min_pos - positions.stop.min_pos) < Parameter::Instance()->max_dist)) {
return 0;
}
- if (flag) {
- std::cout << "no hit? " << std::endl;
- }
+//extend Split read by noisy region: //not longer needed??
+ /* if (((tmp->get_types().is_Noise || this->get_types().is_Noise) && !(tmp->get_types().is_Noise && this->get_types().is_Noise))
+ && (abs(tmp->get_coordinates().start.min_pos - positions.stop.min_pos) < max_dist / 2 || abs(tmp->get_coordinates().stop.max_pos - positions.start.max_pos) < max_dist / 2)) { //TODO maybe add SV type check!
+ if (flag) {
+ cout << "\tHIT Noise" << endl;
+ }
+ return 0;
+ }*/
//as abstraction lets try the start+stop coordinate!
long diff = (tmp->get_coordinates().start.min_pos - positions.start.min_pos);
- if (abs(diff) < max_dist) {
- return (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos);
+//if (abs(diff) < max_dist) {
+//return (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos);
+//}
+ if (diff == 0) {
+ return 1;
}
return diff; // + (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos);
}
-void Breakpoint::add_read(Breakpoint * point) {
- if (point != NULL) {
- if ((*point->get_coordinates().support.begin()).second.type == 0) {
- this->type.is_ALN = true;
- }
- if ((*point->get_coordinates().support.begin()).second.type == 1) {
- this->type.is_SR = true;
- }
-
- if ((point->get_types().is_ALN || this->get_types().is_ALN) && !(point->get_types().is_ALN && this->get_types().is_ALN)) { //
- this->type.is_ALN = false; //TODO trick ;)
- //THis is to merger noisy flanking regions:
- if (abs(point->get_coordinates().start.min_pos - positions.stop.min_pos) < Parameter::Instance()->max_dist / 2) { //left of the current position
- //start stays; stop changes
- this->positions.stop.min_pos = positions.stop.min_pos;
- this->positions.stop.max_pos = positions.stop.max_pos;
- //Now we make the stop positions invalid:
- std::map<std::string, read_str> support = this->positions.support;
- for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
- (*i).second.coordinates.second = -1;
- }
- support = point->get_coordinates().support;
- for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
- (*i).second.coordinates.second = -1;
- }
- } else if (abs(point->get_coordinates().stop.max_pos - positions.start.max_pos) < Parameter::Instance()->max_dist / 2) { //right of the current position
- // cout << "right " << endl;
- //stop stays; start changes:
- this->positions.start.min_pos = positions.start.min_pos;
- this->positions.start.max_pos = positions.start.max_pos;
-
- //Now we make the start positions invalid:
- std::map<std::string, read_str> support = this->positions.support;
- for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
- (*i).second.coordinates.first = -1;
- }
- support = point->get_coordinates().support;
- for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
- (*i).second.coordinates.first = -1;
- }
- }
- } else {
- this->positions.start.min_pos = min(this->positions.start.min_pos, point->get_coordinates().start.min_pos);
- this->positions.start.max_pos = max(this->positions.start.max_pos, point->get_coordinates().start.max_pos);
+void Breakpoint::add_read(Breakpoint * point) { //point = one read support!
- if (point->get_coordinates().support.begin()->second.SV & INS) {
- this->positions.stop.min_pos = this->positions.start.max_pos;
- this->positions.stop.max_pos = this->positions.start.max_pos;
- } else {
- this->positions.stop.min_pos = min(this->positions.stop.min_pos, point->get_coordinates().stop.min_pos);
- this->positions.stop.max_pos = max(this->positions.stop.max_pos, point->get_coordinates().stop.max_pos);
- }
- }
- bool flag = false;
+ if (point != NULL) {
+ /*if ((*point->get_coordinates().support.begin()).second.type == 0) {
+ this->type.is_ALN = true;
+ }
+ if ((*point->get_coordinates().support.begin()).second.type == 1) {
+ this->type.is_SR = true;
+ }
+
+ if ((point->get_types().is_ALN || this->get_types().is_ALN) && !(point->get_types().is_ALN && this->get_types().is_ALN)) { //just to extend a split read event
+ this->type.is_ALN = false; //TODO trick ;)
+ //THis is to merger noisy flanking regions:
+ if (abs(point->get_coordinates().start.min_pos - positions.stop.min_pos) < Parameter::Instance()->max_dist / 2) { //left of the current position
+ //start stays; stop changes
+ this->positions.stop.min_pos = positions.stop.min_pos;
+ this->positions.stop.max_pos = positions.stop.max_pos;
+ //Now we make the stop positions invalid:
+ std::map<std::string, read_str> support = this->positions.support;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ (*i).second.coordinates.second = -1;
+ }
+ support = point->get_coordinates().support;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ (*i).second.coordinates.second = -1;
+ }
+ } else if (abs(point->get_coordinates().stop.max_pos - positions.start.max_pos) < Parameter::Instance()->max_dist / 2) { //right of the current position
+ // cout << "right " << endl;
+ //stop stays; start changes:
+ this->positions.start.min_pos = positions.start.min_pos;
+ this->positions.start.max_pos = positions.start.max_pos;
+
+ //Now we make the start positions invalid:
+ std::map<std::string, read_str> support = this->positions.support;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ (*i).second.coordinates.first = -1;
+ }
+ support = point->get_coordinates().support;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ (*i).second.coordinates.first = -1;
+ }
+ }
+ } else { // merge two events:
+ //grow interval:
+ // this->positions.start.min_pos = min(this->positions.start.min_pos, point->get_coordinates().start.min_pos);
+ // this->positions.start.max_pos = max(this->positions.start.max_pos, point->get_coordinates().start.max_pos);
+
+ /* if (point->get_coordinates().support.begin()->second.SV & INS) {
+ this->positions.stop.min_pos = this->positions.start.max_pos;
+ this->positions.stop.max_pos = this->positions.start.max_pos;
+ } else {
+ this->positions.stop.min_pos = min(this->positions.stop.min_pos, point->get_coordinates().stop.min_pos);
+ this->positions.stop.max_pos = max(this->positions.stop.max_pos, point->get_coordinates().stop.max_pos);
+ }
+ }*/
+ //merge the support:
std::map<std::string, read_str> support = point->get_coordinates().support;
for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
- if (this->positions.support[(*i).first].SV & INS) {
- flag = true;
- }
this->positions.support[(*i).first] = (*i).second;
}
- if (flag) {
- this->length = (this->length + point->get_length()) / 2;
- }
}
}
@@ -198,8 +228,8 @@ std::vector<std::string> Breakpoint::get_read_names(int maxim) {
return read_names;
}
-std::vector<int> Breakpoint::get_read_ids() {
- std: vector<int> read_names;
+std::vector<long> Breakpoint::get_read_ids() {
+ std: vector<long> read_names;
std::map<std::string, read_str> support = this->positions.support;
int num = 0;
for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
@@ -225,7 +255,7 @@ std::string Breakpoint::translate_strand(pair<bool, bool> strand_pair) {
}
void Breakpoint::summarize_type(char SV, std::vector<short>& array) {
- //std::string ss;
+//std::string ss;
if (SV & DEL) {
// ss += "DEL; ";
array[0]++;
@@ -250,13 +280,17 @@ void Breakpoint::summarize_type(char SV, std::vector<short>& array) {
// ss += "TRA; ";
array[4]++;
}
- //return ss;
+ if (SV & NEST) {
+ // ss += "NEST; ";
+ array[5]++;
+ }
+//return ss;
}
char Breakpoint::get_SVtype() {
- if (sv_type == ' ') {
- std::cerr << "was not set" << std::endl;
+ if (sv_type & NA) {
+// std::cerr << "was not set" << std::endl;
calc_support();
predict_SV();
}
@@ -264,22 +298,21 @@ char Breakpoint::get_SVtype() {
}
void Breakpoint::calc_support() {
- //if (positions.support.size() > Parameter::Instance()->min_support) {
std::vector<short> SV;
- for (size_t i = 0; i < 5; i++) {
+ for (size_t i = 0; i < 6; i++) {
SV.push_back(0);
}
- //run over all supports and check the majority type:
+//run over all supports and check the majority type:
for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end(); i++) {
summarize_type((*i).second.SV, SV);
}
- //given the majority type get the stats:
+//given the majority type get the stats:
this->sv_type = eval_type(SV);
- //}
+
}
char Breakpoint::eval_type(std::vector<short> SV) {
- /* std::stringstream ss;
+ /*std::stringstream ss;
if (SV[0] != 0) {
ss << " DEL(";
ss << SV[0];
@@ -306,8 +339,8 @@ char Breakpoint::eval_type(std::vector<short> SV) {
ss << ")";
}
this->sv_debug = ss.str(); //only for debug!
- std::cout << sv_debug << std::endl;
- */
+ std::cout << sv_debug << std::endl;*/
+
int maxim = 0;
int id = 0;
for (size_t i = 0; i < SV.size(); i++) {
@@ -332,17 +365,32 @@ char Breakpoint::eval_type(std::vector<short> SV) {
if (maxim == SV[4]) {
max_SV |= TRA;
}
-
+ if (maxim == SV[5]) {
+ max_SV |= NEST;
+ }
return max_SV;
}
+long get_median(std::vector<long> corrds) {
+ sort(corrds.begin(), corrds.end());
+ if (corrds.size() % 2 == 0) {
+ return (corrds[corrds.size() / 2 - 1] + corrds[corrds.size() / 2]) / 2;
+ }
+ return corrds[corrds.size() / 2];
+
+}
void Breakpoint::predict_SV() {
bool aln = false;
bool split = false;
+ bool noise = false;
int num = 0;
std::map<long, int> starts;
std::map<long, int> stops;
+ std::map<long, int> lengths; //ins!
std::map<std::string, int> strands;
+ std::vector<long> start2;
+ std::vector<long> stops2;
+ std::vector<long> lengths2;
for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end(); i++) {
if ((*i).second.SV & this->sv_type) { ///check type
@@ -353,6 +401,7 @@ void Breakpoint::predict_SV() {
} else {
starts[(*i).second.coordinates.first]++;
}
+ start2.push_back((*i).second.coordinates.first);
}
if ((*i).second.coordinates.second != -1) { //TODO test
if (stops.find((*i).second.coordinates.second) == stops.end()) {
@@ -360,12 +409,21 @@ void Breakpoint::predict_SV() {
} else {
stops[(*i).second.coordinates.second]++;
}
+ stops2.push_back((*i).second.coordinates.second);
}
- if (!((*i).second.type == 0 && ((*i).second.SV & INV))) {
+ if ((*i).second.SV & INS) { //check lenght for ins only!
+ // std::cout<<"LENGTH 1st: "<<(*i).second.length<<" "<<(*i).first<<std::endl;
+ if (lengths.find((*i).second.length) == lengths.end()) {
+ lengths[(*i).second.length] = 1;
+ } else {
- std::string tmp = translate_strand((*i).second.strand);
+ lengths[(*i).second.length]++;
+ }
+ lengths2.push_back((*i).second.length);
+ }
- //std::cout << tmp << std::endl;
+ if (!((*i).second.type == 0 && ((*i).second.SV & INV))) {
+ std::string tmp = translate_strand((*i).second.strand);
if (strands.find(tmp) == strands.end()) {
strands[tmp] = 1;
} else {
@@ -376,6 +434,8 @@ void Breakpoint::predict_SV() {
aln = true;
} else if ((*i).second.type == 1) {
split = true;
+ } else if ((*i).second.type == 2) {
+ noise = true;
} else {
std::cerr << "Type " << (*i).second.type << std::endl;
}
@@ -387,17 +447,18 @@ void Breakpoint::predict_SV() {
long counts = 0;
int maxim = 0;
long coord = 0;
+
for (map<long, int>::iterator i = starts.begin(); i != starts.end(); i++) {
- //cout<<"start:\t"<<(*i).first<<" "<<(*i).second<<endl;
+ // cout << "start:\t" << (*i).first << " " << (*i).second << endl;
if ((*i).second > maxim) {
coord = (*i).first;
maxim = (*i).second;
}
- mean += ((*i).first * (*i).second);
- counts += (*i).second;
+ //mean += ((*i).first * (*i).second);
+ // counts += (*i).second;
}
if (maxim < 5) {
- this->positions.start.most_support = mean / counts;
+ this->positions.start.most_support = get_median(start2); //mean / counts;
} else {
this->positions.start.most_support = coord;
}
@@ -407,21 +468,48 @@ void Breakpoint::predict_SV() {
mean = 0;
counts = 0;
for (map<long, int>::iterator i = stops.begin(); i != stops.end(); i++) {
+ // cout << "stop:\t" << (*i).first << " " << (*i).second << endl;
if ((*i).second > maxim) {
coord = (*i).first;
maxim = (*i).second;
}
- mean += ((*i).first * (*i).second);
- counts += (*i).second;
+ //mean += ((*i).first * (*i).second);
+ // counts += (*i).second;
}
if (maxim < 5) {
- this->positions.stop.most_support = mean / counts;
+ this->positions.stop.most_support = get_median(stops2); // mean / counts;
} else {
this->positions.stop.most_support = coord;
}
- if (!(this->get_SVtype() & INS)) {
+ /*if(this->get_SVtype() & NEST){
+ this->length = -1;
+ }else
+ */
+
+ if (!(this->get_SVtype() & INS)) { //all types but Insertions:
this->length = this->positions.stop.most_support - this->positions.start.most_support;
+ } else { //compute most supported length for insertions:
+ maxim = 0;
+ coord = 0;
+ mean = 0;
+ counts = 0;
+ for (map<long, int>::iterator i = lengths.begin(); i != lengths.end(); i++) {
+ // std::cout<<"LENGTH: "<<(*i).first<<" : "<<(*i).second<<std::endl;
+ if ((*i).second > maxim) {
+ coord = (*i).first;
+ maxim = (*i).second;
+ }
+ // mean += ((*i).first * (long) (*i).second);
+ // counts += (*i).second;
+ }
+ if (maxim < 3) {
+ this->length = get_median(lengths2);
+ } else {
+ this->length = coord;
+ }
+
}
+
starts.clear();
stops.clear();
@@ -444,14 +532,23 @@ void Breakpoint::predict_SV() {
this->supporting_types = "";
if (aln) {
+ this->type.is_ALN = true;
this->supporting_types += "AL";
}
if (split) {
+ this->type.is_SR = true;
if (!supporting_types.empty()) {
this->supporting_types += ",";
}
this->supporting_types += "SR";
}
+ if (noise) {
+ this->type.is_Noise = true;
+ if (!supporting_types.empty()) {
+ this->supporting_types += ",";
+ }
+ this->supporting_types += "NR";
+ }
}
std::string Breakpoint::get_chr(long pos, RefVector ref) {
@@ -510,7 +607,7 @@ std::string Breakpoint::get_strand(int num_best) {
//if(this->strand.empty()){
// predict_SV();
//}
- if (sv_type == ' ') {
+ if (sv_type & NA) {
// std::cout<<"was not set"<<std::endl;
calc_support();
predict_SV();
@@ -532,16 +629,20 @@ std::string Breakpoint::get_strand(int num_best) {
#include "Detect_Breakpoints.h"
std::string Breakpoint::to_string() {
std::stringstream ss;
- ss << "\t\tTREE: ";
- ss << TRANS_type(this->get_SVtype());
- ss << " ";
- ss << get_coordinates().start.min_pos;
- ss << ":";
- ss << get_coordinates().stop.max_pos;
- ss << " ";
- ss << positions.support.size();
- ss << " ";
- ss << get_strand(2);
+ if (positions.support.size() > 1) {
+ ss << "\t\tTREE: ";
+ ss << TRANS_type(this->get_SVtype());
+ ss << " ";
+ ss << get_coordinates().start.min_pos;
+ ss << ":";
+ ss << get_coordinates().stop.max_pos;
+ ss << " ";
+ ss << this->length;
+ ss << " ";
+ ss << positions.support.size();
+ ss << " ";
+ ss << get_strand(2);
+ }
return ss.str();
}
std::string Breakpoint::to_string(RefVector ref) {
diff --git a/src/sub/Breakpoint.h b/src/sub/Breakpoint.h
index ce27f73..233c421 100644
--- a/src/sub/Breakpoint.h
+++ b/src/sub/Breakpoint.h
@@ -14,6 +14,7 @@
#include <sstream>
#include <math.h>
#include <vector>
+#include <algorithm>
#include "../Paramer.h"
#include "../BamParser.h"
#include "../tree/BinTree.h"
@@ -35,7 +36,7 @@ struct svs_breakpoint_str{
};
struct read_str {
//to identify
- std::string name;
+// std::string name;
long id;
region_ref_str aln; //maybe we can use this!
short type; //split reads, cigar or md string
@@ -43,6 +44,7 @@ struct read_str {
pair<bool, bool> strand;
pair<long,long> coordinates; // I could use the bin tree for that!
char SV; // bit vector
+ int length;
};
struct position_str {
svs_breakpoint_str start;
@@ -60,6 +62,7 @@ struct position_str {
struct str_types{
bool is_SR;
bool is_ALN;
+ bool is_Noise;
};
//TODO define region object and inherit from that. Plus define avoid region objects for mappability problems.
@@ -72,7 +75,7 @@ private:
char sv_type;
std::string sv_debug;
std::string ref_seq;
- std::vector<short> support;
+ //std::vector<short> support;
short type_support;
//for phasing:
BinTree grouped;
@@ -89,11 +92,16 @@ private:
std::string translate_strand(pair<bool, bool> strand);
bool is_same_strand(Breakpoint * tmp);
bool check_SVtype(Breakpoint * break1, Breakpoint * break2);
+ bool is_NEST(Breakpoint * next, Breakpoint * curr){
+ return (( (*next->get_coordinates().support.begin()).second.SV& NEST) ||( (*curr->get_coordinates().support.begin()).second.SV& NEST) );
+ }
public:
Breakpoint(position_str sv,long len) {
- sv_type=' ';
+
+ sv_type |= NA;
type.is_ALN=((*sv.support.begin()).second.type==0);
type.is_SR=((*sv.support.begin()).second.type==1);
+ type.is_Noise=((*sv.support.begin()).second.type==2);
type_support=-1;
this->positions = sv;
this->grouped_node=NULL;
@@ -105,6 +113,10 @@ public:
int get_support();
long overlap(Breakpoint * tmp);
+ void set_coordinates(int start, int stop){
+ this->positions.start.min_pos=start;
+ this->positions.stop.max_pos=stop;
+ }
position_str get_coordinates() {
return this->positions;
}
@@ -146,8 +158,8 @@ public:
str_types get_types(){
return this->type;
}
- std::vector< std::string> get_read_names(int num);
- std::vector<int> get_read_ids();
+ std::vector<std::string> get_read_names(int num);
+ std::vector<long> get_read_ids();
std::string to_string();
};
diff --git a/src/sub/Detect_Breakpoints.cpp b/src/sub/Detect_Breakpoints.cpp
index b8e5c2f..9ff6bc0 100644
--- a/src/sub/Detect_Breakpoints.cpp
+++ b/src/sub/Detect_Breakpoints.cpp
@@ -6,6 +6,97 @@
*/
#include "Detect_Breakpoints.h"
+#include "../print/IPrinter.h"
+
+long fuck_off(long pos, RefVector ref, std::string &chr) {
+ size_t i = 0;
+ pos -= (ref[i].RefLength + Parameter::Instance()->max_dist);
+
+ while (i < ref.size() && pos >= 0) {
+ i++;
+ pos -= ((long) ref[i].RefLength + (long) Parameter::Instance()->max_dist);
+ }
+ chr = ref[i].RefName;
+ return pos + ref[i].RefLength + (long) Parameter::Instance()->max_dist;
+}
+void store_pos(vector<hist_str> &positions, long pos, std::string read_name) {
+ for (size_t i = 0; i < positions.size(); i++) {
+ if (abs(positions[i].position - pos) < Parameter::Instance()->min_length) {
+ positions[i].hits++;
+ positions[i].names.push_back(read_name);
+ return;
+ }
+ }
+ hist_str tmp;
+ tmp.position = pos;
+ tmp.hits = 1;
+ tmp.names.push_back(read_name);
+ positions.push_back(tmp);
+}
+
+Breakpoint * split_points(vector<std::string> names, std::map<std::string, read_str> support) {
+ std::map<std::string, read_str> new_support;
+ for (size_t i = 0; i < names.size(); i++) {
+ new_support[names[i]] = support[names[i]];
+ }
+ position_str svs;
+ svs.start.min_pos = 0;//just to initialize. Should not be needed anymore at this stage of the prog.
+ svs.stop.max_pos = 0;
+ svs.support = new_support;
+ Breakpoint * point = new Breakpoint(svs, (*new_support.begin()).second.coordinates.second - (*new_support.begin()).second.coordinates.first);
+ return point;
+}
+
+void detect_merged_svs(position_str point, RefVector ref, vector<Breakpoint *> & new_points) {
+ new_points.clear(); //just in case!
+ vector<hist_str> pos_start;
+ vector<hist_str> pos_stop;
+
+ for (std::map<std::string, read_str>::iterator i = point.support.begin(); i != point.support.end(); i++) {
+ long pos = (*i).second.coordinates.first;
+ //std::cout<<(*i).second.coordinates.first<<" "<<(*i).second.coordinates.second<<std::endl;
+ store_pos(pos_start, pos, (*i).first);
+ pos = (*i).second.coordinates.second;
+ store_pos(pos_stop, pos, (*i).first);
+ }
+// std::cout << "Start: " << std::endl;
+ int start_count = 0;
+ for (size_t i = 0; i < pos_start.size(); i++) {
+ if (pos_start[i].hits > Parameter::Instance()->min_support) {
+ start_count++;
+ /* std::string chr = "";
+ long pos = fuck_off(pos_start[i].position, ref, chr);
+ std::cout << chr << " " << pos << ":" << pos_start[i].hits << " ; ";
+ */
+ }
+ }
+// std::cout << std::endl;
+// std::cout << "Stop: " << std::endl;
+ int stop_count = 0;
+ for (size_t i = 0; i < pos_stop.size(); i++) {
+ if (pos_stop[i].hits > Parameter::Instance()->min_support) {
+ stop_count++;
+ /* std::string chr = "";
+ long pos = fuck_off(pos_stop[i].position, ref, chr);
+ std::cout << chr << " " << pos << ":" << pos_stop[i].hits << " ; ";
+ */
+ }
+ }
+// std::cout << std::endl;
+// std::cout << std::endl;
+
+ if (stop_count > 1 || start_count > 1) {
+ std::cout << "\tprocessing merged TRA" << std::endl;
+ if (start_count > 1) {
+ new_points.push_back(split_points(pos_start[0].names, point.support));
+ new_points.push_back(split_points(pos_start[1].names, point.support));
+ } else {
+ new_points.push_back(split_points(pos_stop[0].names, point.support));
+ new_points.push_back(split_points(pos_stop[1].names, point.support));
+ }
+ }
+}
+
std::string TRANS_type(char type) {
string tmp;
if (type & DEL) {
@@ -35,7 +126,12 @@ std::string TRANS_type(char type) {
}
tmp += "TRA";
}
-
+ if (type & NEST) {
+ if (!tmp.empty()) {
+ tmp += '/';
+ }
+ tmp += "NEST";
+ }
return tmp; // should not occur!
}
@@ -50,33 +146,48 @@ long get_ref_lengths(int id, RefVector ref) {
bool should_be_stored(Breakpoint *& point) {
point->calc_support(); // we need that before:
+ //std::cout << "Stored: " << point->get_support() << " " << point->get_length() << std::endl;
if (point->get_SVtype() & TRA) { // we cannot make assumptions abut support yet.
return (point->get_support() > 2); // this is needed as we take each chr independently and just look at the primary alignment
- } else if (point->get_support() > Parameter::Instance()->min_support) {
+ } else if (point->get_support() >= Parameter::Instance()->min_support) {
point->predict_SV();
- return (point->get_support() > Parameter::Instance()->min_support && point->get_length() > Parameter::Instance()->min_length);
+ return (point->get_support() >= Parameter::Instance()->min_support && point->get_length() > Parameter::Instance()->min_length);
}
return false;
}
-void polish_points(std::vector<Breakpoint *> & points) { //TODO might be usefull! but why does the tree not fully work??
+void polish_points(std::vector<Breakpoint *> & points, RefVector ref) { //TODO might be usefull! but why does the tree not fully work??
return;
if (!points.empty()) {
- std::vector<Breakpoint *> new_points;
- points[0]->calc_support();
- new_points.push_back(points[0]);
- for (size_t i = 1; i < points.size(); i++) {
+ for (size_t i = 0; i < points.size(); i++) {
+ points[i]->predict_SV();
points[i]->calc_support();
- if (should_be_stored(points[i])) {
- if (abs(points[i - 1]->get_coordinates().start.min_pos - points[i]->get_coordinates().start.min_pos) < Parameter::Instance()->min_length && abs(points[i - 1]->get_coordinates().stop.max_pos - points[i]->get_coordinates().stop.max_pos) < Parameter::Instance()->min_length) {
- //merge!
- std::cout << "HIT: " << points[i - 1]->get_coordinates().start.min_pos << " " << points[i - 1]->get_support() << " OTHER: " << points[i]->get_coordinates().start.min_pos << " " << points[i]->get_support() << std::endl;
+ }
+
+ for (size_t i = 0; i < points.size(); i++) {
+ if (!(points[i]->get_SVtype() & TRA)) { // we cannot make assumptions abut support yet.
+ for (size_t j = 0; j < points.size(); j++) {
+ if (!(points[j]->get_SVtype() & TRA)) {
+ if ((i != j)
+ && (abs(points[j]->get_coordinates().start.most_support - points[i]->get_coordinates().start.most_support) < Parameter::Instance()->max_dist / 2
+ && abs(points[j]->get_coordinates().stop.most_support - points[i]->get_coordinates().stop.most_support) < Parameter::Instance()->max_dist / 2)) {
+
+ std::string chr;
+ int pos1 = fuck_off(points[j]->get_coordinates().start.most_support, ref, chr);
+ std::cout << "HIT: " << chr << " " << pos1 << " " << TRANS_type((*points[j]->get_coordinates().support.begin()).second.SV) << " OTHER: ";
+
+ pos1 = fuck_off(points[i]->get_coordinates().start.most_support, ref, chr);
+
+ std::cout << chr << " " << pos1 << " " << TRANS_type((*points[i]->get_coordinates().support.begin()).second.SV) << std::endl;
+ }
+ }
}
}
}
}
}
+
void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
estimate_parameters(read_filename);
BamParser * mapped_file = 0;
@@ -93,13 +204,17 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
std::cout << "Start parsing..." << std::endl;
//Using Interval tree to store and manage breakpoints:
+ //IntervallList final;
+ //IntervallList bst;
+
IntervallTree final;
+ IntervallTree bst;
+
TNode * root_final = NULL;
int current_RefID = 0;
- IntervallTree bst;
TNode *root = NULL;
- //FILE * alt_allel_reads;
+//FILE * alt_allel_reads;
FILE * ref_allel_reads;
if (Parameter::Instance()->genotype) {
@@ -113,24 +228,23 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
long ref_space = get_ref_lengths(tmp_aln->getRefID(), ref);
- std::string prev = "test";
- std::string curr = "wtf";
+ //std::string prev = "test";
+ //std::string curr = "curr";
long num_reads = 0;
while (!tmp_aln->getQueryBases().empty()) {
-
if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) {
- //flush_tree(local_tree, local_root, final, root_final);
-
- if (current_RefID != tmp_aln->getRefID()) { // Regular scan through the SV and move those where the end point lies far behind the current pos or reads. Eg. 1MB?
+ //change CHR:
+ if (current_RefID != tmp_aln->getRefID()) {
current_RefID = tmp_aln->getRefID();
ref_space = get_ref_lengths(tmp_aln->getRefID(), ref);
std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << " " << ref[tmp_aln->getRefID()].RefLength << std::endl;
std::vector<Breakpoint *> points;
// clarify(points);
bst.get_breakpoints(root, points);
- polish_points(points);
+ polish_points(points, ref);
for (int i = 0; i < points.size(); i++) {
if (should_be_stored(points[i])) {
+ //detect_merged_svs(points[i]);
if (points[i]->get_SVtype() & TRA) {
final.insert(points[i], root_final);
} else {
@@ -138,8 +252,9 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
}
}
- bst.makeempty(root);
+ bst.clear(root);
}
+ //SCAN read:
std::vector<str_event> aln_event;
std::vector<aln_str> split_events;
if (tmp_aln->getMappingQual() > Parameter::Instance()->min_mq) {
@@ -166,6 +281,7 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
//tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty());
+ //Store reference supporting reads for genotype estimation:
str_read tmp;
tmp.SV_support = !(aln_event.empty() && split_events.empty());
if ((Parameter::Instance()->genotype && !tmp.SV_support) && (score == -1 || score > Parameter::Instance()->score_treshold)) {
@@ -176,59 +292,39 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
tmp.SV_support = false;
fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
}
- /*else { // we store the reads that support the SVs in IPrinter when writing out the SVs.
- for (size_t i = 0; i < split_events.size(); i++) {
- tmp.chr = ref[split_events[i].RefID].RefName;
- tmp.start = split_events[i].pos;
- tmp.length = split_events[i].length;
- fwrite(&tmp, sizeof(struct str_read), 1, alt_allel_reads);
- }
- if (split_events.empty()) { //splits store the primary aln as well!
- tmp.chr = ref[tmp_aln->getRefID()].RefName;
- tmp.start = tmp_aln->getPosition();
- tmp.length = tmp_aln->getRefLength();
- fwrite(&tmp, sizeof(struct str_read), 1, alt_allel_reads);
- }
- }*/
-
- // clock_t begin = clock();
- //maybe just store the extreme intervals for coverage -> If the cov doubled within Xbp or were the coverage is 0.
+
+ //store the potential SVs:
if (!aln_event.empty()) {
add_events(tmp_aln, aln_event, 0, ref_space, bst, root, num_reads);
}
- // Parameter::Instance()->meassure_time(begin, " add event ");
- //cout<<"event: "<<aln_event.size()<<endl;
- // begin = clock();
if (!split_events.empty()) {
add_splits(tmp_aln, split_events, 1, ref, bst, root, num_reads);
-
}
- // Parameter::Instance()->meassure_time(begin, " add split ");
-
}
}
+ //get next read:
mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln);
+
num_reads++;
+
if (num_reads % 10000 == 0) {
- curr = tmp_aln->getName();
+ // curr = tmp_aln->getName();
cout << "\t\t# Processed reads: " << num_reads << endl;
- if (curr.size() == prev.size() && strcmp(prev.c_str(), curr.c_str()) == 0) {
- std::cerr << "Read occurred twice with primary alignment marked. Possibly no eof recognized in bam file." << std::endl;
- break;
- }
- prev = curr;
+ /*(if (curr.size() == prev.size() && strcmp(prev.c_str(), curr.c_str()) == 0) {
+ std::cerr << "Read occurred twice with primary alignment marked. Possibly no eof recognized in bam file." << std::endl;
+ break;
+ }
+ prev = curr;*/
}
}
-
-// cout << "Print tree:" << std::endl;
-// bst.inorder(root);
//filter and copy results:
std::cout << "Finalizing .." << std::endl;
std::vector<Breakpoint *> points;
bst.get_breakpoints(root, points);
- polish_points(points);
+ polish_points(points, ref);
for (int i = 0; i < points.size(); i++) {
if (should_be_stored(points[i])) {
+
if (points[i]->get_SVtype() & TRA) {
final.insert(points[i], root_final);
} else {
@@ -236,14 +332,26 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
}
}
- bst.makeempty(root);
+ bst.clear(root);
if (Parameter::Instance()->genotype) {
fclose(ref_allel_reads);
}
// sweep->finalyze();
points.clear();
final.get_breakpoints(root_final, points);
- polish_points(points);
+
+ size_t points_size = points.size();
+ for (size_t i = 0; i < points_size; i++) { // its not nice, but I may alter the length of the vector within the loop.
+ if (points[i]->get_SVtype() & TRA) {
+ vector<Breakpoint *> new_points;
+ detect_merged_svs(points[i]->get_coordinates(), ref, new_points);
+ if (!new_points.empty()) { // I only allow for 1 split!!
+ points[i] = new_points[0];
+ points.push_back(new_points[1]);
+ }
+ }
+ }
+
for (size_t i = 0; i < points.size(); i++) {
if (points[i]->get_SVtype() & TRA) {
points[i]->calc_support();
@@ -255,25 +363,24 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
}
-void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallTree & bst, TNode *&root, long read_id) {
+void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallContainer & bst, TNode *&root, long read_id) {
bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
for (size_t i = 0; i < events.size(); i++) {
position_str svs;
read_str read;
- read.type = 0;
+ if (events[i].is_noise) {
+ read.type = 2;
+ } else {
+ read.type = 0;
+ }
read.SV = events[i].type;
if (flag) {
std::cout << "ADD EVENT " << tmp->getName() << " " << tmp->getRefID() << " " << events[i].pos << " " << abs(events[i].length) << std::endl;
}
- svs.start.min_pos = (long) events[i].pos;
- svs.start.min_pos += ref_space;
- svs.stop.max_pos = svs.start.min_pos;
-
- if (!(events[i].type & INS)) { //for all events but not INS!
- svs.stop.max_pos += events[i].length;
- }
+ svs.start.min_pos = (long) events[i].pos + ref_space;
+ svs.stop.max_pos = svs.start.min_pos + events[i].length;
if (tmp->getStrand()) {
read.strand.first = (tmp->getStrand());
@@ -317,16 +424,25 @@ void add_events(Alignment *& tmp, std::vector<str_event> events, short type, lon
read.coordinates.first = svs.start.min_pos;
read.coordinates.second = svs.stop.max_pos;
}
+ /*if(read.SV & DEL){
+ std::cout<<"ADD DEL: "<<svs.start.min_pos<< " "<<svs.stop.max_pos<<" "<<tmp->getName()<<std::endl;
+ }*/
+ /*if(read.SV & INS){
+ std::cout<<"ALN LEN: "<<events[i].length<<" Pos: "<<svs.start.min_pos<< " "<<svs.stop.max_pos<<" "<<tmp->getName()<<std::endl;
+ }*/
read.id = read_id;
svs.support[tmp->getName()] = read;
+ svs.support[tmp->getName()].length = events[i].length;
Breakpoint * point = new Breakpoint(svs, events[i].length);
bst.insert(point, root);
+ //std::cout<<"Print:"<<std::endl;
+ //bst.print(root);
}
}
-void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallTree & bst, TNode *&root, long read_id) {
+void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallContainer & bst, TNode *&root, long read_id) {
bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
- //flag = true;
+
if (false) {
cout << "SPLIT: " << std::endl;
for (size_t i = 0; i < events.size(); i++) {
@@ -356,50 +472,42 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
read.strand.first = !events[i - 1].strand;
read.strand.second = events[i].strand;
}
- int len1 = 0;
- int len2 = 0;
+ // int len1 = 0;
+ //int len2 = 0;
svs.read_start = events[i - 1].read_pos_stop; // (short) events[i - 1].read_pos_start + (short) events[i - 1].length;
svs.read_stop = events[i].read_pos_start;
if (events[i - 1].strand) {
- len1 = events[i - 1].pos;
- len2 = events[i].pos - events[i].length;
+ // len1 = events[i - 1].pos;
+ // len2 = events[i].pos - events[i].length;
svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
} else {
- len1 = events[i].pos;
- len2 = events[i - 1].pos - events[i - 1].length;
+ // len1 = events[i].pos;
+ // len2 = events[i - 1].pos - events[i - 1].length;
svs.start.min_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
svs.stop.max_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
}
if (flag) {
cout << "Debug: SV_Size: " << (svs.start.min_pos - svs.stop.max_pos) << " Ref_start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " Ref_stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref) << " readstart: " << svs.read_start << " readstop: "
- << svs.read_stop << "readstart+len: " << len1 << " readstop+len: " << len2 << endl;
+ << svs.read_stop << std::endl;
}
- if ((svs.stop.max_pos - svs.start.min_pos) > 0 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_cigar_event) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_cigar_event * 2))) {
+ if ((svs.stop.max_pos - svs.start.min_pos) > -1 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_length) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_length * 2))) {
if (flag) {
cout << "INS: " << endl;
}
- //read.SV = 'n'; //TODO redefine criteria!
read.SV |= INS;
- /* if (events[i - 1].strand) { //TODO check f
- svs.start.min_pos -=events[i - 1].length;
- }else{
- svs.start.min_pos -=events[i].length;
- }*/
- // cout<<"Split INS: "<<events[i - 1].length<<" "<<(svs.stop.max_pos - svs.start.min_pos)<<" "<<svs.read_stop - svs.read_start<<endl;
- } else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_cigar_event)) {
+ } else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) {
read.SV |= DEL;
if (flag) {
cout << "DEL" << endl;
}
- } else if ((svs.start.min_pos - svs.stop.max_pos) > Parameter::Instance()->min_cigar_event && (svs.read_start - svs.read_stop)<Parameter::Instance()->min_length ) { //check with respect to the coords of reads!
+ } else if ((svs.start.min_pos - svs.stop.max_pos) > Parameter::Instance()->min_length && (svs.read_start - svs.read_stop) < Parameter::Instance()->min_length) { //check with respect to the coords of reads!
read.SV |= DUP;
if (flag) {
cout << "DUP: " << endl;
}
- //TODO ADDED &&(svs.read_stop - svs.read_start) > (Parameter::Instance()->min_cigar_event)
} else {
if (flag) {
cout << "N" << endl;
@@ -407,18 +515,41 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
read.SV = 'n';
}
} else { // if first part of read is in a different direction as the second part-> INV
- if (flag) {
- cout << "INV" << endl;
- }
+
read.strand.first = events[i - 1].strand;
read.strand.second = !events[i].strand;
- read.SV |= INV;
- if (events[i - 1].strand) {
- svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
+ if (overlaps(events[i - 1], events[i])) {
+ if (flag) {
+ std::cout << "Overlap curr: " << events[i].pos << " " << events[i].pos + events[i].length << " prev: " << events[i - 1].pos << " " << events[i - 1].pos + events[i - 1].length << " " << tmp->getName() << std::endl;
+ }
+ read.SV |= NEST;
+
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
+ } else {
+ svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ }
+
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ long tmp = svs.start.min_pos;
+ svs.start.min_pos = svs.stop.max_pos;
+ svs.stop.max_pos = tmp;
+ }
+ // if(flag){
+ // std::cout<<"NEST: "<<svs.start.min_pos- get_ref_lengths(events[i - 1].RefID, ref) << " "<<svs.stop.max_pos - get_ref_lengths(events[i - 1].RefID, ref)<<" "<<tmp->getName()<<std::endl;
+ // }
+ // svs.stop.max_pos = svs.start.min_pos + Parameter::Instance()->min_length * 2;
} else {
- svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ read.SV |= INV;
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
+ } else {
+ svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ }
}
}
@@ -447,7 +578,7 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
if (read.SV != 'n') {
if (flag) {
- std::cout << "SPLIT: " << TRANS_type(read.SV) << " start: " << svs.start.min_pos << " stop: " << svs.stop.max_pos; //- get_ref_lengths(events[i].RefID, ref)
+ std::cout << "SPLIT: " << TRANS_type(read.SV) << " start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref);
if (events[i - 1].strand) {
std::cout << " +";
} else {
@@ -459,7 +590,7 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
std::cout << " -";
}
std::cout << " " << tmp->getName() << std::endl;
- std::cout<<"READ: "<<svs.read_start<<" "<<svs.read_stop<<" "<<svs.read_start - svs.read_stop<<std::endl;
+ std::cout << "READ: " << svs.read_start << " " << svs.read_stop << " " << svs.read_start - svs.read_stop << std::endl;
}
//std::cout<<"split"<<std::endl;
@@ -485,25 +616,26 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
read.coordinates.first = svs.start.min_pos;
read.coordinates.second = svs.stop.max_pos;
}
-
+ /*if(read.SV & INS){
+ std::cout<<"Split LEN: "<<abs(read.coordinates.second-read.coordinates.first)<<" Pos: "<<read.coordinates.first-get_ref_lengths(events[i - 1].RefID, ref)<<tmp->getName()<<std::endl;
+ }*/
+ //if( abs(read.coordinates.second-read.coordinates.first)<10){
+ /// events[i].length=abs(read.coordinates.second-read.coordinates.first);//TODO try
+ // std::cout<<"Split event ===1 "<<" "<<abs(read.coordinates.second-read.coordinates.first)<<" len: "<< events[i].length<<" Pos "<<events[i].pos <<" Name "<<tmp->getName() <<std::endl;
+ // }
//pool out?
read.id = read_id;
svs.support[tmp->getName()] = read;
- Breakpoint * point = new Breakpoint(svs, events[i].length);
-
+ svs.support[tmp->getName()].length = abs(read.coordinates.second - read.coordinates.first);
+ Breakpoint * point = new Breakpoint(svs, abs(read.coordinates.second - read.coordinates.first));
+ //std::cout<<"split ADD: " << <<" Name: "<<tmp->getName()<<" "<< svs.start.min_pos- get_ref_lengths(events[i].RefID, ref)<<"-"<<svs.stop.max_pos- get_ref_lengths(events[i].RefID, ref)<<std::endl;
bst.insert(point, root);
- // breakpoints_tmp1.push_back(pair<position_str,int> (svs, events[i].length)); TODO we could create 2 loops: 1st: collect evidence 2nd: further interpretation of complex Var
+ // std::cout<<"Print:"<<std::endl;
+ // bst.print(root);
}
}
}
-void clarify(std::vector<Breakpoint *> & points) {
-//if WTF regions next to duplications-> delete!
- /*for(size_t i=0;i<points.size();i++){
-
- }*/
-}
-
void estimate_parameters(std::string read_filename) {
cout << "Estimating parameter..." << endl;
BamParser * mapped_file = 0;
@@ -524,7 +656,7 @@ void estimate_parameters(std::string read_filename) {
double avg_diffs_perwindow = 0;
vector<int> mis_per_window; //histogram over #differences
vector<int> scores;
- std::string curr, prev = "";
+// std::string curr, prev = "";
double avg_dist = 0;
while (!tmp_aln->getQueryBases().empty() && num < 1000) { //1000
if (rand() % 100 < 20 && ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800)))) { //}&& tmp_aln->get_is_save()))) {
@@ -532,6 +664,7 @@ void estimate_parameters(std::string read_filename) {
//2. get score ration without checking before hand! (above if!)
double dist = 0;
vector<int> tmp = tmp_aln->get_avg_diff(dist);
+ //std::cout<<dist<<std::endl;
avg_dist += dist;
for (size_t i = 0; i < tmp.size(); i++) {
while (tmp[i] + 1 > mis_per_window.size()) { //adjust length
@@ -548,18 +681,23 @@ void estimate_parameters(std::string read_filename) {
scores[score]++;
num++;
}
+
mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln);
- curr = tmp_aln->getName();
- if (curr.size() == prev.size() && strcmp(prev.c_str(), curr.c_str()) == 0) {
- std::cerr << "Read occurred twice with primary alignment marked. Possibly no eof recognized in bam file." << std::endl;
- break;
- }
- prev = curr;
+ /*curr = tmp_aln->getName();
+ if (curr.size() == prev.size() && strcmp(prev.c_str(), curr.c_str()) == 0) {
+ std::cerr << "Read occurred twice with primary alignment marked. Possibly no eof recognized in bam file." << std::endl;
+ break;
+ }
+ prev = curr;*/
+ }
+ if (num == 0) {
+ std::cerr << "No reads detected in " << Parameter::Instance()->bam_files[0] << std::endl;
+ exit(1);
}
vector<int> nums;
size_t pos = 0;
Parameter::Instance()->max_dist_alns = floor(avg_dist / num) / 2;
- Parameter::Instance()->window_thresh = 25;
+ Parameter::Instance()->window_thresh = 50; //25;
if (!mis_per_window.empty()) {
for (size_t i = 0; i < mis_per_window.size(); i++) {
@@ -572,7 +710,7 @@ void estimate_parameters(std::string read_filename) {
}
pos = nums.size() * 0.95; //the highest 5% cutoff
if (pos <= nums.size()) {
- Parameter::Instance()->window_thresh = std::max(50, nums[pos]); //just in case we have too clean data! :)
+ Parameter::Instance()->window_thresh = std::max(Parameter::Instance()->window_thresh, nums[pos]); //just in case we have too clean data! :)
}
nums.clear();
}
@@ -589,3 +727,15 @@ void estimate_parameters(std::string read_filename) {
std::cout << "\tMin score ratio: " << Parameter::Instance()->score_treshold << std::endl;
}
+bool overlaps(aln_str prev, aln_str curr) {
+
+ double ratio = 0;
+ double overlap = 0;
+
+ if (prev.pos + Parameter::Instance()->min_length < curr.pos + curr.length && prev.pos + prev.length - Parameter::Instance()->min_length > curr.pos) {
+ overlap = min((curr.pos + curr.length), (prev.pos + prev.length)) - max(prev.pos, curr.pos);
+ ratio = overlap / (double) min(curr.length, prev.length);
+ }
+
+ return (ratio > 0.4 && overlap > 200);
+}
diff --git a/src/sub/Detect_Breakpoints.h b/src/sub/Detect_Breakpoints.h
index acb003c..fb1eeae 100644
--- a/src/sub/Detect_Breakpoints.h
+++ b/src/sub/Detect_Breakpoints.h
@@ -14,20 +14,28 @@
#include "../plane-sweep/Plane-sweep.h"
#include "../tree/IntervallTree.h"
#include "../tree/TNode.h"
+#include "../tree/IntervallContainer.h"
+#include "../tree/IntervallList.h"
#include "../Paramer.h"
#include "../print/IPrinter.h"
#include <iostream>
#include <omp.h>
+struct hist_str{
+ long position;
+ int hits;
+ std::vector<std::string> names;
+};
void clarify(std::vector<Breakpoint *> & points);
void detect_breakpoints(std::string filename, IPrinter *& printer);
//void screen_for_events(Node * list,IntervallTree & bst ,TNode *&root, int cov, int lowMQ_cov,RefVector ref);
bool screen_for_events(Alignment * tmp, IntervallTree & bst, TNode *&root, RefVector ref, int cov);
-void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallTree & bst, TNode *&root,long read_id);
-void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallTree & bst, TNode *&root,long read_id);
+void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallContainer & bst, TNode *&root,long read_id);
+void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallContainer & bst, TNode *&root,long read_id);
void estimate_parameters(std::string read_filename);
-
+bool overlaps(aln_str prev,aln_str curr);
+void detect_merged_svs(Breakpoint * point);
std::string TRANS_type(char type);
diff --git a/src/test b/src/test
deleted file mode 100644
index e69de29..0000000
diff --git a/src/tree/IntervallContainer.h b/src/tree/IntervallContainer.h
new file mode 100644
index 0000000..b46227d
--- /dev/null
+++ b/src/tree/IntervallContainer.h
@@ -0,0 +1,27 @@
+/*
+ * IntervallContainer.h
+ *
+ * Created on: Nov 2, 2016
+ * Author: fsedlaze
+ */
+
+#ifndef TREE_INTERVALLCONTAINER_H_
+#define TREE_INTERVALLCONTAINER_H_
+
+class IntervallContainer {
+protected:
+
+public:
+ IntervallContainer() {
+
+ }
+ virtual ~IntervallContainer() {
+
+ }
+ virtual void insert(Breakpoint * point, TNode *&)=0;
+ virtual void get_breakpoints(TNode *p, std::vector<Breakpoint *> & points)=0;
+ virtual void clear(TNode*&)=0;
+ virtual void print(TNode *p)=0;
+};
+
+#endif /* TREE_INTERVALLCONTAINER_H_ */
diff --git a/src/tree/IntervallList.cpp b/src/tree/IntervallList.cpp
new file mode 100644
index 0000000..1187aed
--- /dev/null
+++ b/src/tree/IntervallList.cpp
@@ -0,0 +1,43 @@
+/*
+ * List.cpp
+ *
+ * Created on: Nov 2, 2016
+ * Author: fsedlaze
+ */
+
+#include "IntervallList.h"
+
+void IntervallList::insert(Breakpoint * point, TNode *& note) {
+
+ for (size_t i = 0; i < this->breakpoints.size(); i++) {
+ long score = this->breakpoints[i]->get_data()->overlap(point);
+ if (score == 0) {
+ // std::cout<<"overlap: "<<this->breakpoints[i]->get_data()->get_coordinates().support.size()<<std::endl;
+ this->breakpoints[i]->get_data()->add_read(point);
+ delete point;
+ return;
+ }/* else if (score < 0) {
+ TNode * p = new TNode(point);
+ this->breakpoints.insert((this->breakpoints.begin()+i),p);
+ return;
+ }*/
+ }
+ TNode * p = new TNode(point);
+ this->breakpoints.push_back(p);
+}
+void IntervallList::get_breakpoints(TNode *p, std::vector<Breakpoint *> & points) {
+ for (size_t i = 0; i < this->breakpoints.size(); i++) {
+ points.push_back(this->breakpoints[i]->get_data());
+ }
+}
+
+void IntervallList::clear(TNode*&){
+ this->breakpoints.clear();
+}
+void IntervallList::print(TNode *p){
+ std::cout<<"Print:"<<std::endl;
+ for (size_t i = 0; i < this->breakpoints.size(); i++) {
+ std::cout<<"( "<<this->breakpoints[i]->get_data()->get_coordinates().start.min_pos<<"-"<<this->breakpoints[i]->get_data()->get_coordinates().stop.max_pos<<" )\t";
+ }
+ std::cout<<std::endl;
+}
diff --git a/src/tree/IntervallList.h b/src/tree/IntervallList.h
new file mode 100644
index 0000000..79ef6a7
--- /dev/null
+++ b/src/tree/IntervallList.h
@@ -0,0 +1,32 @@
+/*
+ * List.h
+ *
+ * Created on: Nov 2, 2016
+ * Author: fsedlaze
+ */
+
+#ifndef TREE_INTERVALLLIST_H_
+#define TREE_INTERVALLLIST_H_
+#include <vector>
+
+#include "TNode.h"
+#include "IntervallContainer.h"
+
+class IntervallList:public IntervallContainer {
+private:
+ std::vector<TNode * > breakpoints;
+public:
+ IntervallList(){
+
+ }
+ ~IntervallList(){
+
+ }
+ void insert(Breakpoint * point, TNode *&);
+ void get_breakpoints(TNode *p,std::vector<Breakpoint *> & points);
+ void clear(TNode*&);
+ void print(TNode *p);
+};
+
+
+#endif /* TREE_INTERVALLLIST_H_ */
diff --git a/src/tree/IntervallTree.cpp b/src/tree/IntervallTree.cpp
index daaff03..76ee22d 100644
--- a/src/tree/IntervallTree.cpp
+++ b/src/tree/IntervallTree.cpp
@@ -7,18 +7,44 @@
#include "IntervallTree.h"
+void IntervallTree::careful_screening(Breakpoint *& new_break, TNode *p) { //maybe I just need the pointer not a ref.
+ if (p != NULL && !(new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1)) {
+ careful_screening(new_break, p->left);
+ if (p->get_data()->overlap(new_break) == 0) { //SV type
+ p->get_data()->add_read(new_break);
+ new_break->set_coordinates(-1, -1);
+ return;
+ }
+ careful_screening(new_break, p->right);
+ }
+}
+
// Inserting a node
void IntervallTree::insert(Breakpoint * new_break, TNode *&p) {
-
- if (p == NULL) {
- p = new TNode(new_break); //TODO: this is going to be a problem!
+ if (new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) {
+ return;
+ }
+ if (p == NULL) { // add to tree:
+ p = new TNode(new_break);
if (p == NULL) {
std::cout << "Out of Space\n" << std::endl;
}
- } else {
+ } else { // find on tree:
long score = p->get_data()->overlap(new_break); //comparison function
+ if (score == 0) { //add SV types?
+ p->get_data()->add_read(new_break);
+ new_break->set_coordinates(-1, -1);
+ //delete new_break;
+ return;
+ } else if (abs(score) < Parameter::Instance()->max_dist) { // if two or more events are too close:
+ //std::cout<<"Screen"<<std::endl;
+ careful_screening(new_break, p);
+ if (new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) {
+ return;
+ }
+ }
- if (score > 0) {
+ if (score > 0) { // go left
insert(new_break, p->left);
if ((bsheight(p->left) - bsheight(p->right)) == 2) {
score = p->left->get_data()->overlap(new_break);
@@ -28,7 +54,7 @@ void IntervallTree::insert(Breakpoint * new_break, TNode *&p) {
p = drl(p);
}
}
- } else if (score < 0) {
+ } else if (score < 0) { // go right
insert(new_break, p->right);
if ((bsheight(p->right) - bsheight(p->left)) == 2) {
score = p->right->get_data()->overlap(new_break);
@@ -38,10 +64,6 @@ void IntervallTree::insert(Breakpoint * new_break, TNode *&p) {
p = drr(p);
}
}
- } else { //overlaps!
- p->get_data()->add_read(new_break);
- delete new_break;
- //std::cout << "Breakpoint was detected\n" << std::endl;
}
}
int m, n, d;
@@ -94,15 +116,15 @@ void IntervallTree::find(Breakpoint * point, TNode * &p) {
}
// Copy a tree
void IntervallTree::copy(TNode * &p, TNode * &p1) {
- makeempty(p1);
+ clear(p1);
p1 = nodecopy(p);
}
// Make a tree empty
-void IntervallTree::makeempty(TNode * &p) {
+void IntervallTree::clear(TNode * &p) {
TNode * d;
if (p != NULL) {
- makeempty(p->left);
- makeempty(p->right);
+ clear(p->left);
+ clear(p->right);
d = p;
free(d);
p = NULL;
@@ -175,6 +197,7 @@ void IntervallTree::preorder(TNode * p) {
void IntervallTree::get_breakpoints(TNode *p, std::vector<Breakpoint *> & points) {
if (p != NULL) {
get_breakpoints(p->right, points);
+ //std::cout << "( " << p->get_data()->get_coordinates().start.min_pos << "-" << p->get_data()->get_coordinates().stop.max_pos << " "<< p->get_data()->get_coordinates().support.size()<<" )"<<std::endl;
points.push_back(p->get_data());
get_breakpoints(p->left, points);
}
@@ -184,10 +207,21 @@ void IntervallTree::get_breakpoints(TNode *p, std::vector<Breakpoint *> & points
void IntervallTree::inorder(TNode * p) {
if (p != NULL) {
inorder(p->left);
- std::cout << p->get_data()->to_string()<<endl;
+ std::cout << p->get_data()->to_string() << endl;
inorder(p->right);
}
}
+void IntervallTree::print(TNode *p) {
+ if (p != NULL) {
+ print(p->left);
+ std::string msg = p->get_data()->to_string();
+ if (msg.size() > 3) {
+ std::cout << msg << endl;
+ }
+ //std::cout << "( " << p->get_data()->get_coordinates().start.min_pos << "-" << p->get_data()->get_coordinates().stop.max_pos << " "<< p->get_data()->get_coordinates().support.size()<<" )"<<std::endl;
+ print(p->right);
+ }
+}
// PostOrder Printing
void IntervallTree::postorder(TNode * p) {
@@ -260,6 +294,6 @@ void IntervallTree::collapse_intervalls(TNode *&p) {
this->insert(points[i], new_root);
}
}
- this->makeempty(p);
+ this->clear(p);
p = new_root;
}
diff --git a/src/tree/IntervallTree.h b/src/tree/IntervallTree.h
index a33fd16..7bb0dc0 100644
--- a/src/tree/IntervallTree.h
+++ b/src/tree/IntervallTree.h
@@ -11,13 +11,18 @@
#include <vector>
#include "TNode.h"
-class IntervallTree {
+#include "IntervallContainer.h"
+
+
+
+class IntervallTree:public IntervallContainer {
private:
int max(int, int);
TNode * srl(TNode *&);
TNode * drl(TNode *&);
TNode * srr(TNode *&);
TNode * drr(TNode *&);
+ void careful_screening(Breakpoint *& new_break, TNode *p);
public:
void insert(Breakpoint * point, TNode *&);
void del(Breakpoint * point, TNode *&);
@@ -25,7 +30,7 @@ public:
void find(Breakpoint * point, TNode *&);
TNode * findmin(TNode*);
TNode * findmax(TNode*);
- void makeempty(TNode *&);
+ void clear(TNode *&);
void copy(TNode * &, TNode *&);
TNode * nodecopy(TNode *&);
void preorder(TNode*);
@@ -35,6 +40,7 @@ public:
void get_breakpoints(TNode *p,std::vector<Breakpoint *> & points);
int nonodes(TNode*);
void collapse_intervalls(TNode *&p);
+ void print(TNode *p);
};
#endif /* TREE_INTERVALLTREE_H_ */
diff --git a/src/tree/TNode.h b/src/tree/TNode.h
index 7183d25..9d7c697 100644
--- a/src/tree/TNode.h
+++ b/src/tree/TNode.h
@@ -54,15 +54,6 @@ public:
void set_height(int val) {
this->height = val;
}
- /*int overlap(TNode * tmp) {
- int score = 0; //(tmp->get_type() - type); // 0 if both points are on the same chr!
- if (abs(tmp->get_data()->get_coordinates().start - data->get_coordinates().start) < MAX_DIST
- && abs(tmp->get_data()->get_coordinates().stop - data->get_coordinates().stop) < MAX_DIST) {
- return score;
- }
- //as abstraction lets try the start coordinate only!
- return abs(tmp->get_data()->get_coordinates().start - data->get_coordinates().start);
- }*/
};
#endif /* TREE_TNODE_H_ */
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/sniffles.git
More information about the debian-med-commit
mailing list