[med-svn] [sniffles] 01/04: New upstream version 1.0.7+ds
Afif Elghraoui
afif at moszumanska.debian.org
Thu Nov 23 06:38:09 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository sniffles.
commit 6b4aed994aa5f6ac636d987f6552a2ccc1c0c011
Author: Afif Elghraoui <afif at debian.org>
Date: Thu Nov 23 01:30:13 2017 -0500
New upstream version 1.0.7+ds
---
CMakeLists.txt | 4 +-
README.md | 21 +-
src/Alignment.cpp | 370 ++++++++++------------------
src/Alignment.h | 8 +-
src/CMakeLists.txt | 4 +
src/Genotyper/Genotyper.cpp | 171 ++++++++-----
src/Genotyper/Genotyper.h | 16 +-
src/Paramer.h | 11 +-
src/Parser.h | 3 +-
src/Sniffles.cpp | 105 +++++---
src/Version.h | 6 +-
src/cluster/Cluster_SVs.cpp | 17 +-
src/force_calling/Force_calling.cpp | 189 +++++++++++++++
src/force_calling/Force_calling.h | 32 +++
src/force_calling/VCF_parser.cpp | 464 ++++++++++++++++++++++++++++++++++++
src/force_calling/VCF_parser.h | 47 ++++
src/print/BedpePrinter.cpp | 56 ++++-
src/print/BedpePrinter.h | 1 +
src/print/IPrinter.cpp | 102 +++++---
src/print/IPrinter.h | 19 +-
src/print/VCFPrinter.cpp | 261 +++++++++++---------
src/print/VCFPrinter.h | 1 +
src/sub/Breakpoint.cpp | 251 ++++++++++---------
src/sub/Breakpoint.h | 33 ++-
src/sub/Detect_Breakpoints.cpp | 246 +++++++++----------
src/sub/Detect_Breakpoints.h | 4 +-
src/tree/Breakpoint_Tree.cpp | 42 ++--
src/tree/Breakpoint_Tree.h | 5 +-
src/tree/IntervallTree.cpp | 65 +++++
src/tree/IntervallTree.h | 4 +
30 files changed, 1762 insertions(+), 796 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b89228c..9a7bb78 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,9 +5,9 @@ project(Sniffles)
set( SNIF_VERSION_MINOR 0 )
IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Building in debug mode!")
- set( SNIF_VERSION_BUILD 6-debug )
+ set( SNIF_VERSION_BUILD 7-debug )
else()
- set( SNIF_VERSION_BUILD 6 )
+ set( SNIF_VERSION_BUILD 7 )
ENDIF()
diff --git a/README.md b/README.md
index 6c4d975..6c666cd 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
# Sniffles
-Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter) or NGM-LR with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck at gmail.com
+Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs (10bp+) using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter) or NGM-LR with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck at gmail.com
Please see our github wiki for more information (https://github.com/fritzsedlazeck/Sniffles/wiki)
@@ -27,3 +27,22 @@ Biological Data Science, Cold Spring Harbor Laboratory, Cold Spring Harbor, NY,
[NextGenMap-LR: Highly accurate read mapping of third generation sequencing reads for improved structural variation analysis](http://www.cibiv.at/~philipp_/files/gi2016_poster_phr.pdf)
Genome Informatics 2016, Wellcome Genome Campus Conference Centre, Hinxton, Cambridge, UK, 19.09.-2.09.2016
+**************************************
+## Datasets used in the mansucript:
+We provide the NGMLR aligned reads and the Sniffles calls for the data sets used:
+
+Arabidopsis trio:
++ [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Arabidopsis_trio](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Arabidopsis_trio) .
+
+Genome in the Bottle trio:
++ Mappings: [ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_MtSinai_NIST/Baylor_NGMLR_bam_GRCh37/](ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_MtSinai_NIST/Baylor_NGMLR_bam_GRCh37/) .
+
++ SV calls: [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/GiaB/](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/GiaB/)
+
+NA12878:
++ [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/NA12878/](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/NA12878/) .
+
+SKBR3:
++ [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Skbr3/](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Skbr3/) .
+
+
diff --git a/src/Alignment.cpp b/src/Alignment.cpp
index 6fcb409..80f23d5 100644
--- a/src/Alignment.cpp
+++ b/src/Alignment.cpp
@@ -15,22 +15,6 @@ void Alignment::initAlignment() {
}
void Alignment::setAlignment(BamAlignment * align) {
al = align;
- /*
- //comment from here:
- //todo: why does this influence the INV detection???
- alignment.first.clear();
- alignment.second.clear();
- is_computed = false;
-
- orig_length = al->QueryBases.size();
-
- for (size_t i = 0; i < al->QueryBases.size(); i++) {
- alignment.first += toupper(al->QueryBases[i]);
- alignment.second += 'X';
- }
-
-
- stop = this->getPosition() + this->getRefLength();*/
}
void update_aln(std::string & alignment, int & i, int pos_to_modify) {
@@ -47,7 +31,7 @@ void update_aln(std::string & alignment, int & i, int pos_to_modify) {
void add_event(int pos, list<differences_str>::iterator & i, list<differences_str> & events) {
//insert sorted into vector:
while (i != events.end() && pos > (*i).position) {
- i++;
+ ++i;
}
differences_str ev;
ev.position = pos;
@@ -63,46 +47,49 @@ void add_event(int pos, size_t & i, vector<differences_str> & events) {
differences_str ev;
ev.position = pos;
ev.type = 0; //mismatch
+ ev.readposition = -1;
events.insert(events.begin() + i, ev);
}
-//todo: check if list changes things
-vector<differences_str> Alignment::summarizeAlignment() {
- //clock_t comp_aln = clock();
+vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &dels) {
+// clock_t comp_aln = clock();
vector<differences_str> events;
int pos = this->getPosition();
differences_str ev;
bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
-
+ int read_pos = 0;
+ if (al->CigarData[0].Type == 'S') {
+ read_pos += al->CigarData[0].Length;
+ }
for (size_t i = 0; i < al->CigarData.size(); i++) {
- if (flag) {
- // cout<<al->CigarData[i].Type<<al->CigarData[i].Length<<std::endl;
- }
- if (al->CigarData[i].Type == 'D') {
+ if (al->CigarData[i].Type == 'M') {
+ pos += al->CigarData[i].Length;
+ read_pos += al->CigarData[i].Length;
+ } else if (al->CigarData[i].Type == 'D') {
ev.position = pos;
ev.type = al->CigarData[i].Length; //deletion
+ ev.readposition = read_pos;
events.push_back(ev);
pos += al->CigarData[i].Length;
} else if (al->CigarData[i].Type == 'I') {
ev.position = pos;
+ ev.readposition = read_pos;
ev.type = al->CigarData[i].Length * -1; //insertion
events.push_back(ev);
- } else if (al->CigarData[i].Type == 'M') {
- pos += al->CigarData[i].Length;
+ read_pos += al->CigarData[i].Length;
} else if (al->CigarData[i].Type == 'N') {
pos += al->CigarData[i].Length;
+ read_pos += al->CigarData[i].Length;
} else if (al->CigarData[i].Type == 'S' && al->CigarData[i].Length > Parameter::Instance()->huge_ins) { /// Used for reads ranging into an inser
string sa;
al->GetTag("SA", sa);
uint32_t sv;
if ((al->GetTag("SV", sv) && sa.empty()) && (!(sv & Ns_CLIPPED) && !(sv & FULLY_EXPLAINED))) { // TODO remove last )
- if (flag) {
- std::cout << "Chop: " << pos << " Rname: " << this->getName() << std::endl;
- }
- if (pos == this->getPosition()) {
- ev.position = pos; // - Parameter::Instance()->huge_ins;
+ ev.position = pos; // - Parameter::Instance()->huge_ins;
+ if (i == 0) {
+ ev.readposition = 0;
} else {
- ev.position = pos;
+ ev.readposition = read_pos;
}
ev.type = Parameter::Instance()->huge_ins * -1; //insertion: WE have to fix the length since we cannot estimate it!]
events.push_back(ev);
@@ -110,31 +97,32 @@ vector<differences_str> Alignment::summarizeAlignment() {
}
}
- if (flag) {
- std::cout << "FIRST:" << std::endl;
- for (size_t i = 0; i < events.size(); i++) {
- if (abs(events[i].type) > 200) {
- cout << events[i].position << " " << events[i].type << endl;
- }
- }
- cout << endl;
- }
+ /*if (flag) {
+ std::cout << "FIRST:" << std::endl;
+ for (size_t i = 0; i < events.size(); i++) {
+ if (abs(events[i].type) > 200) {
+ cout << events[i].position << " " << events[i].type << endl;
+ }
+ }
+ cout << endl;
+ }*/
- //set ref length requ. later on:
+//set ref length requ. later on:
this->ref_len = pos - getPosition(); //TODO compare to get_length!
-
- //cout<<" comp len: "<<this->ref_len<<" "<<pos<<" "<<this->getPosition()<<endl;
-// Parameter::Instance()->meassure_time(comp_aln, "\t\tCigar: ");
+ //Parameter::Instance()->meassure_time(comp_aln, "\t\tCigar: ");
string md = this->get_md();
pos = this->getPosition();
int corr = 0;
bool match = false;
- bool gap;
+ bool gap = false;
int ref_pos = 0;
size_t pos_events = 0;
int max_size = (this->getRefLength() * 0.9) + getPosition();
- //comp_aln = clock();
+// comp_aln = clock();
+ indel_str del;
+ del.sequence = "";
+ del.pos = -1;
for (size_t i = 0; i < md.size() && pos < max_size; i++) {
if (md[i] == '^') {
gap = true;
@@ -150,69 +138,29 @@ vector<differences_str> Alignment::summarizeAlignment() {
}
//store in sorted order:
add_event(pos, pos_events, events);
-
- pos++; //just the pos on ref!
+ pos++; //just the pos on ref!
+ } else if (Parameter::Instance()->print_seq) { //can only be a deletion:
+ if (del.pos == -1) {
+ del.pos = pos;
+ } else { //avoid first string position;
+ del.sequence += md[i];
+ }
}
match = false;
} else if (!match) {
match = true;
pos += atoi(&md[i]);
gap = false;
- }
- }
-
- if (flag) {
- std::cout << "SECOND:" << std::endl;
- for (size_t i = 0; i < events.size(); i++) {
- if (abs(events[i].type) > 200) {
- cout << events[i].position << " " << events[i].type << endl;
+ if (Parameter::Instance()->print_seq && del.sequence.size() > Parameter::Instance()->min_length) {
+ dels.push_back(del);
}
+ del.sequence = "";
+ del.pos = -1;
}
- cout << endl;
}
- //Parameter::Instance()->meassure_time(comp_aln, "\t\tMD string: ");
-// comp_aln = clock();
- size_t i = 0;
- //to erase stretches of consecutive mismatches == N in the ref
- /*int break_point = 0;
- while (i < events.size()) {
- if (events[i].position > max_size) {
- while (i < events.size()) {
- if (abs(events[events.size() - 1].type) == Parameter::Instance()->huge_ins) {
- i++;
- } else {
- events.erase(events.begin() + i, events.begin() + i + 1);
- }
- }
- break;
- }
- if (events[i].type == 0) {
- size_t j = 1;
- while (i + j < events.size() && ((events[i + j].position - events[i + (j - 1)].position) == 1 && events[i + j].type == 0)) {
- j++;
- }
- if (j > 10) { //if stetch is at least 3 consecutive mismatches
- events.erase(events.begin() + i, events.begin() + i + j);
- } else {
- i += j;
- }
- } else {
- i++;
- }
- }*/
-// Parameter::Instance()->meassure_time(comp_aln, "\t\terrase N: ");
- if (flag) {
- cout << "LAST:" << endl;
- for (size_t i = 0; i < events.size(); i++) {
- if (abs(events[i].type) > 200) {
- cout << events[i].position << " " << events[i].type << endl;
- }
- }
- cout << endl;
+// Parameter::Instance()->meassure_time(comp_aln, "\t\tMD string: ");
- cout << "total: " << events.size() << endl;
- }
return events;
}
void Alignment::computeAlignment() {
@@ -258,7 +206,7 @@ void Alignment::computeAlignment() {
}
Parameter::Instance()->meassure_time(comp_aln, "\t\tCIGAR opterations ");
comp_aln = clock();
- //Apply MD string:
+//Apply MD string:
string md = this->get_md();
pos = 0;
int corr = 0;
@@ -461,6 +409,7 @@ void Alignment::get_coords(aln_str tmp, int & start, int &stop) {
if (!tmp.strand) {
index = tmp.cigar.size() - 1;
}
+// cout<<"Cigar: "<<this->getName()<<" "<<tmp.cigar.size()<<" "<<index<<endl;
if (tmp.cigar[index].Type == 'S' || tmp.cigar[index].Type == 'H') {
start = tmp.cigar[index].Length;
} else {
@@ -470,7 +419,7 @@ void Alignment::get_coords(aln_str tmp, int & start, int &stop) {
}
void Alignment::check_entries(vector<aln_str> &entries) {
- bool flag =(strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
+ bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
if (flag) {
std::cout << "Nested? " << std::endl;
@@ -505,8 +454,8 @@ void Alignment::check_entries(vector<aln_str> &entries) {
if (abs(ref_size - read_size) > Parameter::Instance()->min_length) {
valid++;
}
- if(flag){
- cout<<"Read: "<<read_size << " Ref: "<<ref_size<< " "<<this->getName()<<std::endl;
+ if (flag) {
+ cout << "Read: " << read_size << " Ref: " << ref_size << " " << this->getName() << std::endl;
}
if (chr != entries[i].RefID) {
@@ -522,7 +471,7 @@ void Alignment::check_entries(vector<aln_str> &entries) {
if (flag) {
std::cout << "summary: " << strands << " " << valid << " " << std::endl;
}
- if (strands < 3 || valid < 2 ) { //check!
+ if (strands < 3 || valid < 2) { //check!
if (flag) {
std::cout << "Return" << std::endl;
}
@@ -614,83 +563,6 @@ void Alignment::check_entries(vector<aln_str> &entries) {
std::cout << std::endl;
}
}
-/*void Alignment::check_entries(vector<aln_str> &entries) { //something is not working!
- bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
-
- //Given that we start outside of the INV:
- if (flag) {
- cout << "Check:" << endl;
- for (size_t i = 0; i < entries.size(); i++) {
- cout << "ENT: " << entries[i].pos << " " << entries[i].pos + entries[i].length << " Read: " << entries[i].read_pos_start << " " << entries[i].read_pos_stop << " ";
- if (entries[i].strand) {
- cout << "+" << endl;
- } else {
- cout << "-" << endl;
- }
- }
- check_entries2(entries);
- }
- bool left_of = true;
- vector<aln_str> new_entries = entries;
- for (size_t i = 1; i < entries.size(); i++) {
- if (entries[i].strand != entries[i - 1].strand && entries[i].RefID == entries[i - 1].RefID) {
- int ref_dist = 0;
- int read_dist = 0;
- //all comp with abs:
- //maybe try without abs!:
- if (entries[0].strand) {
- ref_dist = abs((entries[i - 1].pos + entries[i - 1].length) - entries[i].pos);
- read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
- } else {
- ref_dist = abs((entries[i - 1].pos) - (entries[i].pos + entries[i].length));
- read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
- }
- if (flag) {
- cout << "ref dist: " << ref_dist << " Read: " << read_dist << " DIFF: " << abs(ref_dist - read_dist) << endl;
- }
-
- if (abs(ref_dist - read_dist) > Parameter::Instance()->min_length) {
- //ref_dist > 30 &&
- if (read_dist < Parameter::Instance()->min_length && ref_dist / (read_dist + 1) > 3) { //+1 because otherwise there could be a division by 0!
- if (flag) {
- cout << "DEL? " << ref_dist << " " << read_dist << endl;
- }
- aln_str tmp;
- tmp.RefID = entries[i].RefID;
- if (left_of) {
- tmp.strand = entries[i - 1].strand;
- if (tmp.strand) {
- tmp.pos = entries[i].pos - 1;
- } else {
- tmp.pos = entries[i].pos + entries[i].length - 1;
- }
- left_of = false;
- } else {
- tmp.strand = entries[i].strand;
- if (tmp.strand) {
- tmp.pos = entries[i - 1].pos + entries[i - 1].length - 1;
- } else {
- tmp.pos = entries[i - 1].pos - 1;
- }
- left_of = true;
- }
- tmp.length = 1;
- tmp.read_pos_start = entries[i].read_pos_start - 1;
- tmp.read_pos_stop = tmp.read_pos_start + 1;
- tmp.mq = 60;
- sort_insert(tmp, new_entries);
- } else if (read_dist > Parameter::Instance()->min_length && ref_dist < 10) {
- //cout << "INS? " << this->getName() << endl;
- }
- }
- } else {
- left_of = true; //??
- }
- }
- if (entries.size() < new_entries.size()) {
- entries = new_entries;
- }
- }*/
void Alignment::sort_insert_ref(aln_str tmp, vector<aln_str> &entries) {
@@ -742,6 +614,9 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
bool flag = strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0;
get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop);
+ if (flag) {
+ cout << "\t read " << tmp.read_pos_start << " stop " << tmp.read_pos_stop << endl;
+ }
entries.push_back(tmp);
if (flag) {
std::cout << "Main Read: read start:" << tmp.read_pos_start << " REF: " << tmp.pos << " RefID: " << tmp.RefID << std::endl;
@@ -775,12 +650,15 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
if (sa[i] == ',') {
count++;
}
- if (sa[i] == ';') {
+ if (sa[i] == ';' && !cigar.empty()) {
//TODO: maybe check how often this happens per read!
if ((tmp.mq > Parameter::Instance()->min_mq || sv & FULLY_EXPLAINED) && entries.size() <= Parameter::Instance()->max_splits) {
//TODO: check this!
tmp.cigar = translate_cigar(cigar); //translates the cigar (string) to a type vector
get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop); //get the coords on the read.
+ if (flag) {
+ cout << "\t read " << tmp.read_pos_start << " stop " << tmp.read_pos_stop << endl;
+ }
tmp.length = (long) get_length(tmp.cigar); //gives the length on the reference.
tmp.RefID = get_id(ref, chr); //translates back the chr to the id of the chr;
//TODO: should we do something about the MD string?
@@ -848,11 +726,10 @@ bool Alignment::get_is_save() {
double score = get_scrore_ratio(); //TODO should I use this again for bwa?
- return !((al->GetTag("XA", sa) && !sa.empty()) || (al->GetTag("XT", sa) && !sa.empty())) && (score == -1 || score > Parameter::Instance()->score_treshold); //|| //TODO: 7.5
+ return !((al->GetTag("XA", sa) && !sa.empty()) || (al->GetTag("XT", sa) && !sa.empty())) && (score == -1 || score > Parameter::Instance()->score_treshold); //|| //TODO: 7.5
}
std::vector<CigarOp> Alignment::translate_cigar(std::string cigar) {
-
std::vector<CigarOp> new_cigar;
size_t i = 0;
@@ -1086,7 +963,8 @@ vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & av
//cout<<alignment.first<<endl;
//cout<<alignment.second<<endl;
vector<int> mis_per_window;
- vector<differences_str> event_aln = summarizeAlignment();
+ std::vector<indel_str> dels;
+ vector<differences_str> event_aln = summarizeAlignment(dels);
if (event_aln.empty()) {
dist = 0;
return mis_per_window;
@@ -1131,13 +1009,11 @@ vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & av
vector<str_event> Alignment::get_events_Aln() {
- bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
+ bool flag = (strcmp(this->getName().c_str(),Parameter::Instance()->read_name.c_str()) == 0);
//clock_t comp_aln = clock();
- vector<differences_str> event_aln = summarizeAlignment();
- if (flag) {
- std::cout << "test size: " << event_aln.size() << std::endl;
- }
+ std::vector<indel_str> dels;
+ vector<differences_str> event_aln = summarizeAlignment(dels);
//double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: ");
vector<str_event> events;
@@ -1165,9 +1041,6 @@ vector<str_event> Alignment::get_events_Aln() {
profile.push_back(tmp);
} else if (abs(event_aln[i].type) > Parameter::Instance()->min_length) { //for single events like NGM-LR would produce them.
tmp.position = event_aln[i].position;
- if (flag) {
- std::cout << "HIT" << std::endl;
- }
profile.push_back(tmp);
}
}
@@ -1176,9 +1049,7 @@ vector<str_event> Alignment::get_events_Aln() {
int stop = 0;
size_t start = 0;
for (size_t i = 0; i < profile.size() && stop < event_aln.size(); i++) {
- if (flag) {
- std::cout << "Prof: " << profile[i].position << " " << event_aln[stop].position << std::endl;
- }
+
if (profile[i].position >= event_aln[stop].position) {
//find the postion:
size_t pos = 0;
@@ -1190,9 +1061,6 @@ vector<str_event> Alignment::get_events_Aln() {
int prev = event_aln[pos].position;
start = pos;
int prev_type = 1;
- if (flag) {
- std::cout << "check start" << std::endl;
- }
//todo it is actually pos + type and not *type
while (start > 0 && (prev - event_aln[start].position) < (Parameter::Instance()->max_dist_alns)) { //13 //} * abs(event_aln[start].type) + 1)) { //TODO I dont like 13!??
prev = event_aln[start].position;
@@ -1210,9 +1078,6 @@ vector<str_event> Alignment::get_events_Aln() {
prev = event_aln[pos].position;
stop = pos;
prev_type = 1;
- if (flag) {
- std::cout << "check stop" << std::endl;
- }
while (stop < event_aln.size() && (event_aln[stop].position - prev) < (Parameter::Instance()->max_dist_alns)) { // * abs(event_aln[stop].type) + 1)) {
prev = event_aln[stop].position;
@@ -1239,12 +1104,7 @@ vector<str_event> Alignment::get_events_Aln() {
double insert = 0;
double del = 0;
double mismatch = 0;
-// if (flag) {
-// cout << start << " " << stop << endl;
-// }
- if (flag) {
- std::cout << "check total len: " << start << " " << stop << std::endl;
- }
+
for (size_t k = start; k <= stop; k++) {
if (event_aln[k].type == 0) {
mismatch++;
@@ -1271,60 +1131,78 @@ vector<str_event> Alignment::get_events_Aln() {
}
tmp.length = (tmp.length - event_aln[start].position);
- if (flag) {
- std::cout << "decide" << std::endl;
- }
tmp.type = 0;
if (insert_max > Parameter::Instance()->min_length && insert > (del + del)) { //we have an insertion! //todo check || vs. &&
- if (flag) {
- cout << "store INS" << endl;
- }
- tmp.length = insert_max; //TODO not sure!
- tmp.pos = insert_max_pos;
- tmp.type |= INS;
- tmp.is_noise = false;
if (is_N_region && insert_max * Parameter::Instance()->avg_ins < Parameter::Instance()->min_length) {
tmp.type = 0;
+ } else {
+ tmp.length = insert_max; //TODO not sure!
+ while (start < stop && event_aln[start].readposition == -1) {
+ if (flag) {
+ cout << event_aln[start].readposition << " " << event_aln[start].type << endl;
+ }
+ start++;
+ }
+ if (flag) {
+ cout << event_aln[start].readposition << " " << event_aln[start].type << endl;
+ }
+ tmp.read_pos = event_aln[start].readposition;
+ if (Parameter::Instance()->print_seq) {
+ //if (tmp.read_pos + tmp.length > this->getAlignment()->QueryBases.size() || tmp.read_pos<0) {
+ // cerr << "BUG! ALN event INS: " << this->getName() << " " << tmp.read_pos << " " << tmp.length << " " << this->getAlignment()->QueryBases.size() << endl;
+ // }
+ if(flag){
+ std::cout<<"Seq+:"<<this->getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length)<<std::endl;
+
+ }
+ tmp.sequence = this->getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length);
+ } else {
+ tmp.sequence = "NA";
+ }
+ tmp.pos = insert_max_pos;
+ tmp.type |= INS;
+ tmp.is_noise = false;
}
} else if (del_max > Parameter::Instance()->min_length && (insert + insert) < del) { //deletion
- if (flag) {
- cout << "store DEL " << this->getName() << endl;
- }
- tmp.length = del_max;
- tmp.type |= DEL;
- tmp.is_noise = false;
if (is_N_region && del_max * Parameter::Instance()->avg_del < Parameter::Instance()->min_length) {
tmp.type = 0;
+ } else {
+ if (Parameter::Instance()->print_seq) {
+ for (size_t del_pos = 0; del_pos < dels.size(); del_pos++) {
+ if (abs(dels[del_pos].pos - tmp.pos) < 10) {
+ tmp.sequence = dels[del_pos].sequence;
+ }
+ }
+ } else {
+ tmp.sequence = "NA";
+ }
+ tmp.length = del_max;
+ tmp.type |= DEL;
+ tmp.is_noise = false;
}
} else if ((mismatch + del + insert) / 2 > Parameter::Instance()->min_length) { //TODO
- if (flag) {
- cout << "store Noise" << endl;
- }
- noise_events++;
- tmp.type |= DEL;
- tmp.type |= INV;
- tmp.is_noise = true;
- if (is_N_region || ((del_max> Parameter::Instance()->min_length && insert_max > Parameter::Instance()->min_length) && (del_max/insert_max)<Parameter::Instance()->min_length)) {
+ if (is_N_region || ((del_max > Parameter::Instance()->min_length && insert_max > Parameter::Instance()->min_length) && (del_max / insert_max) < Parameter::Instance()->min_length)) {
tmp.type = 0;
+ } else {
+ noise_events++;
+ tmp.type |= DEL;
+ tmp.type |= INV;
+ tmp.sequence = "NA";
+ tmp.is_noise = true;
}
- } else {
- // std::cout<<"ERROR we did not set the type!"<<std::endl;
- tmp.type = 0;
}
if (flag) {
- cout << "Read: " << " " << (double) this->getRefLength() << " events: " << event_aln.size() << " " << this->al->Name << std::endl;
- cout << "INS max " << insert_max << " del_max " << del_max << std::endl;
- cout << "INS:" << insert << " DEL: " << del << " MIS: " << mismatch << endl;
- cout << event_aln[start].position << " " << event_aln[stop].position << endl;
- cout << "store: " << tmp.pos << " " << tmp.pos + abs(tmp.length) << " " << tmp.length << endl;
- cout << endl;
- }
+ cout << "Read: " << " " << (double) this->getRefLength() << " events: " << event_aln.size() << " " << this->al->Name << std::endl;
+ cout << "INS max " << insert_max << " del_max " << del_max << std::endl;
+ cout << "INS:" << insert << " DEL: " << del << " MIS: " << mismatch << endl;
+ cout << event_aln[start].position << " " << event_aln[stop].position << endl;
+ cout << "store: " << tmp.pos << " " << tmp.pos + abs(tmp.length) << " " << tmp.length << endl;
+ cout << tmp.sequence<<endl;
+ cout << endl;
+ }
if (tmp.type != 0) {
- if (flag) {
- cout << "STORE" << endl;
- }
events.push_back(tmp);
}
}
diff --git a/src/Alignment.h b/src/Alignment.h
index 38fa46d..35e7aa4 100644
--- a/src/Alignment.h
+++ b/src/Alignment.h
@@ -38,8 +38,13 @@ typedef unsigned int uint;
struct differences_str{
int position;
+ int readposition;
short type;
};
+struct indel_str{
+ int pos;
+ std::string sequence;
+};
struct str_event{
short length;
@@ -47,6 +52,7 @@ struct str_event{
int read_pos;
char type;
bool is_noise;
+ std::string sequence; //just for indels;
};
struct aln_str{
int RefID;
@@ -74,7 +80,7 @@ private:
std::vector<CigarOp> translate_cigar(std::string cigar);
size_t get_length(std::vector<CigarOp> CigarData);
int get_id(RefVector ref, std::string chr);
- vector<differences_str> summarizeAlignment();
+ vector<differences_str> summarizeAlignment(std::vector<indel_str> &dels);
void sort_insert(aln_str tmp, vector<aln_str> &entries);
void sort_insert_ref(aln_str tmp, vector<aln_str> &entries);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2ff0828..3b62441 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -27,6 +27,8 @@ add_executable(sniffles
print/NGMPrinter.cpp
plane-sweep/PlaneSweep_slim.cpp
cluster/Cluster_SVs.cpp
+ force_calling/Force_calling.cpp
+ force_calling/VCF_parser.cpp
)
#target_link_libraries(ngm-core pthread)
@@ -54,6 +56,8 @@ add_executable(sniffles-debug
print/NGMPrinter.cpp
plane-sweep/PlaneSweep_slim.cpp
cluster/Cluster_SVs.cpp
+ force_calling/Force_calling.cpp
+ force_calling/VCF_parser.cpp
)
SET_TARGET_PROPERTIES(sniffles-debug PROPERTIES COMPILE_FLAGS "-g3 -O0")
diff --git a/src/Genotyper/Genotyper.cpp b/src/Genotyper/Genotyper.cpp
index 9be2a48..5a125b1 100644
--- a/src/Genotyper/Genotyper.cpp
+++ b/src/Genotyper/Genotyper.cpp
@@ -7,56 +7,58 @@
#include "Genotyper.h"
-std::string Genotyper::mod_breakpoint_vcf(char *buffer, int ref) {
- //find last of\t
- //parse #reads supporting
- //print #ref
- string entry;
- string tmp = string(buffer);
- int pos = 0;
-
- pos = tmp.find_last_of("GT");
- //tab
- entry = tmp.substr(0, pos - 2);
-
- tmp = tmp.substr(pos + 1); // the right part is only needed:
- pos = tmp.find_last_of(':');
- int support = atoi(tmp.substr(pos + 1).c_str());
+std::string Genotyper::assess_genotype(int ref, int support) {
double allele = (double) support / (double) (support + ref);
if (allele < Parameter::Instance()->min_allelel_frequency) {
return "";
}
+
std::stringstream ss;
ss << ";AF=";
ss << allele;
ss << "\tGT:DR:DV\t";
-
- if (allele > 0.8) {
+ if (allele > Parameter::Instance()->homfreq) {
ss << "1/1:";
- } else if (allele > 0.3) {
+ } else if (allele > Parameter::Instance()->hetfreq) {
ss << "0/1:";
} else {
ss << "0/0:";
}
-
ss << ref;
ss << ":";
ss << support;
+ return ss.str();
+}
- entry += ss.str();
+std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
+ //find last of\t
+ //parse #reads supporting
+ //print #ref
+ string entry;
+
+ int pos = 0;
+
+ pos = buffer.find_last_of("GT");
+ //tab
+ entry = buffer.substr(0, pos - 2);
+
+ buffer = buffer.substr(pos + 1); // the right part is only needed:
+ pos = buffer.find_last_of(':');
+ int support = atoi(buffer.substr(pos + 1).c_str());
+ entry += assess_genotype(ref, support);
return entry;
}
-std::string Genotyper::mod_breakpoint_bedpe(char *buffer, int ref) {
+std::string Genotyper::mod_breakpoint_bedpe( string buffer, int ref) {
- std::string tmp = string(buffer);
+ std::string tmp = buffer;
std::string entry = tmp;
entry += '\t';
//int ref = max(tree.get_ref(node,var.chr,var.pos),tree.get_ref(node,var.chr2,var.pos2));
- int pos = tmp.find_last_of('\t'); //TODO!!
+ int pos = tmp.find_last_of('\t'); //TODO!!
int support = atoi(tmp.substr(pos + 1).c_str());
double allele = (double) support / (double) (support + ref);
@@ -91,7 +93,7 @@ void Genotyper::parse_pos(char * buffer, int & pos, std::string & chr) {
}
}
-variant_str Genotyper::get_breakpoint_vcf(char *buffer) {
+variant_str Genotyper::get_breakpoint_vcf(string buffer) {
//TODO extend for TRA!
size_t i = 0;
int count = 0;
@@ -117,8 +119,7 @@ variant_str Genotyper::get_breakpoint_vcf(char *buffer) {
}
}
if (count > 6 && strncmp(";END=", &buffer[i], 5) == 0) {
- i += 5;
- tmp.pos2 = atoi(&buffer[i]); //stores right most breakpoint
+ tmp.pos2 = atoi(&buffer[i + 5]); //stores right most breakpoint
break;
}
@@ -129,7 +130,7 @@ variant_str Genotyper::get_breakpoint_vcf(char *buffer) {
}
return tmp;
}
-variant_str Genotyper::get_breakpoint_bedpe(char *buffer) {
+variant_str Genotyper::get_breakpoint_bedpe(string buffer) {
size_t i = 0;
int count = 0;
std::string chr;
@@ -171,16 +172,17 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
myfile.open(Parameter::Instance()->output_bedpe.c_str(), std::ifstream::in);
}
- FILE*file = fopen(Parameter::Instance()->tmp_file.c_str(), "w");
+ FILE*file = fopen(Parameter::Instance()->tmp_file.c_str(), "w"); //
if (!myfile.good()) {
std::cout << "SVParse: could not open file: " << std::endl;
exit(0);
}
- size_t buffer_size = 25000;
- char* buffer = new char[buffer_size];
- myfile.getline(buffer, buffer_size);
+
+ string buffer;
+ getline(myfile,buffer);
//parse SVs breakpoints in file
+
while (!myfile.eof()) { // TODO:if first -> we need to define AF!
if (buffer[0] != '#') {
@@ -192,6 +194,7 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
} else {
tmp = get_breakpoint_bedpe(buffer);
}
+
int ref = max(tree.get_ref(node, tmp.chr, tmp.pos), tree.get_ref(node, tmp.chr2, tmp.pos2));
if (is_vcf) {
to_print = mod_breakpoint_vcf(buffer, ref);
@@ -203,11 +206,11 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
fprintf(file, "%c", '\n');
}
} else {
- fprintf(file, "%s", buffer);
+ fprintf(file, "%s", buffer.c_str());
fprintf(file, "%c", '\n');
}
- myfile.getline(buffer, buffer_size);
+ getline(myfile,buffer);
}
myfile.close();
fclose(file);
@@ -219,8 +222,8 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
system(move.c_str());
}
-void Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_node *& node) {
-
+std::vector<std::string> Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_node * &node) {
+ std::vector<std::string> ref_dict;
std::ifstream myfile;
bool is_vcf = !Parameter::Instance()->output_vcf.empty();
@@ -234,11 +237,17 @@ void Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_node *& node) {
std::cout << "SVParse: could not open file: " << std::endl;
exit(0);
}
- size_t buffer_size = 25000;
- char* buffer = new char[buffer_size];
- myfile.getline(buffer, buffer_size);
+ //size_t buffer_size = 250000000;
+ string buffer;
+
+ getline(myfile,buffer);
+ //char* buffer = new char[buffer_size];
+ //myfile.getline(buffer, buffer_size);
//parse SVs breakpoints in file
+ int num_sv=0;
+ int prev_pos1 = 0;
+ int prev_pos2 = 0;
while (!myfile.eof()) {
//cout << buffer << endl;
if (buffer[0] != '#') {
@@ -250,31 +259,45 @@ void Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_node *& node) {
} else {
tmp = get_breakpoint_bedpe(buffer);
}
- tree.insert(node, tmp.chr, tmp.pos);
- tree.insert(node, tmp.chr2, tmp.pos2);
+ // std::cout << "SV: " << tmp.pos << " " << tmp.pos2 << std::endl;
+ tree.insert(node, tmp.chr, tmp.pos, true); //true: start;
+ tree.insert(node, tmp.chr2, tmp.pos2, false); //false: stop;//
+ num_sv++;
+ if(num_sv%1000==0){
+ cout<<"\t\tRead in SV: "<<num_sv<<endl;
+ }
+ } else if (buffer[2] == 'c' && buffer[3] == 'o') { //##contig=<ID=chr1,length=699930>
+ //fill the refdict.
+ std::string id = "";
+ for (size_t i = 13; i < buffer.size() && buffer[i] != ','; i++) {
+ id += buffer[i];
+ }
+ ref_dict.push_back(id);
}
- myfile.getline(buffer, buffer_size);
+ getline(myfile,buffer);
+ //myfile.getline(buffer, buffer_size);
}
myfile.close();
+ return ref_dict;
//tree.inorder(node);
}
-void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node) {
- std::string output = Parameter::Instance()->tmp_file.c_str();
- output += "ref_allele";
+void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node, std::vector<std::string> ref_dict) {
- FILE * ref_allel_reads = fopen(output.c_str(), "r");
+ FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "r");
if (ref_allel_reads == NULL) {
- std::cerr << "CovParse: could not open file: " << output.c_str() << std::endl;
+ std::cerr << "CovParse: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl;
}
-
//check if we want to compute the full coverage!
str_read tmp;
size_t nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+ int prev_id = -1;
while (nbytes != 0) {
- if (!tmp.SV_support) {
- //if reads should be included-> Planesweep for +- breakpoint (Maybe hit -> extra function for that region around the breakpoint!
- tree.overalps(tmp.start, tmp.start + tmp.length, tmp.chr, node, tmp.SV_support);
+ // std::cout<<"Read: "<<" " <<tmp.chr_id<<":"<<ref_dict[tmp.chr_id]<<" " <<tmp.start<<" "<<tmp.length<<std::endl;
+ if (prev_id != tmp.chr_id) {
+ cout << "\t\tScanning CHR " << ref_dict[tmp.chr_id] << endl;
+ prev_id = tmp.chr_id;
}
+ tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node);
nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
}
fclose(ref_allel_reads);
@@ -283,13 +306,49 @@ void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node) {
void Genotyper::update_SVs() {
//parse SVs not just breakpoints and update with the coverage info
- read_SVs(this->tree, this->node);
- compute_cov(this->tree, this->node);
+ cout << "\tConstruct tree" << endl;
+ std::vector<std::string> ref_dict = read_SVs(this->tree, this->node);
+ cout << "\tUpdate reference alleles" << endl;
+ compute_cov(this->tree, this->node, ref_dict);
+ cout << "\tWriting SV calls" << endl;
update_file(this->tree, this->node);
- cout << "Cleaning tmp files" << endl;
+ cout << "\tCleaning tmp files" << endl;
string del = "rm ";
- del += Parameter::Instance()->tmp_file;
- del += "ref_allele";
- system(del.c_str());
+ del += Parameter::Instance()->tmp_genotyp;
+ //system(del.c_str());
+}
+
+void Genotyper::update_SVs(std::vector<Breakpoint *> & svs, long ref_space) { //refspace for the ref reads!!
+
+ FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "r");
+ if (ref_allel_reads == NULL) {
+ std::cerr << "Genotype Parser: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl;
+ }
+ str_read tmp;
+ size_t nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+ int num_reads = 0;
+ while (nbytes != 0) {
+ for (size_t i = 0; i < svs.size(); i++) {
+ if (svs[i]->get_valid()) {
+ long start = tmp.start + ref_space;
+ long stop = start + (long) tmp.length;
+ //start - 100 orig!
+ if ((svs[i]->get_coordinates().start.min_pos - 100 > start && svs[i]->get_coordinates().start.min_pos + 100 < stop)) { //found
+ svs[i]->set_refcount(svs[i]->get_refcount() + 1);
+ }
+ //stop coordinate
+ if ((svs[i]->get_coordinates().stop.max_pos - 100 > start + 100 && svs[i]->get_coordinates().stop.max_pos + 100 < stop - 100)) { //found
+ svs[i]->set_refcount(svs[i]->get_refcount() + 1);
+ }
+ }
+ }
+ //if reads should be included-> Planesweep for +- breakpoint (Maybe hit -> extra function for that region around the breakpoint!
+ num_reads++;
+ if (num_reads % 1000 == 0) {
+ cout << "\tProcessed " << num_reads << endl;
+ }
+ nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+ }
+ fclose(ref_allel_reads);
}
diff --git a/src/Genotyper/Genotyper.h b/src/Genotyper/Genotyper.h
index f0aef9f..2eb7954 100644
--- a/src/Genotyper/Genotyper.h
+++ b/src/Genotyper/Genotyper.h
@@ -20,14 +20,16 @@ class Genotyper{
private:
Breakpoint_Tree tree;
breakpoint_node * node;
- void read_SVs(Breakpoint_Tree & tree,breakpoint_node *& node );
- void compute_cov(Breakpoint_Tree & tree,breakpoint_node *& node);
+ std::vector<std::string> read_SVs(Breakpoint_Tree & tree,breakpoint_node *& node );
+ void compute_cov(Breakpoint_Tree & tree,breakpoint_node *& node,std::vector<std::string> ref_dict);
void update_file(Breakpoint_Tree & tree,breakpoint_node *& node);
- variant_str get_breakpoint_vcf(char *buffer);
- variant_str get_breakpoint_bedpe(char *buffer);
- std::string mod_breakpoint_vcf(char *buffer, int ref);
- std::string mod_breakpoint_bedpe(char *buffer, int ref);
+ variant_str get_breakpoint_vcf(string buffer);
+ variant_str get_breakpoint_bedpe(string buffer);
+ std::string mod_breakpoint_vcf(string buffer, int ref);
+ std::string mod_breakpoint_bedpe(string buffer, int ref);
void parse_pos(char * buffer, int & pos, std::string & chr);
+
+
public:
Genotyper(){
node=NULL;
@@ -36,5 +38,7 @@ public:
}
void update_SVs();
+ void update_SVs(std::vector<Breakpoint *> & points,long ref_space);
+ std::string assess_genotype(int ref, int support);
};
#endif /* GENOTYPER_H_ */
diff --git a/src/Paramer.h b/src/Paramer.h
index f171eff..34cb23a 100644
--- a/src/Paramer.h
+++ b/src/Paramer.h
@@ -14,6 +14,7 @@
#include <stdlib.h>
#include <iostream>
#include <ctime>
+#include <map>
struct region_str {
std::string chr;
int start;
@@ -24,7 +25,7 @@ class Parameter {
private:
Parameter() {
window_thresh=10;//TODO check!
- version="1.0.6";
+ version="1.0.7";
huge_ins = 2000;//TODO check??
}
~Parameter() {
@@ -39,9 +40,13 @@ public:
std::string read_name;
std::string ignore_regions_bed;
std::string tmp_file;
+ std::string tmp_genotyp;
+ std::string tmp_phasing;
std::string version;
+ std::string input_vcf;
std::vector<std::string> bam_files;
+ std::map<std::string, bool> chr_names;
short min_mq;
short report_n_reads;
short corridor;
@@ -51,6 +56,8 @@ public:
double min_allelel_frequency;
double avg_del;
double avg_ins;
+ double homfreq;
+ double hetfreq;
//double min_num_mismatches;
int window_thresh;
@@ -65,6 +72,7 @@ public:
int huge_ins;
int max_dist_alns;
int min_segment_size;
+ int min_zmw;
// bool splitthreader_output;
bool debug;
@@ -72,6 +80,7 @@ public:
bool phase;
bool ignore_std;
bool reportBND;
+ bool print_seq;
void set_regions(std::string reg) {
size_t i = 0;
diff --git a/src/Parser.h b/src/Parser.h
index a07d55d..f6a9cb5 100644
--- a/src/Parser.h
+++ b/src/Parser.h
@@ -11,10 +11,9 @@
#include "Alignment.h"
struct str_read{
- string chr;
+ uint chr_id;
uint start;
ushort length;
- bool SV_support;
};
class Parser {
diff --git a/src/Sniffles.cpp b/src/Sniffles.cpp
index 7f7cd7f..624dc70 100644
--- a/src/Sniffles.cpp
+++ b/src/Sniffles.cpp
@@ -22,17 +22,17 @@
#include "Ignore_Regions.h"
#include "plane-sweep/PlaneSweep_slim.h"
#include "print/BedpePrinter.h"
+#include "force_calling/Force_calling.h"
//cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
-
//TODO:
// strand bias??
// I think you could make your performance on PacBio reads even better with a few modifications:
-//a. The "snake" read that you show in Supplementary Figure 2.5 which limits calling of small inverted duplications likely results from an improperly constructed SMRTbell (see attached) that has adapters missing on one end. The PacBio library prep reordered a few steps (years ago now) to prevent formation of these types of molecules. So, hopefully it is no longer an issue. Even with old data, you should be able to call small inverted duplications by counting support of ZMWs not jus [...]
-//b. In pbsv, I use a simply mononucleotide consistency check to determine whether to cluster insertions from different reads as supporting the "same" events. In addition to looking at the similarity of length and breakpoints, you could measure [min(Act)+min(Cct)+min(Gct)+min(Tct) / max(Act)+max(Cct)+max(Gct)+max(Tct)] Even a lax criterion (>0.25) can avoid clustering phantom insertions (where one is say all A and the another is G+T).
-
-
+//b. In pbsv, I use a simply mononucleotide consistency check to determine whether to cluster insertions from different reads as supporting the "same" events. In addition to looking at the similarity of length and breakpoints,
+//you could measure [min(Act)+min(Cct)+min(Gct)+min(Tct) / max(Act)+max(Cct)+max(Gct)+max(Tct)] Even a lax criterion (>0.25)
+//can avoid clustering phantom insertions (where one is say all A and the another is G+T).
+//[min(A1,A2)+min(C1,C2)+min(G1,G2)+min(T1,T2)[/[max...]/
Parameter* Parameter::m_pInstance = NULL;
void read_parameters(int argc, char *argv[]) {
@@ -40,25 +40,36 @@ void read_parameters(int argc, char *argv[]) {
TCLAP::CmdLine cmd("Sniffles version ", ' ', Parameter::Instance()->version);
TCLAP::ValueArg<std::string> arg_bamfile("m", "mapped_reads", "Sorted bam File", true, "", "string");
TCLAP::ValueArg<std::string> arg_vcf("v", "vcf", "VCF output file name", false, "", "string");
+ TCLAP::ValueArg<std::string> arg_input_vcf("", "Ivcf", "Input VCF file name. Enable force calling", false, "", "string");
TCLAP::ValueArg<std::string> arg_bedpe("b", "bedpe", " bedpe output file name", false, "", "string");
+ //TCLAP::ValueArg<std::string> arg_chrs("c", "chrs", " comma seperated list of chrs to scan", false, "", "string");
TCLAP::ValueArg<int> arg_support("s", "min_support", "Minimum number of reads that support a SV. Default: 10", false, 10, "int");
TCLAP::ValueArg<int> arg_splits("", "max_num_splits", "Maximum number of splits per read to be still taken into account. Default: 7", false, 7, "int");
TCLAP::ValueArg<int> arg_dist("d", "max_distance", "Maximum distance to group SV together. Default: 1kb", false, 1000, "int");
TCLAP::ValueArg<int> arg_threads("t", "threads", "Number of threads to use. Default: 3", false, 3, "int");
TCLAP::ValueArg<int> arg_minlength("l", "min_length", "Minimum length of SV to be reported. Default: 30", false, 30, "int");
TCLAP::ValueArg<int> arg_mq("q", "minmapping_qual", "Minimum Mapping Quality. Default: 20", false, 20, "int");
- TCLAP::ValueArg<int> arg_numreads("n", "num_reads_report", "Report up to N reads that support the SV in the vcf file. Default: 0", false, 0, "int");
- TCLAP::ValueArg<int> arg_segsize("r","min_seq_size","Discard read if non of its segment is larger then this. Default: 2kb",false,2000,"int");
+ TCLAP::ValueArg<int> arg_numreads("n", "num_reads_report", "Report up to N reads that support the SV in the vcf file. -1: report all. Default: 0", false, 0, "int");
+ TCLAP::ValueArg<int> arg_segsize("r", "min_seq_size", "Discard read if non of its segment is larger then this. Default: 2kb", false, 2000, "int");
+ TCLAP::ValueArg<int> arg_zmw("z", "min_zmw", "Discard SV that are not supported by at least x zmws. This applies only for PacBio recognizable reads. Default: 0", false, 0, "int");
TCLAP::ValueArg<std::string> arg_tmp_file("", "tmp_file", "path to temporary file otherwise Sniffles will use the current directory.", false, "", "string");
TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false);
TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false);
- TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. (default: false)", cmd, false);
- TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. (default: false)", cmd, false);
+ TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. Default: false", cmd, false);
+ TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. Default: false", cmd, false);
+ TCLAP::SwitchArg arg_seq("", "report_seq", "Report sequences for indels in vcf output. (Beta version!) Default: false", cmd, false);
TCLAP::ValueArg<int> arg_cluster_supp("", "cluster_support", "Minimum number of reads supporting clustering of SV. Default: 1", false, 1, "int");
- TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1).", false, 0.0, "float");
+ TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1). Default=0.0", false, 0.0, "float");
+ TCLAP::ValueArg<float> arg_hetfreq("", "min_het_af", "Threshold on allele frequency (0-1). Default=0.0", false, 0.3, "float");
+ TCLAP::ValueArg<float> arg_homofreq("", "min_homo_af", "Threshold on allele frequency (0-1). Default=0.0", false, 0.8, "float");
+
+ cmd.add(arg_homofreq);
+ cmd.add(arg_hetfreq);
+ cmd.add(arg_input_vcf);
cmd.add(arg_cluster_supp);
cmd.add(arg_numreads);
+ cmd.add(arg_zmw);
cmd.add(arg_segsize);
cmd.add(arg_tmp_file);
cmd.add(arg_dist);
@@ -71,13 +82,13 @@ void read_parameters(int argc, char *argv[]) {
cmd.add(arg_allelefreq);
cmd.add(arg_support);
cmd.add(arg_bamfile);
-
+// cmd.add(arg_chrs);
//parse cmd:
cmd.parse(argc, argv);
Parameter::Instance()->debug = true;
Parameter::Instance()->score_treshold = 10;
- Parameter::Instance()->read_name = "21_43213620_-";//21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
+ Parameter::Instance()->read_name = " "; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
Parameter::Instance()->bam_files.push_back(arg_bamfile.getValue());
Parameter::Instance()->min_mq = arg_mq.getValue();
Parameter::Instance()->output_vcf = arg_vcf.getValue();
@@ -94,22 +105,50 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->min_grouping_support = arg_cluster_supp.getValue();
Parameter::Instance()->min_allelel_frequency = arg_allelefreq.getValue();
Parameter::Instance()->min_segment_size = arg_segsize.getValue();
- Parameter::Instance()->reportBND= arg_bnd.getValue();
-
- Parameter::Instance()->ignore_std=arg_std.getValue();
-
- if (Parameter::Instance()->min_allelel_frequency > 0) {
+ Parameter::Instance()->reportBND = arg_bnd.getValue();
+ Parameter::Instance()->input_vcf = arg_input_vcf.getValue();
+ Parameter::Instance()->print_seq = arg_seq.getValue();
+ Parameter::Instance()->ignore_std = arg_std.getValue();
+ Parameter::Instance()->min_zmw = arg_zmw.getValue();
+ Parameter::Instance()->homfreq = arg_homofreq.getValue();
+ Parameter::Instance()->hetfreq = arg_hetfreq.getValue();
+
+ //Parse IDS:
+ /*std::string buffer = arg_chrs.getValue();
+ int count = 0;
+ std::string name = "";
+ for (size_t i = 0; i < buffer.size(); i++) {
+ if (buffer[i] == ',') {
+ Parameter::Instance()->chr_names[name] = true;
+ name.clear();
+ } else {
+ name += buffer[i];
+ }
+ }
+ if (!name.empty()) {
+ Parameter::Instance()->chr_names[name] = true;
+ }
+*/
+ if (Parameter::Instance()->min_allelel_frequency > 0 || !Parameter::Instance()->input_vcf.empty()) {
std::cerr << "Automatically enabling genotype mode" << std::endl;
Parameter::Instance()->genotype = true;
}
- if (Parameter::Instance()->tmp_file.empty()) {
- std::stringstream ss;
- srand(time(NULL));
- ss << arg_bamfile.getValue();
- ss << "_tmp";
- Parameter::Instance()->tmp_file = ss.str(); //check if file exists! -> if yes throw the dice again
+ if (Parameter::Instance()->tmp_file.empty()) { //TODO change to genotyper file and phasing file!
+ if(Parameter::Instance()->output_bedpe.empty()){
+ Parameter::Instance()->tmp_file = Parameter::Instance()->output_vcf;
+ }else{
+ Parameter::Instance()->tmp_file = Parameter::Instance()->output_bedpe;
+ }
+
+ Parameter::Instance()->tmp_file += "_tmp";
}
+
+ Parameter::Instance()->tmp_genotyp = Parameter::Instance()->tmp_file;
+ Parameter::Instance()->tmp_phasing = Parameter::Instance()->tmp_file;
+ Parameter::Instance()->tmp_genotyp += "_genotype";
+ Parameter::Instance()->tmp_phasing += "_phase";
+ //should I check tmp file path??
}
//some toy/test functions:
@@ -183,9 +222,7 @@ double test_comp_std_quantile(std::vector<int> positions, int position) {
}
return std::sqrt(std_start / count);
-
}
-
void test_std() {
srand(time(NULL));
int start = rand() % 100000; /// sqrt(1/12) for ins. Plot TRA std vs. cov/support.
@@ -282,13 +319,9 @@ void test_slimming() {
int main(int argc, char *argv[]) {
try {
- // test_slimming();
- // test_std();
- //
- //exit(0);
+
//init parameter and reads user defined parameter from command line.
read_parameters(argc, argv);
-
//init openmp:
omp_set_dynamic(0);
omp_set_num_threads(Parameter::Instance()->num_threads);
@@ -309,7 +342,14 @@ int main(int argc, char *argv[]) {
}
printer->init();
- detect_breakpoints(Parameter::Instance()->bam_files[0], printer); //we could write out all read names for each sVs
+ if (Parameter::Instance()->input_vcf.empty()) {
+ //regular calling
+ detect_breakpoints(Parameter::Instance()->bam_files[0], printer); //we could write out all read names for each sVs
+ } else {
+ //force calling was selected:
+ force_calling(Parameter::Instance()->bam_files[0], printer);
+ }
+
printer->close_file();
//cluster the SVs together:
@@ -321,12 +361,11 @@ int main(int argc, char *argv[]) {
//determine genotypes:
if (Parameter::Instance()->genotype) {
- std::cout << "Start genotype calling" << std::endl;
+ std::cout << "Start genotype calling:" << std::endl;
Genotyper * go = new Genotyper();
go->update_SVs();
}
-
} catch (TCLAP::ArgException &e) // catch any exceptions
{
std::cerr << "Sniffles error: " << e.error() << " for arg " << e.argId() << std::endl;
diff --git a/src/Version.h b/src/Version.h
index bb36854..09ef3c8 100644
--- a/src/Version.h
+++ b/src/Version.h
@@ -1,9 +1,9 @@
#ifndef VERSION_H
#define VERSION_H
-#define VERSION_MAJOR "1"
-#define VERSION_MINOR "0"
-#define VERSION_BUILD "6"
+#define VERSION_MAJOR ""
+#define VERSION_MINOR ""
+#define VERSION_BUILD ""
#endif // VERSION_H
diff --git a/src/cluster/Cluster_SVs.cpp b/src/cluster/Cluster_SVs.cpp
index 509fdd6..4db10f6 100644
--- a/src/cluster/Cluster_SVs.cpp
+++ b/src/cluster/Cluster_SVs.cpp
@@ -8,12 +8,9 @@
#include "Cluster_SVs.h"
std::map<long, std::vector<int> > Cluster_SVS::parse_names_ids(int & max_ID) {
- std::string tmp_name_file = Parameter::Instance()->tmp_file; // this file is created in IPrinter and stores the names and ID of SVS.
- tmp_name_file += "Names";
-
- FILE * alt_allel_reads = fopen(tmp_name_file.c_str(), "r");
+ FILE * alt_allel_reads = fopen(Parameter::Instance()->tmp_phasing.c_str(), "r");
if (alt_allel_reads == NULL) {
- std::cerr << "ClusterParse: could not open tmp file: " << tmp_name_file.c_str() << std::endl;
+ std::cerr << "ClusterParse: could not open tmp file: " << Parameter::Instance()->tmp_phasing << std::endl;
}
std::map<long, std::vector<int> > names;
@@ -50,7 +47,7 @@ void Cluster_SVS::update_SVs(std::vector<combine_str> & ids) {
tmp_name_file += ".tmp";
FILE*file = fopen(tmp_name_file.c_str(), "w");
- size_t buffer_size = 250000;
+ size_t buffer_size = 2500000;
char* buffer = new char[buffer_size];
myfile.getline(buffer, buffer_size);
//parse SVs breakpoints in file
@@ -151,11 +148,9 @@ void Cluster_SVS::update_SVs() {
//3: Update the IDS in the VCF/Bedpe files.
update_SVs(ids);
- std::string tmp_name_file = Parameter::Instance()->tmp_file; // this file is created in IPrinter and stores the names and ID of SVS.
- tmp_name_file += "Names";
- std::cout << "Cleaning tmp files" << std::endl;
+ std::cout << "\tCleaning tmp files" << std::endl;
std::string del = "rm ";
- del += tmp_name_file;
- system(del.c_str());
+ del += Parameter::Instance()->tmp_phasing;
+// system(del.c_str());
}
diff --git a/src/force_calling/Force_calling.cpp b/src/force_calling/Force_calling.cpp
new file mode 100644
index 0000000..9dcf95e
--- /dev/null
+++ b/src/force_calling/Force_calling.cpp
@@ -0,0 +1,189 @@
+/*
+ * Force_calling.cpp
+ *
+ * Created on: Aug 24, 2017
+ * Author: sedlazec
+ */
+
+#include "Force_calling.h"
+
+char assign_type(short type) {
+
+ switch (type) {
+ case 0: //DEL
+ return DEL;
+ case 1: //DUP
+ return INV;
+ case 2: //INV
+ return INV;
+ case 3: //TRA
+ return TRA;
+ case 4: //INS
+ return INS;
+ case 6:
+ return NEST;
+ }
+ return ' '; //TODO check default. Should not happen!
+}
+void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::map<std::string, long>& ref_lens) {
+ //prepare lookup:
+
+ long length = 0;
+ for (size_t i = 0; i < ref.size(); i++) {
+ ref_lens[ref[i].RefName.c_str()] = length;
+ length += ref[i].RefLength + Parameter::Instance()->max_dist;
+ }
+
+ //parse VCF file
+ std::vector<strvcfentry> entries = parse_vcf(Parameter::Instance()->input_vcf, 0);
+ std::cout << "\t\t" << entries.size() << " SVs found in input." << std::endl;
+ for (size_t i = 0; i < entries.size(); i++) {
+ if (entries[i].type != -1) {
+ position_str svs;
+ svs.start.min_pos = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
+ svs.stop.max_pos = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
+ read_str read;
+ read.SV = assign_type(entries[i].type);
+ read.strand=entries[i].strands;
+ read.type = 2; //called
+ svs.support["input"] = read;
+ Breakpoint * br = new Breakpoint(svs, (long) entries[i].sv_len);
+ final.insert(br, root_final);
+ //std::cout << "Print:" << std::endl;
+ //final.print(root_final);
+ } else {
+ cerr << "Invalid type found skipping" << endl;
+ }
+ }
+ entries.clear();
+}
+
+void force_calling(std::string bam_file, IPrinter *& printer) {
+ cout<<"Force calling SVs"<<endl;
+ //parse reads
+ //only process reads overlapping SV
+ estimate_parameters(Parameter::Instance()->bam_files[0]);
+ BamParser * mapped_file = 0;
+ RefVector ref;
+ std::string read_filename = Parameter::Instance()->bam_files[0];
+ if (read_filename.find("bam") != string::npos) {
+ mapped_file = new BamParser(read_filename);
+ ref = mapped_file->get_refInfo();
+ } else {
+ cerr << "File Format not recognized. File must be a sorted .bam file!" << endl;
+ exit(0);
+ }
+ std::cout << "Construct Tree..." << std::endl;
+
+ //construct the tree:
+ IntervallTree final;
+ TNode * root_final = NULL;
+ std::map<std::string, long> ref_lens;
+ fill_tree(final, root_final, ref, ref_lens);
+
+ std::cout << "Start parsing..." << std::endl;
+
+ int current_RefID = 0;
+
+ //FILE * alt_allel_reads;
+ FILE * ref_allel_reads;
+ if (Parameter::Instance()->genotype) {
+ ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
+ }
+ Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
+ long ref_space = ref_lens[ref[tmp_aln->getRefID()].RefName];
+ long num_reads = 0;
+ while (!tmp_aln->getQueryBases().empty()) {
+ if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) {
+ //change CHR:
+ if (current_RefID != tmp_aln->getRefID()) {
+ current_RefID = tmp_aln->getRefID();
+ ref_space = ref_lens[ref[tmp_aln->getRefID()].RefName];
+ std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl; //" " << ref[tmp_aln->getRefID()].RefLength
+ }
+
+ //check if overlap with any breakpoint!!
+ long read_start_pos = (long) tmp_aln->getPosition() - (long)Parameter::Instance()->max_dist;
+ read_start_pos += ref_space;
+ long read_stop_pos = read_start_pos + (long) tmp_aln->getAlignment()->Length + (long)Parameter::Instance()->max_dist; //getRefLength();//(long) tmp_aln->getPosition();
+
+ if (final.overlaps(read_start_pos, read_stop_pos, root_final)) {
+ //SCAN read:
+ std::vector<str_event> aln_event;
+ std::vector<aln_str> split_events;
+ if (tmp_aln->getMappingQual() > Parameter::Instance()->min_mq) {
+ double score = tmp_aln->get_scrore_ratio();
+#pragma omp parallel // starts a new team
+ {
+#pragma omp sections
+ {
+ {
+ // clock_t begin = clock();
+ if ((score == -1 || score > Parameter::Instance()->score_treshold)) {
+ aln_event = tmp_aln->get_events_Aln();
+ }
+ // Parameter::Instance()->meassure_time(begin, " Alignment ");
+ }
+#pragma omp section
+ {
+ // clock_t begin_split = clock();
+ split_events = tmp_aln->getSA(ref);
+ // Parameter::Instance()->meassure_time(begin_split," Split reads ");
+ }
+ }
+ }
+ //tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty());
+
+ //Store reference supporting reads for genotype estimation:
+ str_read tmp;
+ bool SV_support = !(aln_event.empty() && split_events.empty());
+ if ((Parameter::Instance()->genotype && !SV_support) && (score == -1 || score > Parameter::Instance()->score_treshold)) {
+ //write read:
+ //cout<<"REf: "<<tmp_aln->getName()<<" "<<tmp_aln->getPosition()<<" "<<tmp_aln->getRefLength()<<endl;
+ tmp.chr_id = tmp_aln->getRefID();
+ tmp.start = tmp_aln->getPosition();
+ tmp.length = tmp_aln->getRefLength();
+ fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+ }
+
+ //store the potential SVs:
+ if (!aln_event.empty()) {
+ add_events(tmp_aln, aln_event, 0, ref_space, final, root_final, num_reads, true);
+ }
+ if (!split_events.empty()) {
+ add_splits(tmp_aln, split_events, 1, ref, final, root_final, num_reads, true);
+ }
+ }
+ }
+ }
+ //get next read:
+ mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln);
+
+ num_reads++;
+
+ if (num_reads % 10000 == 0) {
+ cout << "\t\t# Processed reads: " << num_reads << endl;
+ }
+ }
+
+ std::cout << "Print:" << std::endl;
+ //final.print(root_final);
+
+ //filter and copy results:
+ std::cout << "Finalizing .." << std::endl;
+
+ if (Parameter::Instance()->genotype) {
+ fclose(ref_allel_reads);
+ }
+ // sweep->finalyze();
+
+ std::vector<Breakpoint*> points;
+ final.get_breakpoints(root_final, points);
+
+ //std::cout<<"fin up"<<std::endl;
+ for (size_t i = 0; i < points.size(); i++) {
+ points[i]->calc_support();
+ points[i]->predict_SV();
+ printer->printSV(points[i]); //redo! Ignore min support + STD etc.
+ }
+}
diff --git a/src/force_calling/Force_calling.h b/src/force_calling/Force_calling.h
new file mode 100644
index 0000000..3c8816a
--- /dev/null
+++ b/src/force_calling/Force_calling.h
@@ -0,0 +1,32 @@
+/*
+ * Force_calling.h
+ *
+ * Created on: Aug 24, 2017
+ * Author: sedlazec
+ */
+
+#ifndef FORCE_CALLING_FORCE_CALLING_H_
+#define FORCE_CALLING_FORCE_CALLING_H_
+
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <math.h>
+#include <vector>
+#include <algorithm>
+#include "../Paramer.h"
+#include "../BamParser.h"
+#include "../tree/BinTree.h"
+#include "../sub/Breakpoint.h"
+#include "../sub/Detect_Breakpoints.h"
+#include "VCF_parser.h"
+
+
+void force_calling(std::string bam_file, IPrinter *& printer);
+
+
+
+
+#endif /* FORCE_CALLING_FORCE_CALLING_H_ */
diff --git a/src/force_calling/VCF_parser.cpp b/src/force_calling/VCF_parser.cpp
new file mode 100644
index 0000000..eccb30a
--- /dev/null
+++ b/src/force_calling/VCF_parser.cpp
@@ -0,0 +1,464 @@
+/*
+ * VCF_parser.cpp
+ *
+ * Created on: Aug 24, 2017
+ * Author: sedlazec
+ */
+
+#include "VCF_parser.h"
+
+strcoordinate parse_stop(const char * buffer) {
+ size_t i = 0;
+ bool chr_flag = false;
+ strcoordinate pos;
+ pos.chr = "";
+ pos.pos = -1;
+ while (buffer[i] != '\t' && (buffer[i] != '\n' && buffer[i] != '\0')) {
+ if (strncmp(&buffer[i], ";END=", 5) == 0) {
+ pos.pos = atoi(&buffer[i + 5]);
+ }
+ if ((strncmp(&buffer[i], "END=", 4) == 0 && i == 0)) {
+ pos.pos = atoi(&buffer[i + 4]);
+ //std::cout<<"pos"<<pos.pos<<std::endl;
+ }
+ if (strncmp(&buffer[i], "CHR2=", 5) == 0) {
+ i = i + 5;
+ chr_flag = true;
+ }
+ if (buffer[i] == ';') {
+ chr_flag = false;
+ }
+ if (chr_flag) {
+ pos.chr += buffer[i];
+ }
+
+ i++;
+ }
+ //std::cout<<"end: "<<pos.chr<<" "<<pos.pos<<std::endl;
+ return pos;
+}
+std::pair<bool, bool> parse_strands(const char * buffer) {
+ std::pair<bool, bool> strands;
+ size_t i = 0;
+ while (buffer[i] != '\t' && (buffer[i] != '\n' && buffer[i] != '\0')) {
+ if (strncmp(&buffer[i], "3to5", 4) == 0) {
+ strands.first = false;
+ strands.second = true;
+ return strands;
+ }
+ if (strncmp(&buffer[i], "3to3", 4) == 0) {
+ strands.first = false;
+ strands.second = false;
+ return strands;
+ }
+ if (strncmp(&buffer[i], "5to3", 4) == 0) {
+ strands.first = true;
+ strands.second = false;
+ return strands;
+ }
+ if (strncmp(&buffer[i], "5to5", 4) == 0) {
+ strands.first = true;
+ strands.second = true;
+ return strands;
+ }
+ i++;
+ }
+ return strands;
+}
+std::vector<int> parse_callers(char* buffer) {
+ size_t i = 0;
+ std::vector<int> entries;
+ //std::cout<<buffer[i]<<std::endl;
+ entries.push_back(atoi(&buffer[i]));
+ while (buffer[i] != ';' && buffer[i] != '\0') {
+ if (buffer[i] == ',') {
+ entries.push_back(atoi(&buffer[i + 1]));
+ }
+ i++;
+ }
+ //std::cout<<entries.size()<<std::endl;
+ return entries;
+}
+
+short get_type(std::string type) {
+ if (strncmp(type.c_str(), "DEL", 3) == 0) {
+ return 0;
+ } else if (strncmp(type.c_str(), "DUP", 3) == 0) {
+ return 1;
+ } else if (strncmp(type.c_str(), "INVDUP", 6) == 0) { //can be inv/inter/tra
+ return 6;
+ } else if (strncmp(type.c_str(), "INV", 3) == 0) {
+ return 2;
+ } else if (strncmp(type.c_str(), "TRA", 3) == 0) {
+ return 3;
+ } else if ((strncmp(type.c_str(), "INS", 3) == 0 || strncmp(type.c_str(), "ALU", 3) == 0) || (strncmp(type.c_str(), "LINE1", 3) == 0 || strncmp(type.c_str(), "SVA", 3) == 0)) { //needed for 1k genomes project
+ return 4;
+ } else if (strncmp(type.c_str(), "BND", 3) == 0) { //can be inv/inter/tra
+ return 5;
+ }
+ return -1;
+}
+
+std::string trans_type23(short type) {
+ switch (type) {
+ case 0:
+ return "DEL";
+ break;
+ case 1:
+ return "DUP";
+ break;
+ case 2:
+ return "INV";
+ break;
+ case 3:
+ return "TRA";
+ break;
+ case 4:
+ return "INS";
+ break;
+ case 5:
+ return "BND";
+ break;
+ }
+ return "NA";
+}
+
+strcoordinate parse_pos(char * buffer) {
+ strcoordinate pos;
+ pos.chr = "";
+ pos.pos = -1;
+// T]chr2:243185994]
+// A[chr1:231019[
+ size_t i = 0;
+ int count = 0;
+ while (buffer[i] != '\t') {
+ if (count == 1 && ((buffer[i] != '[' || buffer[i] != ']') && buffer[i] != ':')) {
+ pos.chr += buffer[i];
+ }
+ if (count == 2 && buffer[i - 1] == ':') {
+ pos.pos = atoi(&buffer[i]);
+ }
+ if ((buffer[i] == ']' || buffer[i] == '[') || buffer[i] == ':') {
+ count++;
+ }
+ i++;
+ }
+
+ /* std::string tmp = std::string(buffer);
+ size_t found = tmp.find(':');
+ strcoordinate pos;
+ pos.chr = tmp.substr(0, found);
+ found++;
+ pos.pos = atoi(&tmp[found]);*/
+// std::cout << pos.chr << " | " << pos.pos << "|" << std::endl;
+ return pos;
+}
+
+std::pair<int, int> parse_manta(char * buffer) {
+ std::pair<int, int> res;
+ res.first = 0;
+ res.second = 0;
+ size_t i = 0;
+ while (buffer[i] != '\t') {
+ i++;
+ }
+ //std::cout<<buffer<<std::endl;
+ // 0/1:PASS:170:220,0,561:5,3:9,4
+ int count = 0;
+ while (buffer[i] != '\n' && buffer[i] != '\0') {
+ if (count > 3) {
+ if (buffer[i - 1] == ':') {
+ res.first += atoi(&buffer[i]);
+ // std::cout<<"first: "<<atoi(&buffer[i])<<std::endl;
+ }
+ if (buffer[i - 1] == ',') {
+ res.second += atoi(&buffer[i]);
+ // std::cout<<"second: "<<atoi(&buffer[i])<<std::endl;
+ }
+ }
+ if (buffer[i] == ':') {
+ count++;
+ }
+ i++;
+ }
+ return res;
+}
+
+std::pair<int, int> parse_delly(char * buffer) {
+
+ // GT:GL:GQ:FT:RCL:RC:RCR:CN:DR:DV:RR:RV 0/1:-9.02876,0,-32.6842:90:PASS:0:22219093:0:-1:14:3:0:0
+
+ std::pair<int, int> res;
+ res.first = 0;
+ res.second = 0;
+ size_t i = 0;
+ while (buffer[i] != '\t') {
+ i++;
+ }
+ //std::cout<<buffer<<std::endl;
+ int count = 0;
+ while (buffer[i] != '\n' && buffer[i] != '\0') {
+ if ((count == 8 || count == 9) && buffer[i - 1] == ':') {
+ res.first += atoi(&buffer[i]);
+ //std::cout<<"first: "<< atoi(&buffer[i])<<std::endl;
+ }
+ if ((count == 10 || count == 11) && buffer[i - 1] == ':') {
+ res.second += atoi(&buffer[i]);
+ // std::cout<<"second: "<< atoi(&buffer[i])<<std::endl;
+ }
+ if (buffer[i] == ':') {
+ count++;
+ }
+ i++;
+ }
+ return res;
+}
+
+int parse_support(char * buffer) {
+ size_t i = 0;
+ int support = 0;
+ while (buffer[i] != '\t' && buffer[i] != '\0') {
+
+ if (strncmp(&buffer[i], "VT_AC=", 6) == 0) {
+ support = atoi(&buffer[i + 6]);
+ }
+ if ((strncmp(&buffer[i], ";SU=", 4) == 0 || strncmp(&buffer[i], ";RE=", 4) == 0) || (strncmp(&buffer[i], ";PE=", 4) == 0 || strncmp(&buffer[i], ";SR=", 4) == 0)) { // SU: Lumpy, RE: Sniffles
+ support += atoi(&buffer[i + 4]);
+ }
+//TOOD extned for the tags that other caller use!
+ i++;
+ }
+ return support;
+}
+std::pair<bool, bool> parse_strands_lumpy(char * buffer) {
+ std::pair<bool, bool> strand;
+ size_t i = 0;
+ bool is_first = true;
+ while (buffer[i] != '\t') {
+ if (buffer[i] == '[') {
+ if (is_first) {
+ strand.first = false;
+ is_first = false;
+ } else {
+ strand.second = false;
+ }
+ } else if (buffer[i] == ']') {
+ if (is_first) {
+ strand.first = true;
+ is_first = false;
+ } else {
+ strand.second = true;
+ }
+ }
+ i++;
+ }
+ return strand;
+}
+std::string get_most_effect(std::string alt, int ref) {
+ size_t i = 0;
+ std::string most_alt = "";
+
+ std::string tmp = "";
+ while (i < alt.size()) {
+ if (alt[i] == ',') {
+ int size = most_alt.size();
+ int curr = tmp.size();
+ if (abs(ref - curr) > abs(ref - size)) {
+ most_alt = tmp;
+ }
+ tmp.clear();
+ } else {
+ tmp += alt[i];
+ }
+ i++;
+ }
+ return most_alt;
+}
+
+std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
+ size_t buffer_size = 200000000;
+ char*buffer = new char[buffer_size];
+ std::ifstream myfile;
+ myfile.open(filename.c_str(), std::ifstream::in);
+ if (!myfile.good()) {
+ std::cout << "VCF Parser: could not open file: " << filename.c_str() << std::endl;
+ exit(0);
+ }
+
+ std::vector<strvcfentry> calls;
+ myfile.getline(buffer, buffer_size);
+
+ int num = 0;
+ while (!myfile.eof()) {
+ if (buffer[0] != '#') {
+ // std::cout<<num<<"\t"<<buffer<<std::endl;
+ num++;
+ int count = 0;
+ strvcfentry tmp;
+ tmp.stop.pos = -1;
+ tmp.type = -1;
+ bool set_strand = false;
+ std::string ref;
+ std::string alt;
+ tmp.genotype = "./.";
+ tmp.strands.first = true;
+ tmp.strands.second = true;
+ tmp.num_reads.first = 0;
+ tmp.num_reads.second = 0;
+ tmp.sv_len = -1;
+ for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) {
+
+ if (count == 0 && buffer[i] != '\t') {
+ tmp.start.chr += buffer[i];
+ }
+ if (count == 1 && buffer[i - 1] == '\t') {
+ tmp.start.pos = atoi(&buffer[i]);
+ //std::cout<<tmp.start.pos<<std::endl;
+ }
+ if (count == 3 && buffer[i] != '\t') {
+ ref += buffer[i];
+ }
+ if (count == 4 && buffer[i] != '\t') {
+ alt += buffer[i];
+ }
+ if (count == 4 && buffer[i - 1] == '\t') {
+ tmp.strands = parse_strands_lumpy(&buffer[i]);
+ }
+ if (tmp.stop.pos == -1 && (count == 7 && buffer[i - 1] == '\t')) {
+ tmp.stop = parse_stop(&buffer[i]);
+ //std::cout<<"Stop:"<<tmp.stop.pos<<std::endl;
+ }
+ if (count == 7 && strncmp(&buffer[i], "SVTYPE=", 7) == 0) {
+ tmp.type = get_type(std::string(&buffer[i + 7]));
+ }
+ if (count == 7 && strncmp(&buffer[i], ";SU=", 4) == 0) { //for lumpy!
+ tmp.num_reads.second = atoi(&buffer[i + 4]);
+ }
+ if (count == 7 && strncmp(&buffer[i], ";RE=", 4) == 0) { //for sniffles!
+ tmp.num_reads.second = atoi(&buffer[i + 4]);
+ }
+ if (count == 7 && strncmp(&buffer[i], ";CT=", 4) == 0) {
+ //parse strand delly:
+ set_strand = true;
+ tmp.strands.first = (bool) (buffer[i + 4] != '5');
+ tmp.strands.second = (bool) (buffer[i + 7] != '5');
+ }
+ if ((tmp.sv_len == -1 && count == 7) && (strncmp(&buffer[i], "HOMLEN=", 7) == 0 || strncmp(&buffer[i], "AVGLEN=", 7) == 0)) {
+ tmp.sv_len = abs((int) atof(&buffer[i + 7]));
+ }
+
+ if (count == 7 && (strncmp(&buffer[i], "SVLEN=", 6) == 0)) {
+ tmp.sv_len = abs((int) atof(&buffer[i + 6]));
+ }
+ if (count == 7 && strncmp(&buffer[i], ";STRANDS=", 9) == 0) {
+ set_strand = true;
+ tmp.strands.first = (bool) (buffer[i + 9] == '+');
+ tmp.strands.second = (bool) (buffer[i + 10] == '+');
+ }
+
+ if (count == 9 && buffer[i - 1] == '\t') { //parsing genotype;
+ size_t j = i;
+ tmp.genotype = "";
+ while (buffer[j] != '\0' && buffer[j] != ':') {
+ if (buffer[j] != '\t') {
+ tmp.genotype += buffer[j];
+ }
+ j++;
+ }
+ // std::cout<<"GO: "<<tmp.genotype<<std::endl;
+ }
+ if (count == 8 && strncmp(&buffer[i], "PR:SR", 5) == 0) {
+ //manta
+ tmp.num_reads = parse_manta(&buffer[i]);
+ }
+ if (count == 8 && strncmp(&buffer[i], "DR:DV:RR:RV", 11) == 0) {
+ //delly
+ tmp.num_reads = parse_delly(&buffer[i]);
+ }
+
+ if (count == 4 && buffer[i - 1] == '<') {
+ tmp.type = get_type(std::string(&buffer[i]));
+ }
+ if (tmp.stop.pos == -1 && (count == 4 && (buffer[i - 1] == '[' || buffer[i - 1] == ']'))) {
+ tmp.stop = parse_pos(&buffer[i - 1]);
+ }
+
+ if (count == 9 && buffer[i - 1] == '\t') {
+ tmp.calls[filename] = std::string(&buffer[i]);
+ break;
+ }
+
+ if (count < 9) {
+ tmp.header += buffer[i];
+ }
+ if (buffer[i] == '\t') {
+ count++;
+ }
+ }
+ if (!set_strand) {
+ if (tmp.type == 0 || tmp.type == 4) {
+ tmp.strands.first = true;
+ tmp.strands.second = false;
+ } else if (tmp.type == 1) {
+ tmp.strands.first = false;
+ tmp.strands.second = true;
+ } else { //should not happen??
+ tmp.strands.first = true;
+ tmp.strands.second = true;
+ }
+ }
+
+ if (tmp.stop.pos == -1 && tmp.sv_len != -1) {
+ tmp.stop.pos = tmp.start.pos + tmp.sv_len;
+ }
+ if (tmp.stop.pos == -1) {
+ std::size_t found = alt.find(",");
+ if (found != std::string::npos) {
+ alt = get_most_effect(alt, (int) ref.size());
+ }
+ tmp.stop.chr = tmp.start.chr;
+ tmp.sv_len = (int) ref.size() - (int) alt.size();
+ tmp.stop.pos = tmp.start.pos + abs(tmp.sv_len);
+ if (tmp.sv_len > 0) {
+ tmp.type = 0; //deletion
+ } else if (tmp.sv_len < 0) {
+ tmp.type = 4; //insertions
+ tmp.sv_len = abs(tmp.sv_len);
+ // std::cout<<"INS: "<<tmp.sv_len <<std::endl;
+ }
+ }
+ if (tmp.stop.chr.empty()) {
+ tmp.stop.chr = tmp.start.chr;
+ }
+ if (tmp.sv_len == -1) {
+ tmp.sv_len = abs(tmp.start.pos - tmp.stop.pos);
+ }
+ if ((strcmp(tmp.start.chr.c_str(), tmp.stop.chr.c_str()) != 0 || (tmp.sv_len >= min_svs))) { // || tmp.type==4
+ std::size_t found = tmp.stop.chr.find("chr");
+ if (found != std::string::npos) {
+ tmp.stop.chr.erase(tmp.stop.chr.begin() + found, tmp.stop.chr.begin() + found + 3);
+ }
+ found = tmp.start.chr.find("chr");
+ if (found != std::string::npos) {
+ tmp.start.chr.erase(tmp.start.chr.begin() + found, tmp.start.chr.begin() + found + 3);
+ }
+
+ if (tmp.type == 5) { //BND
+ if (strcmp(tmp.stop.chr.c_str(), tmp.start.chr.c_str()) == 0) {
+ tmp.type = 2;
+ } else {
+ tmp.type = 3;
+ }
+ }
+ calls.push_back(tmp);
+ }
+ tmp.calls.clear();
+ } else {
+
+ }
+ myfile.getline(buffer, buffer_size);
+ }
+ myfile.close();
+//std::cout << calls.size() << std::endl;
+ return calls;
+}
diff --git a/src/force_calling/VCF_parser.h b/src/force_calling/VCF_parser.h
new file mode 100644
index 0000000..7967664
--- /dev/null
+++ b/src/force_calling/VCF_parser.h
@@ -0,0 +1,47 @@
+/*
+ * VCF_parser.h
+ *
+ * Created on: Aug 24, 2017
+ * Author: sedlazec
+ */
+
+#ifndef FORCE_CALLING_VCF_PARSER_H_
+#define FORCE_CALLING_VCF_PARSER_H_
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <stdlib.h>
+#include <string.h>
+#include <iosfwd>
+#include <fstream>
+
+struct strcoordinate{
+ int pos;
+ std::string chr;
+};
+
+struct strvcfentry{
+ std::string header;
+ strcoordinate start;
+ strcoordinate stop;
+ short type; //0=DEL,1=DUP,2=INV,3=TRA
+ std::map<std::string,std::string> calls;
+ int sup_lumpy;
+ int caller_id;
+ std::vector<int> caller_supports;
+ std::pair<bool,bool> strands;
+ std::pair<int,int> num_reads; //ref alt
+ std::string genotype;
+ int sv_len;
+ //int num_reads;
+};
+
+
+std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs);
+
+
+#endif /* FORCE_CALLING_VCF_PARSER_H_ */
diff --git a/src/print/BedpePrinter.cpp b/src/print/BedpePrinter.cpp
index 2de4b1f..9f1000f 100644
--- a/src/print/BedpePrinter.cpp
+++ b/src/print/BedpePrinter.cpp
@@ -22,8 +22,9 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
pair<double, double> kurtosis;
pair<double, double> std_quant;
double std_length = 0;
-
- if (to_print(SV, std_quant, kurtosis, std_length)) {
+ int zmws = 0;
+ bool ok_to_print = (to_print(SV, std_quant, kurtosis, std_length, zmws) || Parameter::Instance()->ignore_std);
+ if (ok_to_print && (zmws == 0 || zmws >= Parameter::Instance()->min_zmw)) {
std::string chr;
std::string strands = SV->get_strand(2);
@@ -75,3 +76,54 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
}
}
}
+
+void BedpePrinter::print_body_recall(Breakpoint * &SV, RefVector ref) {
+ std::string chr;
+ std::string strands = SV->get_strand(2);
+ int pos = IPrinter::calc_pos(SV->get_coordinates().start.min_pos, ref, chr);
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", pos);
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().start.max_pos, ref, chr));
+ fprintf(file, "%c", '\t');
+ pos = IPrinter::calc_pos(SV->get_coordinates().stop.min_pos, ref, chr);
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", pos);
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().stop.max_pos, ref, chr));
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", id);
+ id++;
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", -1); //TODO: score
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%c", strands[0]);
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%c", strands[1]);
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", SV->get_support());
+ fprintf(file, "%c", '\t');
+ pos = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", pos);
+ fprintf(file, "%c", '\t');
+ pos = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr);
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", pos);
+ fprintf(file, "%c", '\t');
+ if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) {
+ fprintf(file, "%s", "NA");
+ } else {
+ fprintf(file, "%i", SV->get_length());
+ }
+ //fprintf(file, "%c", '\t');
+ //fprintf(file, "%i", SV->get_support());
+ fprintf(file, "%c", '\n');
+}
+
diff --git a/src/print/BedpePrinter.h b/src/print/BedpePrinter.h
index a91a6c7..96d7889 100644
--- a/src/print/BedpePrinter.h
+++ b/src/print/BedpePrinter.h
@@ -12,6 +12,7 @@ class BedpePrinter:public IPrinter{
private:
void print_header();
void print_body(Breakpoint *& SV, RefVector ref);
+ void print_body_recall(Breakpoint * &SV, RefVector ref);
public:
BedpePrinter(){
diff --git a/src/print/IPrinter.cpp b/src/print/IPrinter.cpp
index b3e5946..759f90c 100644
--- a/src/print/IPrinter.cpp
+++ b/src/print/IPrinter.cpp
@@ -7,25 +7,52 @@
#include "IPrinter.h"
+
+std::string IPrinter::assess_genotype(int ref, int support) {
+ double allele = (double) support / (double) (support + ref);
+
+ if (allele < Parameter::Instance()->min_allelel_frequency) {
+ return "";
+ }
+
+ std::stringstream ss;
+ ss << ";AF=";
+ ss << allele;
+ ss << "\tGT:DR:DV\t";
+ if (allele > Parameter::Instance()->homfreq) {
+ ss <<"1/1:";
+ } else if (allele > Parameter::Instance()->hetfreq) {
+ ss << "0/1:";
+ }else{
+ ss << "0/0:";
+ }
+ ss << ref;
+ ss << ":";
+ ss << support;
+ return ss.str();
+}
+
+
+
bool IPrinter::is_huge_ins(Breakpoint * &SV) {
- int counts=0;
+ int counts = 0;
std::map<std::string, read_str> support = SV->get_coordinates().support;
for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
- if(((*i).second.coordinates.second - (*i).second.coordinates.first) == Parameter::Instance()->huge_ins){
+ if (((*i).second.coordinates.second - (*i).second.coordinates.first) == Parameter::Instance()->huge_ins) {
counts++;
}
}
//std::cout<<"Ratio: "<<((double)counts/(double)support.size())<<std::endl;
- return ((double)counts/(double)support.size() > 0.3);
+ return ((double) counts / (double) support.size() > 0.3);
}
-bool IPrinter::to_print(Breakpoint * &SV, pair<double, double>& std, pair<double, double> & kurtosis, double & std_length) {
+bool IPrinter::to_print(Breakpoint * &SV, pair<double, double>& std, pair<double, double> & kurtosis, double & std_length, int & zmw_num) {
std.first = 0;
std.second = 0;
//comp_std(SV, std_start, std_stop);
std_length = 0;
- kurtosis = comp_std_quantile(SV, std, std_length);
+ kurtosis = comp_std_quantile(SV, std, std_length, zmw_num);
bool to_print = true;
if ((SV->get_SVtype() & INS) && is_huge_ins(SV)) {
@@ -34,23 +61,15 @@ bool IPrinter::to_print(Breakpoint * &SV, pair<double, double>& std, pair<double
if ((SV->get_SVtype() & INS) || (SV->get_SVtype() & DEL)) { //for insertions + deletions:
double dist = (double) (SV->get_coordinates().stop.most_support - SV->get_coordinates().start.most_support);
- //dist = (double)std::min(((int)dist * 4), Parameter::Instance()->max_dist);
dist = dist * 4.0 * (uniform_variance / 2); //because we test against corrected value!
-// std::cout<<"DIST: "<<(SV->get_coordinates().stop.most_support - SV->get_coordinates().start.most_support)<<" "<<dist<<" STD: "<<std.first<<" "<<std.second<<std::endl;
return ((std.first < dist && std.second < dist)); //0.2886751
}
- //TODO test!
if (SV->get_SVtype() & NEST) {
return true;
}
double max_allowed = 4 * Parameter::Instance()->max_dist * (uniform_variance / 2);
- if (SV->get_coordinates().support.size() < 8) { //not enough coverage
-// max_allowed = 4 * Parameter::Instance()->max_dist * (uniform_variance);
- //max_allowed = max_allowed * ((double) SV->get_coordinates().support.size()/10.0);
- }
-
return (std.first < max_allowed && std.second < max_allowed);
}
@@ -172,8 +191,26 @@ void IPrinter::comp_std_med(Breakpoint * &SV, double & std_start, double & std_s
std_start = std::sqrt(std_start_dists[median]);
std_stop = std::sqrt(std_stop_dists[median]);
}
+bool contains_zmw(std::string read_name, std::string & zmw) {
+ //{movieName}/{zmwNumber}/{start}_{end}/
+ size_t i = 0;
+ bool read = false;
+ while (i < read_name.size()) {
+ if (read && read_name[i] != '/') {
+ zmw += read_name[i];
+ }
+ if (read_name[i] == '/') {
+ read = !read;
+ if (!zmw.empty()) {
+ return true;
+ }
+ }
+ i++;
+ }
+ return false;
+}
-pair<double, double> IPrinter::comp_std_quantile(Breakpoint * &SV, pair<double, double> & std, double & std_length) {
+pair<double, double> IPrinter::comp_std_quantile(Breakpoint * &SV, pair<double, double> & std, double & std_length, int & zmw_num) {
double count = 0;
std::vector<int> std_start_dists;
std::vector<int> std_stop_dists;
@@ -186,8 +223,13 @@ pair<double, double> IPrinter::comp_std_quantile(Breakpoint * &SV, pair<double,
double s2_stop = 0;
std_length = 0;
std::map<std::string, read_str> support = SV->get_coordinates().support;
+ std::map<std::string, bool> zmws;
for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
- if ((*i).second.SV & SV->get_SVtype()) {
+ if (((*i).second.SV & SV->get_SVtype()) && strncmp((*i).first.c_str(), "input", 5) != 0) {
+ std::string zmw = "";
+ if (contains_zmw((*i).first, zmw)) {
+ zmws[zmw] = true;
+ }
long diff = SV->get_length() - ((*i).second.coordinates.second - (*i).second.coordinates.first);
// sort_insert(std::pow((double) diff, 2.0), std_length_dists); //TODO think about that!!
std_length += std::pow((double) diff, 2.0);
@@ -201,9 +243,6 @@ pair<double, double> IPrinter::comp_std_quantile(Breakpoint * &SV, pair<double,
}
if ((*i).second.coordinates.second != -1) {
diff = (SV->get_coordinates().stop.most_support - (*i).second.coordinates.second);
- //ss << '\t';
- //ss << diff;
-
sort_insert(std::pow((double) diff, 2.0), std_stop_dists);
s4_stop += std::pow((double) diff, 4.0);
s2_stop += std::pow((double) diff, 2.0);
@@ -211,35 +250,30 @@ pair<double, double> IPrinter::comp_std_quantile(Breakpoint * &SV, pair<double,
}
count++;
}
+ zmw_num = zmws.size();
std_length = std::sqrt(std_length / count);
- s4_start = s4_start / count;
- s4_stop = s4_stop / count;
- s2_start = s2_start / count;
- s2_stop = s2_stop / count;
-
count = 0;
+
for (int i = 0; i < std::max((int) std_stop_dists.size() / 2, 10) && i < std_start_dists.size(); i++) {
std.first += std_start_dists[i];
std.second += std_stop_dists[i];
count++;
-
- /*s4_start += std::pow((double) std_start_dists[i], 2.0);
- s4_stop += std::pow((double) std_stop_dists[i], 2.0);
- s2_start += std_start_dists[i];
- s2_stop += std_stop_dists[i];*/
}
- pair<double, double> kurtosis;
- kurtosis.first = (s4_start / std::pow(s2_start, 2.0)) - 3;
- kurtosis.second = (s4_stop / std::pow(s2_stop, 2.0)) - 3;
-
std.first = std::sqrt(std.first / count);
std.second = std::sqrt(std.second / count);
- for (size_t i = 0; i < std_length_dists.size(); i++) {
- }
+ s4_start = s4_start / count;
+ s4_stop = s4_stop / count;
+ s2_start = s2_start / count;
+ s2_stop = s2_stop / count;
+
+ pair<double, double> kurtosis;
+ kurtosis.first = (s4_start / std::pow(s2_start, 2.0)) - 3;
+ kurtosis.second = (s4_stop / std::pow(s2_stop, 2.0)) - 3;
+
return kurtosis;
}
diff --git a/src/print/IPrinter.h b/src/print/IPrinter.h
index 6096a05..5fad75e 100644
--- a/src/print/IPrinter.h
+++ b/src/print/IPrinter.h
@@ -15,6 +15,7 @@
#include "../Ignore_Regions.h"
#include "../sub/Breakpoint.h"
#include "../cluster/Cluster_SVs.h"
+#include "../Genotyper/Genotyper.h"
#include <math.h>
double const uniform_variance = 0.2886751; //sqrt(1/12) see variance of uniform distribution -> std
@@ -32,11 +33,14 @@ protected:
virtual void print_header()=0;
virtual void print_body(Breakpoint * &SV, RefVector ref)=0;
+ virtual void print_body_recall(Breakpoint * &SV, RefVector ref)=0;
+
long calc_pos(long pos, RefVector ref, std::string &chr);
std::string get_chr(long pos, RefVector ref);
std::string get_type(char type);
void sort_insert(int pos, std::vector<int> & positons);
bool is_huge_ins(Breakpoint * &SV);
+ std::string assess_genotype(int ref, int support);
public:
IPrinter() {
@@ -50,7 +54,11 @@ public:
}
void printSV(Breakpoint * SV) {
- print_body(SV, ref);
+ if(Parameter::Instance()->input_vcf.empty()){
+ print_body(SV, ref);
+ }else{
+ print_body_recall(SV,ref);
+ }
}
void init() {
try {
@@ -76,20 +84,19 @@ public:
std::cout << "Cross checking..." << std::endl;
initialize_bed(bed_tree, root, ref);
}
- string tmp_name_file = Parameter::Instance()->tmp_file;
+
if (Parameter::Instance()->phase) {
- tmp_name_file += "Names";
- tmp_file = fopen(tmp_name_file.c_str(), "wb");
+ tmp_file = fopen(Parameter::Instance()->tmp_phasing.c_str(), "wb");
}
}
- bool to_print(Breakpoint * &SV, pair<double, double> &std, pair<double, double> & kurtosis, double & std_length);
+ bool to_print(Breakpoint * &SV, pair<double, double> &std, pair<double, double> & kurtosis, double & std_length, int & zmw_num);
void store_readnames(std::vector<long> names, int id);
void close_file() {
fclose(this->file);
}
void comp_std(Breakpoint * &SV, double & std_start, double & std_stop);
void comp_std_med(Breakpoint * &SV, double & std_start, double & std_stop);
- pair<double, double> comp_std_quantile(Breakpoint * &SV, pair<double, double>& std, double & std_lenght);
+ pair<double, double> comp_std_quantile(Breakpoint * &SV, pair<double, double>& std, double & std_lenght, int & zmw_num);
const std::string currentDateTime();
};
diff --git a/src/print/VCFPrinter.cpp b/src/print/VCFPrinter.cpp
index 6bb2769..24c19c8 100644
--- a/src/print/VCFPrinter.cpp
+++ b/src/print/VCFPrinter.cpp
@@ -18,61 +18,42 @@ void VCFPrinter::print_header() {
for (size_t i = 0; i < this->ref.size(); i++) {
fprintf(file, "%s", "\n");
fprintf(file, "%s", "##contig=<ID=");
- fprintf(file, "%s",ref[i].RefName.c_str());
+ fprintf(file, "%s", ref[i].RefName.c_str());
fprintf(file, "%s", ",length=");
- fprintf(file, "%i",(int)ref[i].RefLength);
- fprintf(file, "%c",'>');
+ fprintf(file, "%i", (int) ref[i].RefLength);
+ fprintf(file, "%c", '>');
}
fprintf(file, "%s", "\n");
fprintf(file, "%s", "##ALT=<ID=DEL,Description=\"Deletion\">\n");
fprintf(file, "%s", "##ALT=<ID=DUP,Description=\"Duplication\">\n");
fprintf(file, "%s", "##ALT=<ID=INV,Description=\"Inversion\">\n");
- fprintf(file, "%s",
- "##ALT=<ID=INVDUP,Description=\"InvertedDUP with unknown boundaries\">\n");
+ fprintf(file, "%s", "##ALT=<ID=INVDUP,Description=\"InvertedDUP with unknown boundaries\">\n");
fprintf(file, "%s", "##ALT=<ID=TRA,Description=\"Translocation\">\n");
fprintf(file, "%s", "##ALT=<ID=INS,Description=\"Insertion\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=CHR2,Number=1,Type=String,Description=\"Chromosome for END coordinate in case of a translocation\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=MAPQ,Number=1,Type=Integer,Description=\"Median mapping quality of paired-ends\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=RE,Number=1,Type=Integer,Description=\"read support\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=PRECISE,Number=0,Type=Flag,Description=\"Precise structural variation\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the SV\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Type of approach used to detect SV\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=STD_quant_start,Number=.,Type=Integer,Description=\"STD of the start breakpoints across the reads.\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=STD_quant_stop,Number=.,Type=Integer,Description=\"STD of the stop breakpoints across the reads.\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=Kurtosis_quant_start,Number=.,Type=Integer,Description=\"Kurtosis value of the start breakpoints accross the reads.\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=Kurtosis_quant_stop,Number=.,Type=Integer,Description=\"Kurtosis value of the stop breakpoints accross the reads.\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
- fprintf(file, "%s",
- "##INFO=<ID=STRANDS,Number=.,Type=String,Description=\"Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)\">\n");
- fprintf(file, "%s",
- "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
- fprintf(file, "%s",
- "##FORMAT=<ID=DR,Number=1,Type=Integer,Description=\"# high-quality reference reads\">\n");
- fprintf(file, "%s",
- "##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality variant reads\">\n");
-
- fprintf(file, "%s",
- "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+ fprintf(file, "%s", "##INFO=<ID=CHR2,Number=1,Type=String,Description=\"Chromosome for END coordinate in case of a translocation\">\n");
+ fprintf(file, "%s", "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant\">\n");
+ fprintf(file, "%s", "##INFO=<ID=MAPQ,Number=1,Type=Integer,Description=\"Median mapping quality of paired-ends\">\n");
+ fprintf(file, "%s", "##INFO=<ID=RE,Number=1,Type=Integer,Description=\"read support\">\n");
+ fprintf(file, "%s", "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\n");
+ fprintf(file, "%s", "##INFO=<ID=PRECISE,Number=0,Type=Flag,Description=\"Precise structural variation\">\n");
+ fprintf(file, "%s", "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the SV\">\n");
+ fprintf(file, "%s", "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Type of approach used to detect SV\">\n");
+ fprintf(file, "%s", "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
+ fprintf(file, "%s", "##INFO=<ID=STD_quant_start,Number=.,Type=Integer,Description=\"STD of the start breakpoints across the reads.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=STD_quant_stop,Number=.,Type=Integer,Description=\"STD of the stop breakpoints across the reads.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=.,Type=Integer,Description=\"Kurtosis value of the start breakpoints accross the reads.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=.,Type=Integer,Description=\"Kurtosis value of the stop breakpoints accross the reads.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
+ fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
+ fprintf(file, "%s", "##INFO=<ID=STRANDS,Number=.,Type=String,Description=\"Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)\">\n");
+ fprintf(file, "%s", "##INFO=<ID=AF,Number=.,Type=Integer,Description=\"Allele Frequency.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=ZMW,Number=.,Type=Integer,Description=\"Number of ZMWs (Pacbio) supporting SV.\">\n");
+ fprintf(file, "%s", "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
+ fprintf(file, "%s", "##FORMAT=<ID=DR,Number=1,Type=Integer,Description=\"# high-quality reference reads\">\n");
+ fprintf(file, "%s", "##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality variant reads\">\n");
+
+ fprintf(file, "%s", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
for (size_t i = 0; i < Parameter::Instance()->bam_files.size(); i++) {
fprintf(file, "%c", '\t');
fprintf(file, "%s", Parameter::Instance()->bam_files[i].c_str());
@@ -81,10 +62,7 @@ void VCFPrinter::print_header() {
}
void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
- if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support,
- this->root)
- && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support,
- this->root)) {
+ if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) {
//temp. store read names supporting this SVs to later group the SVs together.
double std_quant_start = 0;
double std_quant_stop = 0;
@@ -92,16 +70,15 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
pair<double, double> kurtosis;
pair<double, double> std_quant;
double std_length = 0;
- //to_print(SV, std_quant, kurtosis, std_length);
- bool ok_to_print = (to_print(SV, std_quant, kurtosis, std_length)
- || Parameter::Instance()->ignore_std);
- if (ok_to_print) {
+ int zmws = 0;
+ bool ok_to_print = (to_print(SV, std_quant, kurtosis, std_length, zmws) || Parameter::Instance()->ignore_std);
+ //std::cout << "Print check: " << std_quant.first << " " << std_quant.second << endl;
+ if (ok_to_print && (zmws == 0 || zmws >= Parameter::Instance()->min_zmw)) {
if (Parameter::Instance()->phase) {
store_readnames(SV->get_read_ids(), id);
}
std::string chr;
- int start = IPrinter::calc_pos(
- SV->get_coordinates().start.most_support, ref, chr);
+ int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", '\t');
fprintf(file, "%i", start);
@@ -109,14 +86,13 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%i", id);
id++;
- int end = IPrinter::calc_pos(
- SV->get_coordinates().stop.most_support, ref, chr);
+ int end = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr);
std::string strands = SV->get_strand(1);
fprintf(file, "%s", "\tN\t");
if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) {
//N[22:36765684[ +-
//]21:10540232]N -+
- if (strands[0] == '-' && strands[0] =='+') {
+ if (strands[0] == '-') { //&&
fprintf(file, "%s", "]");
fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", ':');
@@ -133,11 +109,9 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
} else {
fprintf(file, "%c", '<');
- fprintf(file, "%s",
- IPrinter::get_type(SV->get_SVtype()).c_str());
+ fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
fprintf(file, "%c", '>');
}
-
fprintf(file, "%s", "\t.\tPASS\t");
if (std_quant.first < 10 && std_quant.second < 10) {
fprintf(file, "%s", "PRECISE");
@@ -153,11 +127,15 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", ";END=");
if (SV->get_SVtype() & INS) {
- fprintf(file, "%i",std::max((int) (end - SV->get_length()), start));
+ fprintf(file, "%i", std::max((int) (end - SV->get_length()), start));
} else {
fprintf(file, "%i", end);
}
}
+ if (zmws != 0) {
+ fprintf(file, "%s", ";ZMW=");
+ fprintf(file, "%i", zmws);
+ }
fprintf(file, "%s", ";STD_quant_start=");
fprintf(file, "%f", std_quant.first);
fprintf(file, "%s", ";STD_quant_stop=");
@@ -166,39 +144,23 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%f", kurtosis.first);
fprintf(file, "%s", ";Kurtosis_quant_stop=");
fprintf(file, "%f", kurtosis.second);
- // fprintf(file, "%s", ";STD_length=");
- // fprintf(file, "%f", std_length);
-
fprintf(file, "%s", ";SVTYPE=");
- if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA) ) {
+ if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) {
fprintf(file, "%s", "BND");
- }else{
+ } else {
fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
}
- if (Parameter::Instance()->report_n_reads > 0) {
- std::vector<std::string> names = SV->get_read_names(
- Parameter::Instance()->report_n_reads);
+
+ if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) {
fprintf(file, "%s", ";RNAMES=");
- for (size_t i = 0;
- i < names.size()
- && i < Parameter::Instance()->report_n_reads;
- i++) {
- fprintf(file, "%s", names[i].c_str());
- fprintf(file, "%s", ";");
- }
- } else {
- fprintf(file, "%s", ";");
+ fprintf(file, "%s", SV->get_read_names().c_str());
}
- fprintf(file, "%s", "SUPTYPE=");
+ fprintf(file, "%s", ";SUPTYPE=");
fprintf(file, "%s", SV->get_supporting_types().c_str());
fprintf(file, "%s", ";SVLEN=");
- // if (SV->get_SVtype() & NEST) {
- // fprintf(file, "%i", -1);
- // } else {
- if (((SV->get_SVtype() & INS)
- && SV->get_length() == Parameter::Instance()->huge_ins)
- && !SV->get_types().is_SR) {
+
+ if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) {
fprintf(file, "%s", "NA");
} else {
fprintf(file, "%i", SV->get_length());
@@ -206,43 +168,108 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
// }
fprintf(file, "%s", ";STRANDS=");
fprintf(file, "%s", strands.c_str());
- /* fprintf(file, "%s", ";STRANDS2=");
-
- std::map<std::string, read_str> support =
- SV->get_coordinates().support;
- pair<int, int> tmp_start;
- pair<int, int> tmp_stop;
- tmp_start.first = 0;
- tmp_start.second = 0;
- tmp_stop.first = 0;
- tmp_stop.second = 0;
- for (std::map<std::string, read_str>::iterator i = support.begin();
- i != support.end(); i++) {
- if ((*i).second.read_strand.first) {
- tmp_start.first++;
- } else {
- tmp_start.second++;
- }
- if ((*i).second.read_strand.second) {
- tmp_stop.first++;
- } else {
- tmp_stop.second++;
- }
+ if (!SV->get_sequence().empty()) {
+ fprintf(file, "%s", ";SEQ=");
+ fprintf(file, "%s", SV->get_sequence().c_str());
}
- fprintf(file, "%i", tmp_start.first);
- fprintf(file, "%s", ",");
- fprintf(file, "%i", tmp_start.second);
- fprintf(file, "%s", ",");
- fprintf(file, "%i", tmp_stop.first);
- fprintf(file, "%s", ",");
- fprintf(file, "%i", tmp_stop.second);*/
-
fprintf(file, "%s", ";RE=");
fprintf(file, "%i", SV->get_support());
+ //if(Parameter::Instance()->genotype){
fprintf(file, "%s", "\tGT:DR:DV\t./.:.:");
fprintf(file, "%i", SV->get_support());
+ //}else{
+ // fprintf(file, "%s",this->assess_genotype(SV->get_refcount(),SV->get_support()).c_str());
+ //}
fprintf(file, "%c", '\n');
}
}
}
+void VCFPrinter::print_body_recall(Breakpoint * &SV, RefVector ref) {
+ if (Parameter::Instance()->phase) {
+ store_readnames(SV->get_read_ids(), id);
+ }
+ std::string chr;
+ int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", start);
+ fprintf(file, "%c", '\t');
+ fprintf(file, "%i", id);
+ id++;
+
+ int end = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr);
+ std::string strands = SV->get_strand(1);
+ fprintf(file, "%s", "\tN\t");
+ if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) {
+ //N[22:36765684[ +-
+ //]21:10540232]N -+
+ if (strands[0] == '-' && strands[0] == '+') {
+ fprintf(file, "%s", "]");
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", ':');
+ fprintf(file, "%i", end);
+ fprintf(file, "%s", "]N");
+
+ } else {
+ fprintf(file, "%s", "N[");
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%c", ':');
+ fprintf(file, "%i", end);
+ fprintf(file, "%c", '[');
+ }
+ } else {
+
+ fprintf(file, "%c", '<');
+ fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
+ fprintf(file, "%c", '>');
+ }
+
+ fprintf(file, "%s", "\t.\tPASS\t");
+ fprintf(file, "%s", "IMPRECISE");
+ fprintf(file, "%s", ";SVMETHOD=Snifflesv");
+ fprintf(file, "%s", Parameter::Instance()->version.c_str());
+ if (!(Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA))) {
+ fprintf(file, "%s", ";CHR2=");
+ fprintf(file, "%s", chr.c_str());
+ fprintf(file, "%s", ";END=");
+
+ if (SV->get_SVtype() & INS) {
+ fprintf(file, "%i", std::max((int) (end - SV->get_length()), start));
+ } else {
+ fprintf(file, "%i", end);
+ }
+ }
+
+ fprintf(file, "%s", ";SVTYPE=");
+ if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) {
+ fprintf(file, "%s", "BND");
+ } else {
+ fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
+ }
+ if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) {
+ fprintf(file, "%s", ";RNAMES=");
+ fprintf(file, "%s", SV->get_read_names().c_str());
+ }
+ fprintf(file, "%s", ";SUPTYPE=");
+ fprintf(file, "%s", SV->get_supporting_types().c_str());
+ fprintf(file, "%s", ";SVLEN=");
+
+ if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) {
+ fprintf(file, "%s", "NA");
+ } else {
+ fprintf(file, "%i", SV->get_length());
+ }
+ // }
+ fprintf(file, "%s", ";STRANDS=");
+ fprintf(file, "%s", strands.c_str());
+ fprintf(file, "%s", ";SEQ=");
+ fprintf(file, "%s", SV->get_sequence().c_str());
+ fprintf(file, "%s", ";RE=");
+ fprintf(file, "%i", SV->get_support());
+ fprintf(file, "%s", "\tGT:DR:DV\t./.:.:");
+ fprintf(file, "%i", SV->get_support());
+ fprintf(file, "%c", '\n');
+
+}
+
diff --git a/src/print/VCFPrinter.h b/src/print/VCFPrinter.h
index 9bd4c4a..3c0761a 100644
--- a/src/print/VCFPrinter.h
+++ b/src/print/VCFPrinter.h
@@ -14,6 +14,7 @@ class VCFPrinter:public IPrinter{
private:
void print_header();
void print_body(Breakpoint * &SV, RefVector ref);
+ void print_body_recall(Breakpoint * &SV, RefVector ref);
public:
VCFPrinter(){
diff --git a/src/sub/Breakpoint.cpp b/src/sub/Breakpoint.cpp
index 02128cb..50de038 100644
--- a/src/sub/Breakpoint.cpp
+++ b/src/sub/Breakpoint.cpp
@@ -139,7 +139,7 @@ long Breakpoint::overlap(Breakpoint * tmp) {
}
//merging "huge ins" and observed ins:
- if(is_same_strand(tmp) && (abs(tmp->get_coordinates().start.min_pos -tmp->get_coordinates().stop.max_pos) == Parameter::Instance()->huge_ins || abs(positions.start.min_pos -positions.stop.max_pos) == Parameter::Instance()->huge_ins) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < max_dist || abs(tmp->get_coordinates().stop.max_pos - positions.stop.max_pos) < max_dist)){
+ if (is_same_strand(tmp) && (abs(tmp->get_coordinates().start.min_pos - tmp->get_coordinates().stop.max_pos) == Parameter::Instance()->huge_ins || abs(positions.start.min_pos - positions.stop.max_pos) == Parameter::Instance()->huge_ins) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < max_dist || abs(tmp->get_coordinates().stop.max_pos - positions.stop.max_pos) < max_dist)) {
return 0;
}
@@ -168,11 +168,27 @@ long Breakpoint::overlap(Breakpoint * tmp) {
return diff; // + (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos);
}
+long Breakpoint::overlap_breakpoint(long start, long stop) {
+ int max_dist = get_dist(this); // Parameter::Instance()->max_dist
+ if ((start < positions.start.min_pos && positions.start.min_pos < stop) || (start < positions.stop.max_pos && positions.stop.max_pos < stop)) {
+ return 0;
+ }
+ long diff = (start - positions.start.min_pos);
+ //if (abs(diff) < max_dist) {
+ //return (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos);
+ //}
+ if (diff == 0) {
+ return 1;
+ }
+ return diff;
+
+}
void Breakpoint::add_read(Breakpoint * point) { //point = one read support!
if (point != NULL) {
//merge the support:
std::map<std::string, read_str> support = point->get_coordinates().support;
+ this->set_refcount(max(point->get_refcount(), this->get_refcount())); //set ref count!
for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
this->positions.support[(*i).first] = (*i).second;
}
@@ -180,15 +196,18 @@ void Breakpoint::add_read(Breakpoint * point) { //point = one read support!
}
///////////////////////////////// MERGING////////////////////////////////////////////
-std::vector<std::string> Breakpoint::get_read_names(int maxim) {
- std: vector<std::string> read_names;
- std::map<std::string, read_str> support = this->positions.support;
- int num = 0;
- for (std::map<std::string, read_str>::iterator i = support.begin(); (num < maxim || maxim == -1) && i != support.end(); i++) {
- read_names.push_back((*i).first);
- num++;
- }
- return read_names;
+std::string Breakpoint::get_read_names() {
+ std::string read_names;
+ int num = Parameter::Instance()->report_n_reads;
+ if (num == -1) {
+ num = this->positions.support.size();
+ }
+ for (std::map<std::string, read_str>::iterator i = this->positions.support.begin(); num != 0 && i != this->positions.support.end(); i++) {
+ read_names += ",";
+ read_names += (*i).first;
+ num--;
+ }
+ return read_names.substr(1);
}
std::vector<long> Breakpoint::get_read_ids() {
@@ -217,7 +236,7 @@ std::string Breakpoint::translate_strand(pair<bool, bool> strand_pair) {
return " ";
}
-void Breakpoint::summarize_type(char SV, std::vector<short>& array) {
+void Breakpoint::summarize_type(char SV, ushort * array) {
//std::string ss;
if (SV & DEL) {
// ss += "DEL; ";
@@ -261,58 +280,39 @@ char Breakpoint::get_SVtype() {
}
void Breakpoint::calc_support() {
- std::vector<short> SV;
- for (size_t i = 0; i < 6; i++) {
- SV.push_back(0);
- }
+
+ ushort sv[6] = { 0, 0, 0, 0, 0, 0 };
+
//run over all supports and check the majority type:
for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end(); i++) {
- summarize_type((*i).second.SV, SV);
+ summarize_type((*i).second.SV, sv);
}
//given the majority type get the stats:
- this->sv_type = eval_type(SV);
+ this->sv_type = eval_type(sv);
-}
+ if (get_SVtype() & TRA) { // we cannot make assumptions abut support yet.
+ set_valid((bool) (get_support() > 1)); // this is needed as we take each chr independently and just look at the primary alignment
+ } else if (get_support() >= Parameter::Instance()->min_support) {
+ predict_SV();
+ set_valid((bool) (get_length() > Parameter::Instance()->min_length));
+ }
-char Breakpoint::eval_type(std::vector<short> SV) {
- /*std::stringstream ss;
- if (SV[0] != 0) {
- ss << " DEL(";
- ss << SV[0];
- ss << ")";
- }
- if (SV[1] != 0) {
- ss << " DUP(";
- ss << SV[1];
- ss << ")";
- }
- if (SV[2] != 0) {
- ss << " INS(";
- ss << SV[2];
- ss << ")";
- }
- if (SV[3] != 0) {
- ss << " INV(";
- ss << SV[3];
- ss << ")";
- }
- if (SV[4] != 0) {
- ss << " TRA(";
- ss << SV[4];
- ss << ")";
- }
- this->sv_debug = ss.str(); //only for debug!
- std::cout << sv_debug << std::endl;*/
+}
+char Breakpoint::eval_type(ushort* SV) {
int maxim = 0;
int id = 0;
- for (size_t i = 0; i < SV.size(); i++) {
+ for (size_t i = 0; i < 6; i++) {
if (maxim < SV[i]) {
maxim = SV[i];
}
}
this->type_support = maxim;
+ if (!Parameter::Instance()->input_vcf.empty()) {
+ this->type_support--; // this is needed since we introduce a pseudo element
+ }
char max_SV = 0;
+
if (maxim == SV[0]) {
max_SV |= DEL;
}
@@ -340,8 +340,8 @@ long get_median(std::vector<long> corrds) {
return (corrds[corrds.size() / 2 - 1] + corrds[corrds.size() / 2]) / 2;
}
return corrds[corrds.size() / 2];
-
}
+
void Breakpoint::predict_SV() {
bool aln = false;
bool split = false;
@@ -356,7 +356,8 @@ void Breakpoint::predict_SV() {
std::vector<long> lengths2;
for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end(); i++) {
- if ((*i).second.SV & this->sv_type) { // && !((*i).second.SV & INS && (*i).second.length==Parameter::Instance()->huge_ins)) { ///check type
+
+ if (((*i).second.SV & this->sv_type) && strncmp((*i).first.c_str(), "input", 5) != 0) { // && !((*i).second.SV & INS && (*i).second.length==Parameter::Instance()->huge_ins)) { ///check type
if ((*i).second.coordinates.first != -1) {
if ((*i).second.length != Parameter::Instance()->huge_ins) {
@@ -368,6 +369,7 @@ void Breakpoint::predict_SV() {
}
start2.push_back((*i).second.coordinates.first);
}
+
if ((*i).second.coordinates.second != -1) { //TODO test
if ((*i).second.length != Parameter::Instance()->huge_ins) {
if (stops.find((*i).second.coordinates.second) == stops.end()) {
@@ -378,7 +380,8 @@ void Breakpoint::predict_SV() {
}
stops2.push_back((*i).second.coordinates.second);
}
- if (((*i).second.SV & INS) ) { //check lenght for ins only!
+
+ if (((*i).second.SV & INS)) { //check lenght for ins only!
if ((*i).second.length != Parameter::Instance()->huge_ins) {
if (lengths.find((*i).second.length) == lengths.end()) {
lengths[(*i).second.length] = 1;
@@ -415,88 +418,101 @@ void Breakpoint::predict_SV() {
long counts = 0;
int maxim = 0;
long coord = 0;
+ if (num > 0) {
- for (map<long, int>::iterator i = starts.begin(); i != starts.end(); i++) {
- // cout << "start:\t" << (*i).first << " " << (*i).second << endl;
- if ((*i).second > maxim) {
- coord = (*i).first;
- maxim = (*i).second;
+ for (map<long, int>::iterator i = starts.begin(); i != starts.end(); i++) {
+ if ((*i).second > maxim) {
+ coord = (*i).first;
+ maxim = (*i).second;
+ }
}
- //mean += ((*i).first * (*i).second);
- // counts += (*i).second;
- }
- if (maxim < 5) {
- this->positions.start.most_support = get_median(start2); //mean / counts;
- } else {
- this->positions.start.most_support = coord;
- }
-
- maxim = 0;
- coord = 0;
- mean = 0;
- counts = 0;
- for (map<long, int>::iterator i = stops.begin(); i != stops.end(); i++) {
- // cout << "stop:\t" << (*i).first << " " << (*i).second << endl;
- if ((*i).second > maxim) {
- coord = (*i).first;
- maxim = (*i).second;
+ if (maxim < 5) {
+ this->positions.start.most_support = get_median(start2); //check if "input"!
+ } else {
+ this->positions.start.most_support = coord;
}
- //mean += ((*i).first * (*i).second);
- // counts += (*i).second;
- }
- if (maxim < 5) {
- this->positions.stop.most_support = get_median(stops2); // mean / counts;
- } else {
- this->positions.stop.most_support = coord;
- }
- /*if(this->get_SVtype() & NEST){
- this->length = -1;
- }else
- */
-
- if (!(this->get_SVtype() & INS)) { //all types but Insertions:
- this->length = this->positions.stop.most_support - this->positions.start.most_support;
- } else { //compute most supported length for insertions:
+ this->indel_sequence = "";
+
+
maxim = 0;
coord = 0;
mean = 0;
counts = 0;
- for (map<long, int>::iterator i = lengths.begin(); i != lengths.end(); i++) {
- // std::cout<<"LENGTH: "<<(*i).first<<" : "<<(*i).second<<std::endl;
+ for (map<long, int>::iterator i = stops.begin(); i != stops.end(); i++) {
if ((*i).second > maxim) {
coord = (*i).first;
maxim = (*i).second;
}
- // mean += ((*i).first * (long) (*i).second);
- // counts += (*i).second;
}
- if (maxim < 3) {
- this->length = get_median(lengths2);
+ if (maxim < 5) {
+ this->positions.stop.most_support = get_median(stops2); // mean / counts;
} else {
- this->length = coord;
+ this->positions.stop.most_support = coord;
}
- }
+ if (!(this->get_SVtype() & INS)) { //all types but Insertions:
+ this->length = this->positions.stop.most_support - this->positions.start.most_support;
+
+ } else { //compute most supported length for insertions:
+ maxim = 0;
+ coord = 0;
+ mean = 0;
+ counts = 0;
+ for (map<long, int>::iterator i = lengths.begin(); i != lengths.end(); i++) {
+ if ((*i).second > maxim) {
+ coord = (*i).first;
+ maxim = (*i).second;
+ }
+
+ }
+ if (maxim < 3) {
+ this->length = get_median(lengths2);
+ } else {
+ this->length = coord;
+ }
- starts.clear();
- stops.clear();
+ // if(del)
+ }
- for (size_t i = 0; i < strands.size(); i++) {
- maxim = 0;
- std::string id;
- for (std::map<std::string, int>::iterator j = strands.begin(); j != strands.end(); j++) {
- if (maxim < (*j).second) {
- maxim = (*j).second;
- id = (*j).first;
- //std::cout << '\t' << id << std::endl;
+ starts.clear();
+ stops.clear();
+
+ for (size_t i = 0; i < strands.size(); i++) {
+ maxim = 0;
+ std::string id;
+ for (std::map<std::string, int>::iterator j = strands.begin(); j != strands.end(); j++) {
+ if (maxim < (*j).second) {
+ maxim = (*j).second;
+ id = (*j).first;
+ //std::cout << '\t' << id << std::endl;
+ }
+ }
+ if (maxim > 0) {
+ this->strand.push_back(id);
+ strands[id] = 0;
}
}
- if (maxim > 0) {
- this->strand.push_back(id);
- strands[id] = 0;
+ strands.clear();
+
+
+ std::map<std::string, read_str>::iterator tmp = positions.support.begin();
+ int start_prev_dist=1000;
+ int stop_prev_dist=1000;
+ while(tmp != positions.support.end()) {
+ int start_dist=abs((*tmp).second.coordinates.first- this->positions.start.most_support);
+ int stop_dist=abs((*tmp).second.coordinates.second- this->positions.stop.most_support);
+ if (((*tmp).second.SV & this->sv_type) && ( (start_dist<start_prev_dist && stop_dist<stop_prev_dist) && (strcmp((*tmp).second.sequence.c_str(), "NA") != 0 && !(*tmp).second.sequence.empty()))) {
+ this->indel_sequence = (*tmp).second.sequence;
+ }
+ tmp++;
}
}
- strands.clear();
+
+ if (num == 0 && positions.support.find("input") != positions.support.end()) {
+ this->positions.stop.most_support = this->positions.stop.max_pos;
+ this->positions.start.most_support = this->positions.start.min_pos;
+ this->length = this->positions.stop.max_pos - this->positions.start.min_pos;
+ }
this->supporting_types = "";
if (aln) {
@@ -606,15 +622,14 @@ std::string Breakpoint::to_string() {
ss << get_coordinates().stop.max_pos;
ss << " ";
ss << this->length;
- ss << " ";
+ ss << " PE=";
ss << positions.support.size();
ss << " ";
- ss << get_strand(2);
+ ss << get_strand(1);
}
return ss.str();
}
std::string Breakpoint::to_string(RefVector ref) {
-
std::stringstream ss;
ss << "(";
ss << get_chr(get_coordinates().start.min_pos, ref);
@@ -632,7 +647,10 @@ std::string Breakpoint::to_string(RefVector ref) {
ss << this->get_strand(2);
ss << "\n";
int num = 0;
- for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end() && num < Parameter::Instance()->report_n_reads; i++) {
+ for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end(); i++) {
+ if (num < Parameter::Instance()->report_n_reads && Parameter::Instance()->report_n_reads != -1) {
+ break;
+ }
ss << "\t";
ss << (*i).first;
ss << " ";
@@ -648,6 +666,7 @@ std::string Breakpoint::to_string(RefVector ref) {
ss << "-";
}
num++;
+
ss << "\n";
}
ss << " ";
diff --git a/src/sub/Breakpoint.h b/src/sub/Breakpoint.h
index b67b568..f238a22 100644
--- a/src/sub/Breakpoint.h
+++ b/src/sub/Breakpoint.h
@@ -46,6 +46,7 @@ struct read_str {
pair<long,long> coordinates; // I could use the bin tree for that!
char SV; // bit vector
int length;
+ std::string sequence; //just for indels!
};
struct position_str {
svs_breakpoint_str start;
@@ -55,8 +56,8 @@ struct position_str {
//std::vector<read_str> support; // map?? -> no duplicated reads, easy to catch up which read is included.
int coverage;
int lowmq_cov;
- short read_start;
- short read_stop;
+ int read_start;
+ int read_stop;
};
@@ -82,12 +83,15 @@ private:
BinTree grouped;
tree_node * grouped_node;
long length;
+ std::string indel_sequence;
+ bool should_be_stored;
+ int ref_allele;
void summarize_support(short type);
//void summarize_strand(pair<bool, bool> strand, std::vector<short>& array);
- void summarize_type(char SV, std::vector<short>& array);
+ void summarize_type(char SV, ushort * array);
//std::string translate_strand(short id);
- char eval_type(std::vector<short> SV);
+ char eval_type(ushort *SV);
std::string rev_complement(std::string seq);
bool is_in(short id);
std::string translate_strand(pair<bool, bool> strand);
@@ -98,7 +102,8 @@ private:
}
public:
Breakpoint(position_str sv,long len) {
-
+ ref_allele=0;
+ should_be_stored=false;
sv_type |= NA;
type.is_ALN=((*sv.support.begin()).second.type==0);
type.is_SR=((*sv.support.begin()).second.type==1);
@@ -114,6 +119,7 @@ public:
int get_support();
long overlap(Breakpoint * tmp);
+ long overlap_breakpoint(long start,long stop);
void set_coordinates(int start, int stop){
this->positions.start.min_pos=start;
this->positions.stop.max_pos=stop;
@@ -159,9 +165,24 @@ public:
str_types get_types(){
return this->type;
}
- std::vector<std::string> get_read_names(int num);
+ std::string get_read_names();
std::vector<long> get_read_ids();
std::string to_string();
+ std::string get_sequence(){
+ return this->indel_sequence;
+ }
+ void set_valid(bool valid){
+ this-> should_be_stored=valid;
+ }
+ bool get_valid(){
+ return this->should_be_stored;
+ }
+ int get_refcount(){
+ return this->ref_allele;
+ }
+ void set_refcount(int value){
+ this->ref_allele+=value;
+ }
};
#endif /* SUB_BREAKPOINT_H_ */
diff --git a/src/sub/Detect_Breakpoints.cpp b/src/sub/Detect_Breakpoints.cpp
index 39c8a6e..377382c 100644
--- a/src/sub/Detect_Breakpoints.cpp
+++ b/src/sub/Detect_Breakpoints.cpp
@@ -8,18 +8,6 @@
#include "Detect_Breakpoints.h"
#include "../print/IPrinter.h"
-long fuck_off(long pos, RefVector ref, std::string &chr) {
- size_t i = 0;
- pos -= (ref[i].RefLength + Parameter::Instance()->max_dist);
-
- while (i < ref.size() && pos >= 0) {
- i++;
- pos -= ((long) ref[i].RefLength + (long) Parameter::Instance()->max_dist);
- }
- chr = ref[i].RefName;
- return pos + ref[i].RefLength + (long) Parameter::Instance()->max_dist;
-}
-
void store_pos(vector<hist_str> &positions, long pos, std::string read_name) {
for (size_t i = 0; i < positions.size(); i++) {
if (abs(positions[i].position - pos) < Parameter::Instance()->min_length) {
@@ -35,6 +23,30 @@ void store_pos(vector<hist_str> &positions, long pos, std::string read_name) {
positions.push_back(tmp);
}
+std::string reverse_complement(std::string sequence) {
+ std::string tmp_seq;
+ for (std::string::reverse_iterator i = sequence.rbegin(); i != sequence.rend(); i++) {
+ switch ((*i)) {
+ case 'A':
+ tmp_seq += 'T';
+ break;
+ case 'C':
+ tmp_seq += 'G';
+ break;
+ case 'G':
+ tmp_seq += 'C';
+ break;
+ case 'T':
+ tmp_seq += 'A';
+ break;
+ default:
+ tmp_seq += (*i);
+ break;
+ }
+ }
+ return tmp_seq;
+}
+
Breakpoint * split_points(vector<std::string> names, std::map<std::string, read_str> support) {
std::map<std::string, read_str> new_support;
for (size_t i = 0; i < names.size(); i++) {
@@ -52,40 +64,26 @@ void detect_merged_svs(position_str point, RefVector ref, vector<Breakpoint *> &
new_points.clear(); //just in case!
vector<hist_str> pos_start;
vector<hist_str> pos_stop;
- for (std::map<std::string, read_str>::iterator i = point.support.begin(); i != point.support.end(); i++) {
- // std::cout << (*i).second.coordinates.first << "," << (*i).second.coordinates.second << std::endl;
+ for (std::map<std::string, read_str>::iterator i = point.support.begin(); i != point.support.end(); ++i) {
store_pos(pos_start, (*i).second.coordinates.first, (*i).first);
store_pos(pos_stop, (*i).second.coordinates.second, (*i).first);
}
-// std::cout << "Start: " << std::endl;
+
int start_count = 0;
- //std::cout<<"Start pos: ";
for (size_t i = 0; i < pos_start.size(); i++) {
//std::cout<<pos_start[i].hits <<",";
if (pos_start[i].hits > Parameter::Instance()->min_support) {
start_count++;
- /* std::string chr = "";
- long pos = fuck_off(pos_start[i].position, ref, chr);
- std::cout << chr << " " << pos << ":" << pos_start[i].hits << " ; ";
- */
+
}
}
-// std::cout << std::endl;
-// std::cout << "Stop: " << std::endl;
int stop_count = 0;
for (size_t i = 0; i < pos_stop.size(); i++) {
// std::cout << pos_stop[i].hits << ",";
if (pos_stop[i].hits > Parameter::Instance()->min_support) {
stop_count++;
- /* std::string chr = "";
- long pos = fuck_off(pos_stop[i].position, ref, chr);
- std::cout << chr << " " << pos << ":" << pos_stop[i].hits << " ; ";
- */
}
}
- //std::cout << std::endl;
- //std::cout << std::endl;
-
if (stop_count > 1 || start_count > 1) {
std::cout << "\tprocessing merged TRA" << std::endl;
if (start_count > 1) {
@@ -149,19 +147,12 @@ bool should_be_stored(Breakpoint *& point) {
point->calc_support(); // we need that before:
//std::cout << "Stored: " << point->get_support() << " " << point->get_length() << std::endl;
if (point->get_SVtype() & TRA) { // we cannot make assumptions abut support yet.
- return (point->get_support() > 1); // this is needed as we take each chr independently and just look at the primary alignment
+ point->set_valid((bool) (point->get_support() > 1)); // this is needed as we take each chr independently and just look at the primary alignment
} else if (point->get_support() >= Parameter::Instance()->min_support) {
point->predict_SV();
- /* std::cout<<"LEN check: "<<point->get_length()
- if(point->get_length() > Parameter::Instance()->min_length){
- cout<<" T "<<std::endl;
- }else{
- cout<<" T "<<std::endl;
- }*/
- return (point->get_length() > Parameter::Instance()->min_length);
+ point->set_valid((bool) (point->get_length() > Parameter::Instance()->min_length));
}
-
- return false;
+ return point->get_valid();
}
void polish_points(std::vector<Breakpoint *> & points, RefVector ref) { //TODO might be usefull! but why does the tree not fully work??
return;
@@ -196,9 +187,6 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
std::cout << "Start parsing..." << std::endl;
//Using Interval tree to store and manage breakpoints:
- //IntervallList final;
- //IntervallList bst;
-
IntervallTree final;
IntervallTree bst;
@@ -209,30 +197,41 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
//FILE * alt_allel_reads;
FILE * ref_allel_reads;
if (Parameter::Instance()->genotype) {
- std::string output = Parameter::Instance()->tmp_file.c_str();
- output += "ref_allele";
- ref_allel_reads = fopen(output.c_str(), "wb");
+ ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
}
Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
long ref_space = get_ref_lengths(tmp_aln->getRefID(), ref);
- //std::string prev = "test";
- //std::string curr = "curr";
long num_reads = 0;
+
+ /*Genotyper * go;
+ if (Parameter::Instance()->genotype) {
+ go = new Genotyper();
+ }*/
+
while (!tmp_aln->getQueryBases().empty()) {
- if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) {
+
+ if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())){// && (Parameter::Instance()->chr_names.empty() || Parameter::Instance()->chr_names.find(ref[tmp_aln->getRefID()].RefName) != Parameter::Instance()->chr_names.end())) {
+
//change CHR:
if (current_RefID != tmp_aln->getRefID()) {
- current_RefID = tmp_aln->getRefID();
- ref_space = get_ref_lengths(tmp_aln->getRefID(), ref);
- std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl;//" " << ref[tmp_aln->getRefID()].RefLength
+
+ std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl; //" " << ref[tmp_aln->getRefID()].RefLength
std::vector<Breakpoint *> points;
- // clarify(points);
bst.get_breakpoints(root, points);
- polish_points(points, ref);
- for (int i = 0; i < points.size(); i++) {
+ //polish_points(points, ref);
+
+ /* if (Parameter::Instance()->genotype) {
+ fclose(ref_allel_reads);
+ cout<<"\t\tGenotyping"<<endl;
+ go->update_SVs(points, ref_space);
+ cout<<"\t\tGenotyping finished"<<endl;
+ ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
+ }*/
- if (should_be_stored(points[i])) {
- //detect_merged_svs(points[i]);
+ for (int i = 0; i < points.size(); i++) {
+ points[i]->calc_support();
+ if (points[i]->get_valid()) {
+ //invoke update over ref support!
if (points[i]->get_SVtype() & TRA) {
final.insert(points[i], root_final);
} else {
@@ -241,7 +240,10 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
}
bst.clear(root);
+ current_RefID = tmp_aln->getRefID();
+ ref_space = get_ref_lengths(tmp_aln->getRefID(), ref);
}
+
//SCAN read:
std::vector<str_event> aln_event;
std::vector<aln_str> split_events;
@@ -251,42 +253,42 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
{
#pragma omp sections
{
+#pragma omp section
{
- // clock_t begin = clock();
+ // clock_t begin = clock();
if ((score == -1 || score > Parameter::Instance()->score_treshold)) {
aln_event = tmp_aln->get_events_Aln();
}
- // Parameter::Instance()->meassure_time(begin, " Alignment ");
+ // Parameter::Instance()->meassure_time(begin, " Alignment ");
}
#pragma omp section
{
- //TODO ignore Splits that are shorter then XYbp??
- // clock_t begin_split = clock();
+ // clock_t begin_split = clock();
split_events = tmp_aln->getSA(ref);
- // Parameter::Instance()->meassure_time(begin_split," Split reads ");
+ // Parameter::Instance()->meassure_time(begin_split, " Split reads ");
}
}
}
//tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty());
//Store reference supporting reads for genotype estimation:
- str_read tmp;
- tmp.SV_support = !(aln_event.empty() && split_events.empty());
- if ((Parameter::Instance()->genotype && !tmp.SV_support) && (score == -1 || score > Parameter::Instance()->score_treshold)) {
+
+ bool SV_support = (!aln_event.empty() && !split_events.empty());
+ if (Parameter::Instance()->genotype && !SV_support) {
//write read:
- tmp.chr = ref[tmp_aln->getRefID()].RefName;
+ str_read tmp;
+ tmp.chr_id = tmp_aln->getRefID(); //check string in binary???
tmp.start = tmp_aln->getPosition();
tmp.length = tmp_aln->getRefLength();
- tmp.SV_support = false;
fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
}
//store the potential SVs:
if (!aln_event.empty()) {
- add_events(tmp_aln, aln_event, 0, ref_space, bst, root, num_reads);
+ add_events(tmp_aln, aln_event, 0, ref_space, bst, root, num_reads, false);
}
if (!split_events.empty()) {
- add_splits(tmp_aln, split_events, 1, ref, bst, root, num_reads);
+ add_splits(tmp_aln, split_events, 1, ref, bst, root, num_reads, false);
}
}
}
@@ -297,45 +299,40 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
if (num_reads % 10000 == 0) {
cout << "\t\t# Processed reads: " << num_reads << endl;
- //" \r";
- //std::cout.flush();
}
}
//filter and copy results:
std::cout << "Finalizing .." << std::endl;
std::vector<Breakpoint *> points;
bst.get_breakpoints(root, points);
- //std::cout<<"got breakpoints"<<std::endl;
- //polish_points(points, ref);
+ /* if (Parameter::Instance()->genotype) {
+ fclose(ref_allel_reads);
+ go->update_SVs(points, ref_space);
+ string del = "rm ";
+ del += Parameter::Instance()->tmp_genotyp;
+ del += "ref_allele";
+ system(del.c_str());
+ }*/
+
for (int i = 0; i < points.size(); i++) {
- //std::cout<<"start check"<<" "<<i<<std::endl;
- if (should_be_stored(points[i])) {
+ points[i]->calc_support();
+ if (points[i]->get_valid()) {
+ //invoke update over ref support!
if (points[i]->get_SVtype() & TRA) {
-
final.insert(points[i], root_final);
- // std::cout<<"Done insert"<<" "<<i<<std::endl;
} else {
- // std::cout<<"start print"<<" "<<i<<std::endl;
printer->printSV(points[i]);
- // std::cout<<"Done print"<<" "<<i<<std::endl;
}
}
}
- //std::cout<<"Fine"<<std::endl;
bst.clear(root);
- if (Parameter::Instance()->genotype) {
- fclose(ref_allel_reads);
- }
-// sweep->finalyze();
-
points.clear();
final.get_breakpoints(root_final, points);
//std::cout<<"Detect merged tra"<<std::endl;
size_t points_size = points.size();
for (size_t i = 0; i < points_size; i++) { // its not nice, but I may alter the length of the vector within the loop.
if (points[i]->get_SVtype() & TRA) {
-
vector<Breakpoint *> new_points;
detect_merged_svs(points[i]->get_coordinates(), ref, new_points);
if (!new_points.empty()) { // I only allow for 1 split!!
@@ -357,7 +354,7 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
//std::cout<<"Done"<<std::endl;
}
-void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallContainer & bst, TNode *&root, long read_id) {
+void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallTree & bst, TNode *&root, long read_id, bool add) {
bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
for (size_t i = 0; i < events.size(); i++) {
@@ -369,6 +366,7 @@ void add_events(Alignment *& tmp, std::vector<str_event> events, short type, lon
read.type = 0;
}
read.SV = events[i].type;
+ read.sequence = events[i].sequence;
if (flag) {
std::cout << "ADD EVENT " << tmp->getName() << " " << tmp->getRefID() << " " << events[i].pos << " " << abs(events[i].length) << std::endl;
@@ -420,23 +418,22 @@ void add_events(Alignment *& tmp, std::vector<str_event> events, short type, lon
read.coordinates.first = svs.start.min_pos;
read.coordinates.second = svs.stop.max_pos;
}
- /*if(read.SV & DEL){
- std::cout<<"ADD DEL: "<<svs.start.min_pos<< " "<<svs.stop.max_pos<<" "<<tmp->getName()<<std::endl;
- }*/
- /*if(read.SV & INS){
- std::cout<<"ALN LEN: "<<events[i].length<<" Pos: "<<svs.start.min_pos<< " "<<svs.stop.max_pos<<" "<<tmp->getName()<<std::endl;
- }*/
+
read.id = read_id;
svs.support[tmp->getName()] = read;
svs.support[tmp->getName()].length = events[i].length;
Breakpoint * point = new Breakpoint(svs, events[i].length);
- bst.insert(point, root);
+ if (add) {
+ bst.insert_existant(point, root);
+ } else {
+ bst.insert(point, root);
+ }
//std::cout<<"Print:"<<std::endl;
//bst.print(root);
}
}
-void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallContainer & bst, TNode *&root, long read_id) {
+void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallTree& bst, TNode *&root, long read_id, bool add) {
bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
if (flag) {
@@ -455,6 +452,7 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
position_str svs;
//position_str stop;
read_str read;
+ read.sequence = "NA";
//read.name = tmp->getName();
read.type = type;
read.SV = 0;
@@ -488,20 +486,34 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
}
if ((svs.stop.max_pos - svs.start.min_pos) > Parameter::Instance()->min_length * -1 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_length) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_length * 2))) {
- if (flag) {
- cout << "INS: " << endl;
- }
-
- if (!events[i].cross_N || (double)((svs.stop.max_pos - svs.start.min_pos) + Parameter::Instance()->min_length) < ((double)(svs.read_stop - svs.read_start) * Parameter::Instance()->avg_ins)) {
+ if (!events[i].cross_N || (double) ((svs.stop.max_pos - svs.start.min_pos) + Parameter::Instance()->min_length) < ((double) (svs.read_stop - svs.read_start) * Parameter::Instance()->avg_ins)) {
svs.stop.max_pos += (svs.read_stop - svs.read_start); //TODO check!
+ if (Parameter::Instance()->print_seq) {
+ svs.read_stop = events[i].read_pos_start;
+ svs.read_start = events[i - 1].read_pos_stop;
+ if (svs.read_stop > tmp->getAlignment()->QueryBases.size()) {
+ cerr << "BUG: split read ins! " << svs.read_stop << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
+ }
+ if (!events[i - 1].strand) {
+ std::string tmp_seq = reverse_complement(tmp->getAlignment()->QueryBases);
+
+ read.sequence = reverse_complement(tmp_seq.substr(svs.read_start, svs.read_stop - svs.read_start));
+ } else {
+ read.sequence = tmp->getAlignment()->QueryBases.substr(svs.read_start, svs.read_stop - svs.read_start);
+ }
+ if (flag) {
+ cout << "INS: " << endl;
+ cout << "split read ins! " << events[i - 1].read_pos_stop << " " << events[i].read_pos_start << " " << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
+ cout << "Seq+:" << read.sequence << endl;
+ }
+ }
read.SV |= INS;
} else {
read.SV |= 'n';
}
} else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) {
- //cout << "DEL1 "<<(double)(svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 <<" "<<((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) << endl;
- if (!events[i].cross_N || (double)(svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 > (double)((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length))) {
+ if (!events[i].cross_N || (double) (svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 > (double) ((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length))) {
read.SV |= DEL;
if (flag) {
cout << "DEL2" << endl;
@@ -546,10 +558,6 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
svs.start.min_pos = svs.stop.max_pos;
svs.stop.max_pos = tmp;
}
- // if(flag){
- // std::cout<<"NEST: "<<svs.start.min_pos- get_ref_lengths(events[i - 1].RefID, ref) << " "<<svs.stop.max_pos - get_ref_lengths(events[i - 1].RefID, ref)<<" "<<tmp->getName()<<std::endl;
- // }
- // svs.stop.max_pos = svs.start.min_pos + Parameter::Instance()->min_length * 2;
} else if (!is_overlapping) {
read.SV |= INV;
if (events[i - 1].strand) {
@@ -627,34 +635,18 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
read.coordinates.first = svs.start.min_pos;
read.coordinates.second = svs.stop.max_pos;
}
- /* if (read.SV & TRA) {
- std::cout << "SPLIT: " << TRANS_type(read.SV)<<" "<<svs.start.min_pos - get_ref_lengths(events[i].RefID, ref)<<" ";
- if (events[i - 1].strand) {
- std::cout << " +";
- } else {
- std::cout << " -";
- }
- if (events[i].strand) {
- std::cout << " +";
- } else {
- std::cout << " -";
- }
- std::cout <<std::endl;
- }*/
- /*if(read.SV & INS){
- std::cout<<"Split LEN: "<<abs(read.coordinates.second-read.coordinates.first)<<" Pos: "<<read.coordinates.first-get_ref_lengths(events[i - 1].RefID, ref)<<tmp->getName()<<std::endl;
- }*/
- //if( abs(read.coordinates.second-read.coordinates.first)<10){
- /// events[i].length=abs(read.coordinates.second-read.coordinates.first);//TODO try
- // std::cout<<"Split event ===1 "<<" "<<abs(read.coordinates.second-read.coordinates.first)<<" len: "<< events[i].length<<" Pos "<<events[i].pos <<" Name "<<tmp->getName() <<std::endl;
- // }
+
//pool out?
read.id = read_id;
svs.support[tmp->getName()] = read;
svs.support[tmp->getName()].length = abs(read.coordinates.second - read.coordinates.first);
Breakpoint * point = new Breakpoint(svs, abs(read.coordinates.second - read.coordinates.first));
//std::cout<<"split ADD: " << <<" Name: "<<tmp->getName()<<" "<< svs.start.min_pos- get_ref_lengths(events[i].RefID, ref)<<"-"<<svs.stop.max_pos- get_ref_lengths(events[i].RefID, ref)<<std::endl;
- bst.insert(point, root);
+ if (add) {
+ bst.insert_existant(point, root);
+ } else {
+ bst.insert(point, root);
+ }
// std::cout<<"Print:"<<std::endl;
// bst.print(root);
}
diff --git a/src/sub/Detect_Breakpoints.h b/src/sub/Detect_Breakpoints.h
index fb1eeae..ce0e993 100644
--- a/src/sub/Detect_Breakpoints.h
+++ b/src/sub/Detect_Breakpoints.h
@@ -31,8 +31,8 @@ void clarify(std::vector<Breakpoint *> & points);
void detect_breakpoints(std::string filename, IPrinter *& printer);
//void screen_for_events(Node * list,IntervallTree & bst ,TNode *&root, int cov, int lowMQ_cov,RefVector ref);
bool screen_for_events(Alignment * tmp, IntervallTree & bst, TNode *&root, RefVector ref, int cov);
-void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallContainer & bst, TNode *&root,long read_id);
-void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallContainer & bst, TNode *&root,long read_id);
+void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallTree & bst, TNode *&root,long read_id,bool add);
+void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallTree & bst, TNode *&root,long read_id,bool add);
void estimate_parameters(std::string read_filename);
bool overlaps(aln_str prev,aln_str curr);
void detect_merged_svs(Breakpoint * point);
diff --git a/src/tree/Breakpoint_Tree.cpp b/src/tree/Breakpoint_Tree.cpp
index 54488ad..36d3a28 100644
--- a/src/tree/Breakpoint_Tree.cpp
+++ b/src/tree/Breakpoint_Tree.cpp
@@ -25,54 +25,52 @@ void Breakpoint_Tree::find(int position, std::string chr, breakpoint_node *par,
}
}
-void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_node *par, bool SV_support) {
+void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_node *par) {
+ //start + stop: read coordinates.
if (par == NULL) { //not found
-
- par = NULL;
return;
}
- /*std::string chr2="II";
- int pos=275800;
- if ((pos+100 > start && pos-100 < stop) && strcmp(chr.c_str(), chr2.c_str()) == 0) { //found
- std::cout<<"hit: "<<start<<std::endl;
- }*/
- if ((par->position + 100 > start && par->position - 100 < stop) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
- // if (SV_support) { Maybe for later!
- //par->SV_support++;
- //} else {
+ if (par->direction) { //start
+ if ((par->position-100 > start && par->position+100 < stop) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
+ par->ref_support++;
+ // std::cout<<"start: "<<start<<" "<<stop<<std::endl;
+ }
+ } else { //stop coordinate
+ if ((par->position > start+100 && par->position < stop-100) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
par->ref_support++;
- //}
- //par = NULL;
- //return; //this does not really work for close/phased SVs!
+ // std::cout<<"stop: "<< start<<" "<<stop<<std::endl;
+ }
}
+
//search goes on:
if (start < par->position) {
- overalps(start, stop, chr, par->left, SV_support);
+ overalps(start, stop, chr, par->left);
} else {
- overalps(start, stop, chr, par->right, SV_support);
+ overalps(start, stop, chr, par->right);
}
}
/*
* Inserting Element into the Tree
*/
-void Breakpoint_Tree::insert(breakpoint_node *&tree, std::string chr, int position) {
+void Breakpoint_Tree::insert(breakpoint_node *&tree, std::string chr, int position, bool direction) {
if (tree == NULL) {
tree = new breakpoint_node;
tree->position = position;
tree->ref_support = 0;
tree->chr = chr;
+ tree->direction = direction;
tree->left = NULL;
tree->right = NULL;
} else if (tree->position > position) {
- insert(tree->left, chr, position);
+ insert(tree->left, chr, position, direction);
} else if (tree->position < position) {
- insert(tree->right, chr, position);
+ insert(tree->right, chr, position, direction);
} else if (strcmp(chr.c_str(), tree->chr.c_str()) == 0) { // found element -> already exist!
//std::cerr << "Element exists!" << std::endl;
//TODO we should use this information to assess the reliability of this call!
} else {
- insert(tree->left, chr, position); //think about that!
+ insert(tree->left, chr, position,position); //think about that!
}
}
@@ -86,7 +84,7 @@ int Breakpoint_Tree::get_ref(breakpoint_node *&tree, std::string chr, int positi
return get_ref(tree->right, chr, position);
} else if (strcmp(chr.c_str(), tree->chr.c_str()) == 0) { // found element
return tree->ref_support;
- } else{
+ } else {
return get_ref(tree->left, chr, position); //just in case.
}
}
diff --git a/src/tree/Breakpoint_Tree.h b/src/tree/Breakpoint_Tree.h
index db85d2a..d617f3c 100644
--- a/src/tree/Breakpoint_Tree.h
+++ b/src/tree/Breakpoint_Tree.h
@@ -15,6 +15,7 @@
struct breakpoint_node {
std::string chr;
int position; // value to store!
+ bool direction;
int ref_support;
breakpoint_node *left;
breakpoint_node *right;
@@ -29,7 +30,7 @@ public:
~Breakpoint_Tree(){
}
void find(int position,std::string chr, breakpoint_node *par, breakpoint_node *&loc);
- void insert(breakpoint_node *&tree, std::string chr,int position);
+ void insert(breakpoint_node *&tree, std::string chr,int position,bool direction);
void del(int position,std::string chr);
void case_a(breakpoint_node *par, breakpoint_node *loc);
void case_b(breakpoint_node *par, breakpoint_node *loc);
@@ -39,7 +40,7 @@ public:
void postorder(breakpoint_node *ptr);
void display(breakpoint_node *ptr, int);
void get_nodes(breakpoint_node *ptr, std::vector<int> & nodes);
- void overalps(int start,int stop,std::string chr, breakpoint_node *par, bool SV_support);
+ void overalps(int start,int stop,std::string chr, breakpoint_node *par);
int get_ref(breakpoint_node *&tree, std::string chr, int position);
};
diff --git a/src/tree/IntervallTree.cpp b/src/tree/IntervallTree.cpp
index 76ee22d..74e52a1 100644
--- a/src/tree/IntervallTree.cpp
+++ b/src/tree/IntervallTree.cpp
@@ -72,6 +72,71 @@ void IntervallTree::insert(Breakpoint * new_break, TNode *&p) {
d = max(m, n);
p->set_height(d + 1);
}
+
+void IntervallTree::insert_existant(Breakpoint * new_break, TNode *&p) {
+ if (new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) {
+ return;
+ }
+ if (p == NULL) { // add to tree:
+ return;
+ } else { // find on tree:
+ long score = p->get_data()->overlap(new_break); //comparison function
+ if (score == 0) { //add SV types?
+ p->get_data()->add_read(new_break);
+ new_break->set_coordinates(-1, -1);
+ //delete new_break;
+ return;
+ } else if (abs(score) < Parameter::Instance()->max_dist) { // if two or more events are too close:
+ //std::cout<<"Screen"<<std::endl;
+ careful_screening(new_break, p);
+ if (new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) {
+ return;
+ }
+ }
+
+ if (score > 0) { // go left
+ insert_existant(new_break, p->left);
+ if ((bsheight(p->left) - bsheight(p->right)) == 2) {
+ score = p->left->get_data()->overlap(new_break);
+ if (score > 0) {
+ p = srl(p);
+ } else {
+ p = drl(p);
+ }
+ }
+ } else if (score < 0) { // go right
+ insert_existant(new_break, p->right);
+ if ((bsheight(p->right) - bsheight(p->left)) == 2) {
+ score = p->right->get_data()->overlap(new_break);
+ if (score < 0) {
+ p = srr(p);
+ } else {
+ p = drr(p);
+ }
+ }
+ }
+ }
+ int m, n, d;
+ m = bsheight(p->left);
+ n = bsheight(p->right);
+ d = max(m, n);
+ p->set_height(d + 1);
+}
+
+bool IntervallTree::overlaps(long start, long stop, TNode *p) {
+ if (p == NULL) {
+ return false;
+ } else {
+ long score = p->get_data()->overlap_breakpoint(start,stop);
+ if (score > 0) {
+ overlaps(start,stop, p->left);
+ } else if (score < 0) {
+ overlaps(start,stop, p->right);
+ } else {
+ return true;
+ }
+ }
+}
// Finding the Smallest
TNode * IntervallTree::findmin(TNode * p) {
if (p == NULL) {
diff --git a/src/tree/IntervallTree.h b/src/tree/IntervallTree.h
index 7bb0dc0..239c6a9 100644
--- a/src/tree/IntervallTree.h
+++ b/src/tree/IntervallTree.h
@@ -25,9 +25,13 @@ private:
void careful_screening(Breakpoint *& new_break, TNode *p);
public:
void insert(Breakpoint * point, TNode *&);
+ void insert_ref(Breakpoint * point, TNode *&);
+
+ void insert_existant(Breakpoint * new_break, TNode *&p);
void del(Breakpoint * point, TNode *&);
int deletemin(TNode *&);
void find(Breakpoint * point, TNode *&);
+ bool overlaps(long start, long stop,TNode *p);
TNode * findmin(TNode*);
TNode * findmax(TNode*);
void clear(TNode *&);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/sniffles.git
More information about the debian-med-commit
mailing list